diff --git a/usr/src/Makefile.lint b/usr/src/Makefile.lint
index 94e57d091b94..99f32c89ea2d 100644
--- a/usr/src/Makefile.lint
+++ b/usr/src/Makefile.lint
@@ -132,6 +132,7 @@ COMMON_SUBDIRS = \
 	cmd/fs.d/udfs/mount \
 	cmd/fs.d/ufs/mount \
 	cmd/fs.d/ufs/fsirand\
+	cmd/fs.d/zfs/fstyp \
 	cmd/fuser \
 	cmd/gcore \
 	cmd/getconf \
@@ -272,12 +273,16 @@ COMMON_SUBDIRS = \
 	cmd/xstr \
 	cmd/yes \
 	cmd/yppasswd \
+	cmd/zdb \
 	cmd/zdump \
+	cmd/zfs \
 	cmd/zlogin \
 	cmd/zoneadm \
 	cmd/zoneadmd \
 	cmd/zonecfg \
 	cmd/zonename \
+	cmd/zpool \
+	cmd/ztest \
 	lib/abi \
 	lib/auditd_plugins \
 	lib/crypt_modules \
@@ -349,6 +354,8 @@ COMMON_SUBDIRS = \
 	lib/libwanboot \
 	lib/libwanbootutil \
 	lib/libxnet \
+	lib/libzfs \
+	lib/libzfs_jni \
 	lib/libzonecfg \
 	lib/libzoneinfo \
 	lib/lvm \
diff --git a/usr/src/Targetdirs b/usr/src/Targetdirs
index d2e041185cab..8e4b8933f78d 100644
--- a/usr/src/Targetdirs
+++ b/usr/src/Targetdirs
@@ -81,6 +81,7 @@ ROOT.SYS= \
 	/etc/dfs  \
 	/etc/fs  \
 	/etc/fs/nfs  \
+	/etc/fs/zfs \
 	/etc/ftpd  \
 	/etc/rpcsec	\
 	/etc/security	\
@@ -307,6 +308,7 @@ ROOT.SYS2= \
 	/usr/lib/fs \
 	/usr/lib/fs/nfs \
 	/usr/lib/fs/proc \
+	/usr/lib/fs/zfs \
 	/usr/lib/mdb \
 	/usr/lib/mdb/kvm \
 	/usr/lib/mdb/proc \
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 14d2a8a8b25c..c3f7131069f9 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -49,6 +49,7 @@ FIRST_SUBDIRS=		\
 
 COMMON_SUBDIRS=		\
 	agents		\
+	availdevs	\
 	lp		\
 	perl		\
 	man		\
@@ -420,13 +421,17 @@ COMMON_SUBDIRS=		\
 	yes		\
 	ypcmd		\
 	yppasswd	\
+	zdb		\
 	zdump		\
+	zfs		\
 	zic		\
 	zlogin		\
 	zoneadm		\
 	zoneadmd	\
 	zonecfg		\
-	zonename
+	zonename	\
+	zpool		\
+	ztest
 
 i386_SUBDIRS=		\
 	addbadsec	\
@@ -664,12 +669,14 @@ MSGSUBDIRS=		\
 	xargs		\
 	yppasswd	\
 	zdump		\
+	zfs		\
 	zic		\
 	zlogin		\
 	zoneadm		\
 	zoneadmd	\
 	zonecfg		\
-	zonename
+	zonename	\
+	zpool
 
 sparc_MSGSUBDIRS=	\
 	fruadm		\
diff --git a/usr/src/cmd/allocate/Makefile b/usr/src/cmd/allocate/Makefile
index 36a28e3dd00e..6b3055bce1e4 100644
--- a/usr/src/cmd/allocate/Makefile
+++ b/usr/src/cmd/allocate/Makefile
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 1989, 1998-2002 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -84,7 +84,7 @@ $(ROOTSECDEV)/% :=              GROUP = bin
 
 $(ROOTSECLIB)/% :=		FILEMODE = 0751
 
-allocate :=	LDLIBS += -lbsm -lsecdb
+allocate :=	LDLIBS += -lbsm -lsec -lsecdb
 
 .KEEP_STATE:
 
diff --git a/usr/src/cmd/allocate/allocate3.c b/usr/src/cmd/allocate/allocate3.c
index 3488421c1ebe..0a5e0d0d9d6f 100644
--- a/usr/src/cmd/allocate/allocate3.c
+++ b/usr/src/cmd/allocate/allocate3.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1999-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -255,24 +255,7 @@ list_devices(int optflg, uid_t uid, char *device)
 static int
 newdac(char *file, uid_t owner, gid_t group, o_mode_t mode)
 {
-	int	err = 0;
-	aclent_t	min_acl[MIN_ACL_ENTRIES];
-
-	min_acl[0].a_type = USER_OBJ;
-	min_acl[0].a_id   = owner;
-	min_acl[0].a_perm = ((mode & 0700) >> 6);
-
-	min_acl[1].a_type = GROUP_OBJ;
-	min_acl[1].a_id   = group;
-	min_acl[1].a_perm = ((mode & 0070) >> 3);
-
-	min_acl[2].a_type = CLASS_OBJ;
-	min_acl[2].a_id   = (uid_t)-1;
-	min_acl[2].a_perm = ((mode & 0070) >> 3);
-
-	min_acl[3].a_type = OTHER_OBJ;
-	min_acl[3].a_id   = (uid_t)-1;
-	min_acl[3].a_perm = (mode & 0007);
+	int		err = 0;
 
 	do {
 		if (chown(file, owner, group) == -1) {
@@ -281,7 +264,9 @@ newdac(char *file, uid_t owner, gid_t group, o_mode_t mode)
 		}
 	} while (fdetach(file) == 0);
 
-	if (acl(file, SETACL, MIN_ACL_ENTRIES, min_acl) < 0) {
+	err = acl_strip(file, owner, group, (mode_t)mode);
+
+	if (err != 0) {
 		dperror("newdac, unable to setacl");
 		err = SETACL_PERR;
 	}
diff --git a/usr/src/cmd/availdevs/Makefile b/usr/src/cmd/availdevs/Makefile
new file mode 100644
index 000000000000..99015803ba3d
--- /dev/null
+++ b/usr/src/cmd/availdevs/Makefile
@@ -0,0 +1,65 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG=		availdevs
+OBJS_COMMON=	availdevs.o
+OBJS=		$(OBJS_COMMON)
+SRCS=		$(OBJS_COMMON:%.o=%.c)
+
+include ../Makefile.cmd
+
+ROOTCMDDIR=	$(ROOTLIB)/zfs
+
+INCS += -I../../lib/libzfs_jni/common \
+	-I/usr/include/libxml2
+
+LDLIBS += -lzfs_jni -lxml2
+CPPFLAGS += $(INCS) -D_LARGEFILE64_SOURCE=1 -D_REENTRANT
+
+.KEEP_STATE:
+
+.PARALLEL:
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+%.o: %.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
+install: all $(ROOTCMD)
+
+clean:
+	$(RM) $(OBJS)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/availdevs/availdevs.c b/usr/src/cmd/availdevs/availdevs.c
new file mode 100644
index 000000000000..703897f39470
--- /dev/null
+++ b/usr/src/cmd/availdevs/availdevs.c
@@ -0,0 +1,192 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "availdevs.h"
+#include <libzfs_jni_diskmgt.h>
+#include <libxml/parser.h>
+
+/*
+ * Function prototypes
+ */
+
+static void handle_error(const char *, va_list);
+static int add_disk_to_xml(dmgt_disk_t *, void *);
+static xmlDocPtr create_doc(void);
+int main(void);
+
+/*
+ * Static functions
+ */
+
+/*
+ * Error handler installed via dmgt_set_error_handler(): prints the
+ * formatted message to stderr, followed by a newline.
+ */
+static void
+handle_error(const char *fmt, va_list ap)
+{
+	(void) vfprintf(stderr, fmt, ap);
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Callback passed to dmgt_avail_disk_iter().  Appends a disk element for
+ * dp, carrying its name, size, aliases and slices, to the node that data
+ * points to.  Always returns 0.
+ */
+static int
+add_disk_to_xml(dmgt_disk_t *dp, void *data)
+{
+	int i;
+	char tmp[64];
+	xmlNodePtr available = *((xmlNodePtr *)data);
+
+	xmlNodePtr disk = xmlNewChild(
+	    available, NULL, (xmlChar *)ELEMENT_DISK, NULL);
+	xmlSetProp(disk,
+	    (xmlChar *)ATTR_DISK_NAME, (xmlChar *)dp->name);
+
+	/*
+	 * snprintf() always NUL-terminates within the size it is given, so
+	 * pass the whole buffer and never index tmp by the return value,
+	 * which can be negative or larger than the buffer.
+	 */
+	(void) snprintf(tmp, sizeof (tmp), "%llu", dp->size);
+	xmlSetProp(disk, (xmlChar *)ATTR_DISK_SIZE, (xmlChar *)tmp);
+
+	if (dp->aliases != NULL) {
+		for (i = 0; dp->aliases[i] != NULL; i++) {
+			xmlNodePtr alias = xmlNewChild(
+			    disk, NULL, (xmlChar *)ELEMENT_ALIAS, NULL);
+			xmlSetProp(alias,
+			    (xmlChar *)ATTR_ALIAS_NAME,
+			    (xmlChar *)dp->aliases[i]);
+		}
+	}
+
+	if (dp->slices != NULL) {
+		for (i = 0; dp->slices[i] != NULL; i++) {
+			dmgt_slice_t *sp = dp->slices[i];
+			xmlNodePtr slice = xmlNewChild(
+			    disk, NULL, (xmlChar *)ELEMENT_SLICE, NULL);
+			xmlSetProp(slice,
+			    (xmlChar *)ATTR_SLICE_NAME, (xmlChar *)sp->name);
+
+			(void) snprintf(tmp, sizeof (tmp), "%llu", sp->size);
+			xmlSetProp(slice, (xmlChar *)ATTR_SLICE_SIZE,
+			    (xmlChar *)tmp);
+
+			(void) snprintf(tmp, sizeof (tmp), "%llu", sp->start);
+			xmlSetProp(slice, (xmlChar *)ATTR_SLICE_START,
+			    (xmlChar *)tmp);
+
+			if (sp->used_name != NULL) {
+				xmlSetProp(slice,
+				    (xmlChar *)ATTR_SLICE_USED_NAME,
+				    (xmlChar *)sp->used_name);
+			}
+
+			if (sp->used_by != NULL) {
+				xmlSetProp(slice, (xmlChar *)ATTR_SLICE_USED_BY,
+				    (xmlChar *)sp->used_by);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Builds the skeleton document: an ELEMENT_ROOT root element holding a
+ * single empty ELEMENT_AVAILABLE child.  Returns NULL if any piece of it
+ * could not be created.
+ */
+static xmlDocPtr
+create_doc(void)
+{
+	xmlDocPtr doc;
+	xmlNodePtr root;
+
+	/* Create the XML document */
+	doc = xmlNewDoc((xmlChar *)"1.0");
+	if (doc == NULL)
+		return (NULL);
+
+	/* Create the root node */
+	root = xmlNewDocNode(doc, NULL, (xmlChar *)ELEMENT_ROOT, NULL);
+	if (root == NULL) {
+		xmlFreeDoc(doc);
+		return (NULL);
+	}
+	xmlAddChild((xmlNodePtr) doc, (xmlNodePtr)root);
+
+	/* Create the available node */
+	if (xmlNewChild(root, NULL,
+	    (xmlChar *)ELEMENT_AVAILABLE, NULL) == NULL) {
+		xmlFreeDoc(doc);
+		return (NULL);
+	}
+
+	return (doc);
+}
+
+/*
+ * Main entry to availdevs.
+ *
+ * @return      0 on successful exit, non-zero otherwise
+ */
+int
+main(void)
+{
+	int error;
+	xmlDocPtr doc;
+	xmlNodePtr root;
+	xmlNodePtr available;
+
+	/* diskmgt.o error handler */
+	dmgt_set_error_handler(handle_error);
+
+	doc = create_doc();
+	if (doc == NULL) {
+		/* without a document there is nothing to fill in or print */
+		(void) fprintf(stderr, "unable to create XML document\n");
+		return (1);
+	}
+	root = xmlDocGetRootElement(doc);
+	available = xmlGetLastChild(root);
+
+	error = dmgt_avail_disk_iter(add_disk_to_xml, &available);
+	if (!error) {
+		/* Print out XML */
+		xmlDocFormatDump(stdout, doc, 1);
+	}
+
+	xmlFreeDoc(doc);
+
+	return (error != 0);
+}
diff --git a/usr/src/cmd/availdevs/availdevs.h b/usr/src/cmd/availdevs/availdevs.h
new file mode 100644
index 000000000000..3868b237aa72
--- /dev/null
+++ b/usr/src/cmd/availdevs/availdevs.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AVAILDEVS_H
+#define	_AVAILDEVS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Constants
+ */
+
+/* Must match the values in XMLDataModel.java */
+#define	ELEMENT_ROOT			"zfsconfig"
+#define	ELEMENT_AVAILABLE		"available"
+#define	ELEMENT_DISK			"disk"
+#define	ELEMENT_ALIAS			"alias"
+#define	ELEMENT_SLICE			"slice"
+#define	ATTR_DISK_NAME			"name"
+#define	ATTR_DISK_SIZE			"size"
+#define	ATTR_DISK_INUSE			"inuse"
+#define	ATTR_ALIAS_NAME			"name"
+#define	ATTR_SLICE_NAME			"name"
+#define	ATTR_SLICE_SIZE			"size"
+#define	ATTR_SLICE_START		"start"
+#define	ATTR_SLICE_USED_NAME		"used-name"
+#define	ATTR_SLICE_USED_BY		"used-by"
+#define	VAL_ATTR_TRUE			"true"
+#define	VAL_ATTR_FALSE			"false"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AVAILDEVS_H */
diff --git a/usr/src/cmd/bart/create.c b/usr/src/cmd/bart/create.c
index 17770c35f4f5..a676cd480bbc 100644
--- a/usr/src/cmd/bart/create.c
+++ b/usr/src/cmd/bart/create.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -31,6 +31,7 @@
 #include <sys/statvfs.h>
 #include <sys/wait.h>
 #include "bart.h"
+#include <aclutils.h>
 
 static int	sanitize_reloc_root(char *root, size_t bufsize);
 static int	create_manifest_filelist(char **argv, char *reloc_root);
@@ -623,46 +624,34 @@ sanitized_fname(const char *fname, boolean_t canon_path)
 static char *
 get_acl_string(const char *fname, const struct stat64 *statb, int *err_code)
 {
-	aclent_t	*aclbuf;
-	int		num_acls, ret;
-	char		*acl_info;
+	acl_t		*aclp;
+	char		*acltext;
+	int		error;
 
 	if (S_ISLNK(statb->st_mode)) {
 		return (safe_strdup("-"));
 	}
 
-	/* First, figure out how many ACL entries this file has */
-	num_acls = acl(fname, GETACLCNT, 0, NULL);
-	if (num_acls < 0) {
-		*err_code = WARNING_EXIT;
-		perror(fname);
-		return (safe_strdup("-"));
-	}
-
 	/*
-	 * Next, create a buffer which is big enough for all the ACL entries.
-	 * Then go get the raw data.
+	 * Fetch the ACL; trivial acl's are included (no ACL_NO_TRIVIAL)
 	 */
-	aclbuf = (aclent_t *)safe_calloc(sizeof (aclent_t) * num_acls);
-	ret = acl(fname, GETACL, num_acls, aclbuf);
-	if (ret < 0) {
-		*err_code = WARNING_EXIT;
-		perror(fname);
-		return (safe_strdup("-"));
-	}
-
-	/* Convert the raw entries to text */
-	acl_info = acltotext(aclbuf, num_acls);
-
-	/* Free up the buffer which held the raw ACL entries */
-	free(aclbuf);
+	error = acl_get(fname, 0, &aclp);
 
-	if (acl_info == NULL) {
+	if (error != 0) {
 		*err_code = WARNING_EXIT;
-		perror(fname);
+		(void) fprintf(stderr, "%s: %s\n", fname, acl_strerror(error));
 		return (safe_strdup("-"));
-	} else
-		return (acl_info);
+	}
+
+	acltext = acl_totext(aclp);
+	acl_free(aclp);
+	if (acltext == NULL) {
+		/* text conversion failed; warn and fall back to "-" */
+		*err_code = WARNING_EXIT;
+		perror(fname);
+		return (safe_strdup("-"));
+	}
+	return (acltext);
 }
 
 
diff --git a/usr/src/cmd/chmod/Makefile b/usr/src/cmd/chmod/Makefile
index e7f71f4c8d58..fad270c1b074 100644
--- a/usr/src/cmd/chmod/Makefile
+++ b/usr/src/cmd/chmod/Makefile
@@ -22,7 +22,7 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # cmd/chmod/Makefile
@@ -42,6 +42,8 @@ CPPFLAGS += -D_FILE_OFFSET_BITS=64
 
 LINTFLAGS += -erroff=E_NAME_DEF_NOT_USED2
 
+LDLIBS += -lsec
+
 POFILE= chmod_cmd.po
 XGETFLAGS= -a -x chmod.xcl
 
diff --git a/usr/src/cmd/chmod/chmod.c b/usr/src/cmd/chmod/chmod.c
index 37f07e0b2e06..71dc1de1dfb3 100644
--- a/usr/src/cmd/chmod/chmod.c
+++ b/usr/src/cmd/chmod/chmod.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,6 +44,7 @@
  * chmod option mode files
  * where
  *	mode is [ugoa][+-=][rwxXlstugo] or an octal number
+ *	mode is A[<number>]{+|-|=}[<aclspec>]
  *	option is -R and -f
  */
 
@@ -63,8 +64,10 @@
 #include <string.h>	/* strerror() */
 #include <stdarg.h>
 #include <limits.h>
+#include <ctype.h>
 #include <errno.h>
 #include <sys/acl.h>
+#include <aclutils.h>
 
 static int	rflag;
 static int	fflag;
@@ -77,25 +80,38 @@ static char	**mav;		/* Alternate to argv (for parseargs) */
 
 static char	*ms;		/* Points to the mode argument */
 
+#define	ACL_ADD		1
+#define	ACL_DELETE	2
+#define	ACL_SLOT_DELETE 3
+#define	ACL_REPLACE	4
+#define	ACL_STRIP	5
+
+typedef struct acl_args {
+	acl_t	*acl_aclp;
+	int	acl_slot;
+	int	acl_action;
+} acl_args_t;
+
 extern mode_t
 newmode_common(char *ms, mode_t new_mode, mode_t umsk, char *file, char *path,
 	o_mode_t *group_clear_bits, o_mode_t *group_set_bits);
 
 static int
-dochmod(char *name, char *path, mode_t umsk),
-chmodr(char *dir, char *path, mode_t mode, mode_t umsk);
+dochmod(char *name, char *path, mode_t umsk, acl_args_t *aclp),
+chmodr(char *dir, char *path, mode_t mode, mode_t umsk, acl_args_t *aclp);
+static int doacl(char *file, struct stat *st, acl_args_t *aclp);
 
 static void handle_acl(char *name, o_mode_t group_clear_bits,
-	o_mode_t group_set_bits);
+    o_mode_t group_set_bits);
 
-static void
-usage(void);
+static void usage(void);
 
-void
-errmsg(int severity, int code, char *format, ...);
+void errmsg(int severity, int code, char *format, ...);
 
-static void
-parseargs(int ac, char *av[]);
+static void parseargs(int ac, char *av[]);
+
+int
+parse_acl_args(char *arg, acl_args_t **acl_args);
 
 int
 main(int argc, char *argv[])
@@ -103,6 +119,7 @@ main(int argc, char *argv[])
 	int i, c;
 	int status = 0;
 	mode_t umsk;
+	acl_args_t *acl_args = NULL;
 
 	(void) setlocale(LC_ALL, "");
 #if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
@@ -134,9 +151,16 @@ main(int argc, char *argv[])
 	mac -= optind;
 	mav += optind;
 
-	if (mac < 2) {
-		usage();
-		exit(2);
+	if (mac >= 2 && (mav[0][0] == 'A')) {
+		if (parse_acl_args(*mav, &acl_args)) {
+			usage();
+			exit(2);
+		}
+	} else {
+		if (mac < 2) {
+			usage();
+			exit(2);
+		}
 	}
 
 	ms = mav[0];
@@ -144,14 +168,15 @@ main(int argc, char *argv[])
 	umsk = umask(0);
 	(void) umask(umsk);
 
-	for (i = 1; i < mac; i++)
-		status += dochmod(mav[i], mav[i], umsk);
+	for (i = 1; i < mac; i++) {
+		status += dochmod(mav[i], mav[i], umsk, acl_args);
+	}
 
 	return (fflag ? 0 : status);
 }
 
 static int
-dochmod(char *name, char *path, mode_t umsk)
+dochmod(char *name, char *path, mode_t umsk, acl_args_t *aclp)
 {
 	static struct stat st;
 	int linkflg = 0;
@@ -172,9 +197,11 @@ dochmod(char *name, char *path, mode_t umsk)
 
 	/* Do not recurse if directory is object of symbolic link */
 	if (rflag && ((st.st_mode & S_IFMT) == S_IFDIR) && !linkflg)
-		return (chmodr(name, path, st.st_mode, umsk));
+		return (chmodr(name, path, st.st_mode, umsk, aclp));
 
-	if (chmod(name, newmode_common(ms, st.st_mode, umsk, name, path,
+	if (aclp) {
+		return (doacl(name, &st, aclp));
+	} else if (chmod(name, newmode_common(ms, st.st_mode, umsk, name, path,
 	    &group_clear_bits, &group_set_bits)) == -1) {
 		errmsg(2, 0, gettext("can't change %s\n"), path);
 		return (1);
@@ -195,7 +222,7 @@ dochmod(char *name, char *path, mode_t umsk)
 
 
 static int
-chmodr(char *dir, char *path,  mode_t mode, mode_t umsk)
+chmodr(char *dir, char *path,  mode_t mode, mode_t umsk, acl_args_t *aclp)
 {
 
 	DIR *dirp;
@@ -204,6 +231,7 @@ chmodr(char *dir, char *path,  mode_t mode, mode_t umsk)
 	char currdir[PATH_MAX+1];		/* current dir name + '/' */
 	char parentdir[PATH_MAX+1];		/* parent dir name  + '/' */
 	int ecode;
+	struct stat st;
 	o_mode_t	group_clear_bits, group_set_bits;
 
 	if (getcwd(savedir, PATH_MAX) == 0)
@@ -213,7 +241,14 @@ chmodr(char *dir, char *path,  mode_t mode, mode_t umsk)
 	/*
 	 * Change what we are given before doing it's contents
 	 */
-	if (chmod(dir, newmode_common(ms, mode, umsk, dir, path,
+	if (aclp) {
+		if (lstat(dir, &st) < 0) {
+			errmsg(2, 0, gettext("can't access %s\n"), path);
+			return (1);
+		}
+		if (doacl(dir, &st, aclp) != 0)
+			return (1);
+	} else if (chmod(dir, newmode_common(ms, mode, umsk, dir, path,
 	    &group_clear_bits, &group_set_bits)) < 0) {
 		errmsg(2, 0, gettext("can't change %s\n"), path);
 		return (1);
@@ -226,8 +261,11 @@ chmodr(char *dir, char *path,  mode_t mode, mode_t umsk)
 	 * permissions changes to both the acl mask and the
 	 * general group permissions.
 	 */
-	if (group_clear_bits || group_set_bits)
-		handle_acl(dir, group_clear_bits, group_set_bits);
+
+	if (aclp == NULL) { /* only necessary when not setting ACL */
+		if (group_clear_bits || group_set_bits)
+			handle_acl(dir, group_clear_bits, group_set_bits);
+	}
 
 	if (chdir(dir) < 0) {
 		errmsg(2, 0, "%s/%s: %s\n", savedir, dir, strerror(errno));
@@ -255,7 +293,7 @@ chmodr(char *dir, char *path,  mode_t mode, mode_t umsk)
 	for (dp = readdir(dirp); dp != NULL; dp = readdir(dirp))  {
 		(void) strcpy(currdir, parentdir);
 		(void) strcat(currdir, dp->d_name);
-		ecode += dochmod(dp->d_name, currdir, umsk);
+		ecode += dochmod(dp->d_name, currdir, umsk, aclp);
 	}
 	(void) closedir(dirp);
 	if (chdir(savedir) < 0) {
@@ -300,14 +338,27 @@ usage(void)
 	(void) fprintf(stderr, gettext(
 	    "usage:\tchmod [-fR] <absolute-mode> file ...\n"));
 
+	(void) fprintf(stderr, gettext(
+	    "\tchmod [-fR] <ACL-operation> file ...\n"));
+
 	(void) fprintf(stderr, gettext(
 	    "\tchmod [-fR] <symbolic-mode-list> file ...\n"));
 
+
 	(void) fprintf(stderr, gettext(
 	    "where \t<symbolic-mode-list> is a comma-separated list of\n"));
 
 	(void) fprintf(stderr, gettext(
 	    "\t[ugoa]{+|-|=}[rwxXlstugo]\n"));
+
+	(void) fprintf(stderr, gettext(
+	    "where \t<ACL-operation> is one of the following\n"));
+	(void) fprintf(stderr, gettext("\tA-<acl_specification>\n"));
+	(void) fprintf(stderr, gettext("\tA[number]-\n"));
+	(void) fprintf(stderr, gettext(
+	    "\tA[number]{+|=}<acl_specification>\n"));
+	(void) fprintf(stderr, gettext(
+	    "where \t<acl-specification> is a comma-separated list of ACEs\n"));
 }
 
 /*
@@ -373,6 +424,110 @@ parseargs(int ac, char *av[])
 	mav[mac] = (char *)NULL;
 }
 
+/*
+ * Parse an ACL operation argument of the form
+ *
+ *	A[number]{+|-|=}[acl_specification]
+ *
+ * into a newly allocated acl_args_t, which is returned through acl_args.
+ * The action recorded is one of:
+ *
+ *	A-			ACL_STRIP
+ *	A<number>-		ACL_SLOT_DELETE
+ *	A-<spec>		ACL_DELETE
+ *	A[number]+<spec>	ACL_ADD
+ *	A[number]=<spec>	ACL_REPLACE
+ *
+ * For ACL_REPLACE without a number, acl_slot is set to -1, which doacl()
+ * takes to mean "replace the whole ACL"; in every other case an omitted
+ * number leaves acl_slot at 0.
+ *
+ * Returns 0 on success and 1 if the argument is malformed, the
+ * specification cannot be parsed, or memory cannot be allocated.
+ */
+int
+parse_acl_args(char *arg, acl_args_t **acl_args)
+{
+	acl_t *new_acl = NULL;
+	int slot;
+	int error;
+	int len;
+	int action;
+	acl_args_t *new_acl_args;
+	char *acl_spec = NULL;
+	char *end;
+
+	if (arg[0] != 'A')
+		return (1);
+
+	slot = strtol(&arg[1], &end, 10);
+
+	/* a slot is an index into the ACL and can never be negative */
+	if (slot < 0)
+		return (1);
+
+	len = strlen(arg);
+	switch (*end) {
+	case '+':
+		action = ACL_ADD;
+		acl_spec = ++end;
+		break;
+	case '-':
+		if (len == 2 && arg[0] == 'A' && arg[1] == '-')
+			action = ACL_STRIP;
+		else
+			action = ACL_DELETE;
+		if (action != ACL_STRIP) {
+			acl_spec = ++end;
+			if (acl_spec[0] == '\0') {
+				action = ACL_SLOT_DELETE;
+				acl_spec = NULL;
+			} else if (arg[1] != '-')
+				return (1);
+		}
+		break;
+	case '=':
+		/*
+		 * Was a slot specified?  If not, flag that with -1 so the
+		 * whole ACL is replaced; leaving strtol()'s 0 in place
+		 * would make doacl() modify entry 0 instead.
+		 */
+		if (arg[1] == '=')
+			slot = -1;
+		action = ACL_REPLACE;
+		acl_spec = ++end;
+		break;
+	default:
+		return (1);
+	}
+
+	if ((action == ACL_REPLACE || action == ACL_ADD) && acl_spec[0] == '\0')
+		return (1);
+
+	if (acl_spec) {
+		if ((error = acl_fromtext(acl_spec, &new_acl)) != 0) {
+			errmsg(1, 1, "%s\n", acl_strerror(error));
+			return (1);
+		}
+	}
+
+	new_acl_args = malloc(sizeof (acl_args_t));
+	if (new_acl_args == NULL) {
+		/* don't leak the ACL parsed above */
+		if (new_acl != NULL)
+			acl_free(new_acl);
+		return (1);
+	}
+
+	new_acl_args->acl_aclp = new_acl;
+	new_acl_args->acl_slot = slot;
+	new_acl_args->acl_action = action;
+
+	*acl_args = new_acl_args;
+
+	return (0);
+}
+
 /*
  * This function is called whenever the group permissions of a file
  * is being modified.  According to the chmod(1) manpage, any
@@ -388,6 +507,14 @@ handle_acl(char *name, o_mode_t group_clear_bits, o_mode_t group_set_bits)
 	aclent_t *aclp, *tp;
 	o_mode_t newperm;
 
+	/*
+	 * if this file system supports ace_t acl's
+	 * then simply return since we don't have an
+	 * acl mask to deal with
+	 */
+	if (pathconf(name, _PC_ACL_ENABLED) == _ACL_ACE_ENABLED)
+		return;
+
 	if ((aclcnt = acl(name, GETACLCNT, 0, NULL)) <= MIN_ACL_ENTRIES)
 		return;	/* it's just a trivial acl; no need to change it */
 
@@ -424,3 +551,142 @@ handle_acl(char *name, o_mode_t group_clear_bits, o_mode_t group_set_bits)
 	}
 	free(aclp);
 }
+
+/*
+ * Apply the ACL operation described by acl_args to file.  st is the
+ * result of a stat of file; its mode decides whether the ACL is
+ * checked as a directory ACL, and its owner, group and mode are handed
+ * to acl_strip() for ACL_STRIP.
+ *
+ * Returns 0 on success and 1 on failure.  The ACL read from the file is
+ * released on every path out of the function.
+ */
+static int
+doacl(char *file, struct stat *st, acl_args_t *acl_args)
+{
+	acl_t *aclp;
+	acl_t *set_aclp;
+	int error = 0;
+	void *to, *from;
+	int len;
+	int isdir;
+	int ret = 1;
+
+	isdir = S_ISDIR(st->st_mode);
+
+	error = acl_get(file, 0, &aclp);
+
+	if (error != 0) {
+		errmsg(1, 1, "%s\n", acl_strerror(error));
+		return (1);
+	}
+
+	switch (acl_args->acl_action) {
+	case ACL_ADD:
+		if ((error = acl_addentries(aclp,
+		    acl_args->acl_aclp, acl_args->acl_slot)) != 0) {
+			errmsg(1, 1, "%s\n", acl_strerror(error));
+			goto out;
+		}
+		set_aclp = aclp;
+		break;
+	case ACL_SLOT_DELETE:
+		/*
+		 * A negative slot would make the memmove() below start
+		 * before the beginning of the entry array.
+		 */
+		if (acl_args->acl_slot < 0 ||
+		    acl_args->acl_slot + 1 > aclp->acl_cnt) {
+			errmsg(1, 1,
+			    gettext("Invalid slot specified for removal\n"));
+			goto out;
+		}
+
+		if (acl_args->acl_slot == 0 && aclp->acl_cnt == 1) {
+			errmsg(1, 1,
+			    gettext("Can't remove all ACL "
+			    "entries from a file\n"));
+			goto out;
+		}
+
+		/*
+		 * remove a single entry
+		 *
+		 * if last entry just adjust acl_cnt
+		 */
+
+		if ((acl_args->acl_slot + 1) == aclp->acl_cnt)
+			aclp->acl_cnt--;
+		else {
+			to = (char *)aclp->acl_aclp +
+			    (acl_args->acl_slot * aclp->acl_entry_size);
+			from = (char *)to + aclp->acl_entry_size;
+			len = (aclp->acl_cnt - acl_args->acl_slot - 1) *
+			    aclp->acl_entry_size;
+			(void) memmove(to, from, len);
+			aclp->acl_cnt--;
+		}
+		set_aclp = aclp;
+		break;
+
+	case ACL_DELETE:
+		if ((error = acl_removeentries(aclp, acl_args->acl_aclp,
+		    acl_args->acl_slot, ACL_REMOVE_ALL)) != 0) {
+			errmsg(1, 1, "%s\n", acl_strerror(error));
+			goto out;
+		}
+
+		if (aclp->acl_cnt == 0) {
+			errmsg(1, 1,
+			    gettext("Can't remove all ACL "
+			    "entries from a file\n"));
+			goto out;
+		}
+
+		set_aclp = aclp;
+		break;
+	case ACL_REPLACE:
+		if (acl_args->acl_slot >= 0)  {
+			error = acl_modifyentries(aclp, acl_args->acl_aclp,
+			    acl_args->acl_slot);
+			if (error) {
+				errmsg(1, 1, "%s\n", acl_strerror(error));
+				goto out;
+			}
+			set_aclp = aclp;
+		} else {
+			/* no slot given: install the new ACL wholesale */
+			set_aclp = acl_args->acl_aclp;
+		}
+		break;
+	case ACL_STRIP:
+		error = acl_strip(file, st->st_uid, st->st_gid, st->st_mode);
+		if (error) {
+			errmsg(1, 1, "%s\n", acl_strerror(error));
+			goto out;
+		}
+		ret = 0;
+		goto out;
+	default:
+		errmsg(1, 0, gettext("Unknown ACL action requested\n"));
+		goto out;
+	}
+
+	error = acl_check(set_aclp, isdir);
+
+	if (error) {
+		errmsg(1, 0, "%s\n%s", acl_strerror(error),
+		    gettext("See chmod(1) for more information on "
+		    "valid ACL syntax\n"));
+		goto out;
+	}
+	if ((error = acl_set(file, set_aclp)) != 0) {
+		errmsg(1, 0, gettext("Failed to set ACL: %s\n"),
+		    acl_strerror(error));
+		goto out;
+	}
+	ret = 0;
+out:
+	acl_free(aclp);
+	return (ret);
+}
diff --git a/usr/src/cmd/cmd-inet/usr.bin/rcp.c b/usr/src/cmd/cmd-inet/usr.bin/rcp.c
index b21f426e6945..7b76b8967ad7 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/rcp.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/rcp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -52,6 +52,7 @@
 #include <sys/sendfile.h>
 #include <sys/sysmacros.h>
 #include <sys/wait.h>
+#include <aclutils.h>
 
 /*
  * It seems like Berkeley got these from pathnames.h?
@@ -83,6 +84,7 @@ static int iamremote;
 static int iamrecursive;
 static int targetshouldbedirectory;
 static int aclflag;
+static int acl_aclflag;
 static int retval = 0;
 static int portnumber = 0;
 
@@ -200,14 +202,14 @@ main(int argc, char *argv[])
 	}
 
 	fflag = tflag = 0;
-	while ((ch = getopt(argc, argv, "axdfprtz:D:k:P:")) != EOF) {
+	while ((ch = getopt(argc, argv, "axdfprtz:D:k:P:Z")) != EOF) {
 		switch (ch) {
 		case 'd':
 			targetshouldbedirectory = 1;
 			break;
 		case 'f':			/* "from" */
 			fflag = 1;
-			if (aclflag)
+			if (aclflag | acl_aclflag)
 				/* ok response */
 				(void) desrcpwrite(rem, "", 1);
 			break;
@@ -220,6 +222,9 @@ main(int argc, char *argv[])
 		case 't':			/* "to" */
 			tflag = 1;
 			break;
+		case 'Z':
+			acl_aclflag++;
+			break;
 		case 'x':
 			if (!krb5_privacy_allowed()) {
 				(void) fprintf(stderr, gettext("rcp: "
@@ -652,9 +657,7 @@ toremote(char *targ, int argc, char *argv[])
 				if (response() < 0)
 					exit(1);
 
-				}
-				else
-				{
+				} else {
 
 				/*
 				 * ACL support: try to find out if the remote
@@ -663,8 +666,13 @@ toremote(char *targ, int argc, char *argv[])
 				 * purpose.
 				 */
 				aclflag = 1;
+				acl_aclflag = 1;
 
-				(void) snprintf(bp, buffersize, "%s -t %s",
+				/*
+				 * First see if the remote side will support
+				 * both aclent_t and ace_t acl's?
+				 */
+				(void) snprintf(bp, buffersize, "%s -tZ %s",
 							cmd_sunw, targ);
 				rem = rcmd_af(&host, portnumber, pwd->pw_name,
 					    tuser ? tuser : pwd->pw_name,
@@ -681,32 +689,49 @@ toremote(char *targ, int argc, char *argv[])
 				    != sizeof (resp))
 					lostconn();
 				if (resp != 0) {
-					/*
-					 * Not OK:
-					 * The other side is running
-					 * non-acl rcp. Try again with
-					 * normal stuff
-					 */
-					aclflag = 0;
+					acl_aclflag = 0;
 					(void) snprintf(bp, buffersize,
-						"%s -t %s", cmd, targ);
+					    "%s -t %s", cmd_sunw, targ);
+
 					(void) close(rem);
 					host = thost;
 					rem = rcmd_af(&host, portnumber,
-							pwd->pw_name,
-							tuser ? tuser :
-							pwd->pw_name, bp, 0,
-							AF_INET6);
+					    pwd->pw_name,
+					    tuser ? tuser : pwd->pw_name,
+					    bp, 0, AF_INET6);
 					if (rem < 0)
 						exit(1);
-					if (response() < 0)
-						exit(1);
+
+					if (read(rem, &resp, sizeof (resp))
+					    != sizeof (resp))
+						lostconn();
+					if (resp != 0) {
+						/*
+						 * Not OK:
+						 * The other side is running
+						 * non-acl rcp. Try again with
+						 * normal stuff
+						 */
+						aclflag = 0;
+						(void) snprintf(bp, buffersize,
+						    "%s -t %s", cmd, targ);
+						(void) close(rem);
+						host = thost;
+						rem = rcmd_af(&host, portnumber,
+						    pwd->pw_name,
+						    tuser ? tuser :
+						    pwd->pw_name, bp, 0,
+						    AF_INET6);
+						if (rem < 0)
+							exit(1);
+						if (response() < 0)
+						    exit(1);
+					}
 				}
 				/* everything should be fine now */
 				(void) setuid(userid);
 
 				}
-
 			}
 			source(1, argv + i);
 		}
@@ -843,8 +868,9 @@ tolocal(int argc, char *argv[])
 		 * running acl cognizant version of rcp.
 		 */
 		aclflag = 1;
+		acl_aclflag = 1;
 
-		(void) snprintf(bp, buffersize, "%s -f %s", cmd_sunw, src);
+		(void) snprintf(bp, buffersize, "%s -Zf %s", cmd_sunw, src);
 		rem = rcmd_af(&host, portnumber, pwd->pw_name, suser,
 			    bp, 0, AF_INET6);
 
@@ -862,6 +888,24 @@ tolocal(int argc, char *argv[])
 		if (read(rem, &resp, sizeof (resp)) != sizeof (resp))
 			lostconn();
 		if (resp != 0) {
+
+			/*
+			 * Try again without ace_acl support
+			 */
+			acl_aclflag = 0;
+			(void) snprintf(bp, buffersize, "%s -f %s",
+			    cmd_sunw, src);
+			rem = rcmd_af(&host, portnumber, pwd->pw_name, suser,
+			    bp, 0, AF_INET6);
+
+			if (rem < 0) {
+				++errs;
+				continue;
+			}
+
+			if (read(rem, &resp, sizeof (resp)) != sizeof (resp))
+				lostconn();
+
 			/*
 			 * NOT ok:
 			 * The other side is running non-acl rcp.
@@ -1118,7 +1162,7 @@ source(int argc, char *argv[])
 		}
 
 		/* ACL support: send */
-		if (aclflag) {
+		if (aclflag | acl_aclflag) {
 			/* get acl from f and send it over */
 			if (sendacl(f) == ACL_FAIL) {
 				(void) close(f);
@@ -1414,7 +1458,7 @@ sink(int argc, char *argv[])
 		if (buf[0] == 'D') {
 			if (exists) {
 				if ((stb.st_mode&S_IFMT) != S_IFDIR) {
-					if (aclflag) {
+					if (aclflag | acl_aclflag) {
 						/*
 						 * consume acl in the pipe
 						 * fd = -1 to indicate the
@@ -1439,7 +1483,7 @@ sink(int argc, char *argv[])
 			}
 
 			/* acl support for directories */
-			if (aclflag) {
+			if (aclflag | acl_aclflag) {
 				int dfd;
 
 				if ((dfd = open(np, O_RDONLY)) == -1)
@@ -1486,7 +1530,7 @@ sink(int argc, char *argv[])
 		/*
 		 * ACL support: receiving
 		 */
-		if (aclflag) {
+		if (aclflag | acl_aclflag) {
 			/* get acl and set it to ofd */
 			if (recvacl(ofd, exists, pflag) == ACL_FAIL) {
 				(void) close(ofd);
@@ -1733,49 +1777,78 @@ static int
 sendacl(int f)
 {
 	int		aclcnt;
-	aclent_t	*aclbufp;
-	int		aclsize;
 	char		*acltext;
 	char		buf[BUFSIZ];
+	acl_t		*aclp;
+	char		acltype;
+	int		aclerror;
+	int		trivial;
 
-	if ((aclcnt = facl(f, GETACLCNT, 0, NULL)) < 0) {
-		error("can't get acl count \n");
+
+	aclerror = facl_get(f, ACL_NO_TRIVIAL, &aclp);
+	if (aclerror != 0) {
+		error("can't retrieve ACL: %s \n", acl_strerror(aclerror));
 		return (ACL_FAIL);
 	}
 
+	/*
+	 * if acl type is not ACLENT_T and were operating in acl_aclflag == 0
+	 * then don't do the malloc and facl(fd, getcntcmd,...);
+	 * since the remote side doesn't support alternate style ACL's.
+	 */
+
+	if (aclp && (acl_type(aclp) != ACLENT_T) && (acl_aclflag == 0)) {
+		aclcnt = MIN_ACL_ENTRIES;
+		acltype = 'A';
+		trivial = ACL_IS_TRIVIAL;
+	} else {
+
+		aclcnt = (aclp != NULL) ? acl_cnt(aclp) : 0;
+
+		if (aclp) {
+			acltype = (acl_type(aclp) != ACLENT_T) ? 'Z' : 'A';
+			aclcnt = acl_cnt(aclp);
+			trivial = (acl_flags(aclp) & ACL_IS_TRIVIAL);
+		} else {
+			acltype = 'A';
+			aclcnt = MIN_ACL_ENTRIES;
+			trivial = ACL_IS_TRIVIAL;
+		}
+
+	}
+
 	/* send the acl count over */
-	(void) snprintf(buf, sizeof (buf), "A%d\n", aclcnt);
+	(void) snprintf(buf, sizeof (buf), "%c%d\n", acltype, aclcnt);
 	(void) desrcpwrite(rem, buf, strlen(buf));
 
-	/* only send acl when it is non-trivial */
-	if (aclcnt > MIN_ACL_ENTRIES) {
-		aclsize = aclcnt * sizeof (aclent_t);
-		if ((aclbufp = (aclent_t *)malloc(aclsize)) == NULL) {
-			error("rcp: cant allocate memory: aclcnt %d\n",
-					aclcnt);
-			exit(1);
-		}
-		if (facl(f, GETACL, aclcnt, aclbufp) < 0) {
-			error("rcp: failed to get acl\n");
-			return (ACL_FAIL);
-		}
-		acltext = acltotext(aclbufp, aclcnt);
+	/*
+	 * only send acl when we have an aclp, which would
+	 * imply its not trivial.
+	 */
+	if (aclp && (trivial != ACL_IS_TRIVIAL)) {
+		acltext = acl_totext(aclp);
 		if (acltext == NULL) {
 			error("rcp: failed to convert to text\n");
+			acl_free(aclp);
 			return (ACL_FAIL);
 		}
 
 		/* send ACLs over: send the length first */
-		(void) snprintf(buf, sizeof (buf), "A%d\n", strlen(acltext));
+		(void) snprintf(buf, sizeof (buf), "%c%d\n",
+		    acltype, strlen(acltext));
 
 		(void) desrcpwrite(rem, buf, strlen(buf));
 		(void) desrcpwrite(rem, acltext, strlen(acltext));
 		free(acltext);
-		free(aclbufp);
-		if (response() < 0)
+		if (response() < 0) {
+			acl_free(aclp);
 			return (ACL_FAIL);
+		}
 
 	}
+
+	if (aclp)
+		acl_free(aclp);
 	return (ACL_OK);
 }
 
@@ -1783,7 +1856,7 @@ sendacl(int f)
  * Use this routine to get acl entry count and acl text size (in bytes)
  */
 static int
-getaclinfo(int *cnt)
+getaclinfo(int *cnt, int *acltype)
 {
 	char		buf[BUFSIZ];
 	char		*cp;
@@ -1793,7 +1866,15 @@ getaclinfo(int *cnt)
 	cp = buf;
 	if (desrcpread(rem, cp, 1) <= 0)
 		return (ACL_FAIL);
-	if (*cp++ != 'A') {
+
+	switch (*cp++) {
+	case 'A':
+		*acltype = 0;
+		break;
+	case 'Z':
+		*acltype = 1;
+		break;
+	default:
 		error("rcp: expect an ACL record, but got %c\n", *cp);
 		return (ACL_FAIL);
 	}
@@ -1829,15 +1910,24 @@ recvacl(int f, int exists, int preserve)
 	int		j;
 	char		*tp;
 	char		*acltext;	/* external format */
-	aclent_t	*aclbufp;	/* internal format */
+	acl_t		*aclp;
+	int		acltype;
+	int		min_entries;
+	int		aclerror;
 
 	/* get acl count */
-	if (getaclinfo(&aclcnt) != ACL_OK)
+	if (getaclinfo(&aclcnt, &acltype) != ACL_OK)
 		return (ACL_FAIL);
 
-	if (aclcnt > MIN_ACL_ENTRIES) {
+	if (acltype == 0) {
+		min_entries = MIN_ACL_ENTRIES;
+	} else {
+		min_entries = 1;
+	}
+
+	if (aclcnt > min_entries) {
 		/* get acl text size */
-		if (getaclinfo(&aclsize) != ACL_OK)
+		if (getaclinfo(&aclsize, &acltype) != ACL_OK)
 			return (ACL_FAIL);
 		if ((acltext = malloc(aclsize + 1)) == NULL) {
 			error("rcp: cant allocate memory: %d\n", aclsize);
@@ -1858,19 +1948,21 @@ recvacl(int f, int exists, int preserve)
 		*tp = '\0';
 
 		if (preserve || !exists) {
-			aclbufp = aclfromtext(acltext, &aclcnt);
-			if (aclbufp == NULL) {
-				error("rcp: failed to parse acl\n");
+			aclerror = acl_fromtext(acltext, &aclp);
+			if (aclerror != 0) {
+				error("rcp: failed to parse acl : %s\n",
+				    acl_strerror(aclerror));
 				return (ACL_FAIL);
 			}
+
 			if (f != -1) {
-				if (facl(f, SETACL, aclcnt, aclbufp) < 0) {
+				if (facl_set(f, aclp) < 0) {
 					error("rcp: failed to set acl\n");
 					return (ACL_FAIL);
 				}
 			}
 			/* -1 means that just consume the data in the pipe */
-			free(aclbufp);
+			acl_free(aclp);
 		}
 		free(acltext);
 		(void) desrcpwrite(rem, "", 1);
diff --git a/usr/src/cmd/compress/Makefile b/usr/src/cmd/compress/Makefile
index abd852409b03..135dcad54153 100644
--- a/usr/src/cmd/compress/Makefile
+++ b/usr/src/cmd/compress/Makefile
@@ -22,7 +22,7 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -34,6 +34,7 @@ include ../Makefile.cmd
 
 CFLAGS += $(CCVERBOSE)
 CPPFLAGS += -D_FILE_OFFSET_BITS=64
+LDLIBS += -lsec
 
 all: $(PROG) 
 
diff --git a/usr/src/cmd/compress/compress.c b/usr/src/cmd/compress/compress.c
index 53f505d84d37..dba163196b58 100644
--- a/usr/src/cmd/compress/compress.c
+++ b/usr/src/cmd/compress/compress.c
@@ -136,6 +136,7 @@ static char rcs_ident[] =
 #include <strings.h>
 #include <fcntl.h>
 #include <dirent.h>
+#include <aclutils.h>
 
 /*
  * Multi-byte handling for 'y' or 'n'
@@ -1602,8 +1603,8 @@ copystat(char *ifname, struct stat *ifstat, char *ofname)
 {
 	mode_t mode;
 	struct utimbuf timep;
-	int aclcnt;
-	aclent_t *aclp;
+	acl_t *aclp = NULL;	/* must read as NULL if acl_get() fails */
+	int error;
 
 	if (fclose(outp)) {
 		perror(ofname);
@@ -1654,41 +1655,20 @@ copystat(char *ifname, struct stat *ifstat, char *ofname)
 		if (chmod(ofname, mode))	 /* Copy modes */
 			perror(ofname);
 
-		/* Copy ACL info */
-		if ((aclcnt = acl(ifname, GETACLCNT, 0, NULL)) < 0) {
+		error = acl_get(ifname, ACL_NO_TRIVIAL, &aclp);
+		if (error != 0) {
 			(void) fprintf(stderr, gettext(
-			    "%s: failed to get acl count\n"),
-			    ifname);
+			    "%s: failed to retrieve acl : %s\n"),
+			    ifname, acl_strerror(error));
 			perm_stat = 1;
 		}
-		/*
-		 * Get ACL info: don't bother allocating space if
-		 * there are only standard permissions, i.e.,
-		 * ACL count < 4.
-		 */
-		if (aclcnt > MIN_ACL_ENTRIES) {
-			if ((aclp = (aclent_t *)malloc(
-			    sizeof (aclent_t) * aclcnt)) == NULL) {
-				(void) fprintf(stderr, gettext(
-				    "Insufficient memory\n"));
-				exit(1);
-			}
-			if (acl(ifname, GETACL, aclcnt, aclp) < 0) {
-				(void) fprintf(stderr, gettext(
-				    "%s: failed to get acl entries\n"),
-				    ifname);
-				perm_stat = 1;
-			} else {
-				if (acl(ofname, SETACL,
-				    aclcnt, aclp) < 0) {
-					(void) fprintf(stderr, gettext(
-					    "%s: failed to set acl "
-					    "entries\n"), ofname);
-					perm_stat = 1;
-				}
-			}
-			free(aclp);
+		if (aclp && (acl_set(ofname, aclp) < 0)) {
+			(void) fprintf(stderr, gettext("%s: failed to set acl "
+			    "entries\n"), ofname);
+			perm_stat = 1;
 		}
+		if (aclp)
+			acl_free(aclp);
 
 		/* Copy ownership */
 		(void) chown(ofname, ifstat->st_uid, ifstat->st_gid);
diff --git a/usr/src/cmd/cpio/cpio.c b/usr/src/cmd/cpio/cpio.c
index cd7e242e27ba..4a8e442dee31 100644
--- a/usr/src/cmd/cpio/cpio.c
+++ b/usr/src/cmd/cpio/cpio.c
@@ -66,6 +66,7 @@
 #include <libintl.h>
 #include <dirent.h>
 #include <limits.h>
+#include <aclutils.h>
 
 /*
  * Special kludge for off_t being a signed quantity.
@@ -170,7 +171,6 @@ static int g_read(int, int, char *, unsigned);
 static int g_write(int, int, char *, unsigned);
 static int is_floppy(int);
 static int is_tape(int);
-static int append_secattr(char **, int *, int, char *, char);
 static void write_ancillary(char *secinfo, int len);
 static int remove_dir(char *);
 static int save_cwd(void);
@@ -477,9 +477,9 @@ static struct sec_attr {
 } *attr;
 
 static int	Pflag = 0;	/* flag indicates that acl is preserved */
-static int	aclcnt = 0;	/* acl entry count */
-static aclent_t *aclp = NULL;	/* pointer to ACL */
-static int	acl_set = 0;	/* True if an acl was set on the file */
+static int	acl_is_set = 0; /* True if an acl was set on the file */
+
+static acl_t	*aclp = NULL;	/* ACL of the file being processed */
 
 /*
  *
 /*
  *
@@ -595,7 +595,7 @@ static struct xattr_buf	*xattrp;
 static struct xattr_buf	*xattr_linkp;
 static int 		xattrbadhead;	/* is extended attribute header bad? */
 
-static int	append_secattr(char **, int *, int, char *, char);
+static int	append_secattr(char **, int *, acl_t *);
 static void	write_ancillary(char *, int);
 
 /*
@@ -676,11 +676,10 @@ main(int argc, char **argv)
 				 * the next file.
 				 */
 				if (aclp != NULL) {
-					free(aclp);
-					aclcnt = 0;
+					acl_free(aclp);
 					aclp = NULL;
 				}
-				acl_set = 0;
+				acl_is_set = 0;
 			}
 			(void) memset(&Gen, 0, sizeof (Gen));
 		}
@@ -710,10 +709,9 @@ main(int argc, char **argv)
 				Hiddendir = 0;
 			}
 			if (aclp != NULL) {
-				free(aclp);
-				aclcnt = 0;
+				acl_free(aclp);
 				aclp = NULL;
-				acl_set = 0;
+				acl_is_set = 0;
 			}
 		}
 		write_trail();
@@ -736,10 +734,9 @@ main(int argc, char **argv)
 			}
 			passret = file_pass();
 			if (aclp != NULL) {
-				free(aclp);
-				aclcnt = 0;
+				acl_free(aclp);
 				aclp = NULL;
-				acl_set = 0;
+				acl_is_set = 0;
 			}
 			if (Gen.g_passdirfd != -1)
 				(void) close(Gen.g_passdirfd);
@@ -1577,9 +1574,9 @@ creat_lnk(int dirfd, char *name1_p, char *name2_p)
 		errno = 0;
 		if (!link(name1_p, name2_p)) {
 			if (aclp != NULL) {
-				free(aclp);
+				acl_free(aclp);
 				aclp = NULL;
-				acl_set = 0;
+				acl_is_set = 0;
 			}
 			cnt = 0;
 			break;
@@ -1709,16 +1706,16 @@ creat_spec(int dirfd)
 			    "file %s\"", G_p->g_attrfnam_p);
 		}
 
-		acl_set = 0;
+		acl_is_set = 0;
 		if (Pflag && aclp != NULL) {
-			if (facl(dirfd, SETACL, aclcnt, aclp) < 0) {
+			if (facl_set(dirfd, aclp) < 0) {
 				msg(ERRN,
 				    "failed to set acl on attribute"
 				    " directory of %s ", G_p->g_attrfnam_p);
 			} else {
-				acl_set = 1;
+				acl_is_set = 1;
 			}
-			free(aclp);
+			acl_free(aclp);
 			aclp = NULL;
 		}
 
@@ -1754,18 +1751,18 @@ creat_spec(int dirfd)
 			/* A file by the same name exists. */
 
 			/* Take care of ACLs */
-			acl_set = 0;
+			acl_is_set = 0;
 
 			if (Pflag && aclp != NULL) {
-				if (acl(nam_p, SETACL, aclcnt, aclp) < 0) {
+				if (acl_set(nam_p, aclp) < 0) {
 					msg(ERRN,
 					    "\"%s\": failed to set acl",
 					    nam_p);
 				} else {
-					acl_set = 1;
+					acl_is_set = 1;
 				}
 
-				free(aclp);
+				acl_free(aclp);
 				aclp = NULL;
 			}
 			if (Args & OCd) {
@@ -1829,17 +1826,17 @@ creat_spec(int dirfd)
 			 * The file creation succeeded.  Take care of the ACLs.
 			 */
 
-			acl_set = 0;
+			acl_is_set = 0;
 
 			if (Pflag && aclp != NULL) {
-				if (acl(nam_p, SETACL, aclcnt, aclp) < 0) {
+				if (acl_set(nam_p, aclp) < 0) {
 					msg(ERRN,
 					    "\"%s\": failed to set acl", nam_p);
 				} else {
-					acl_set = 1;
+					acl_is_set = 1;
 				}
 
-				free(aclp);
+				acl_free(aclp);
 				aclp = NULL;
 			}
 
@@ -2206,8 +2203,7 @@ data_out(void)
 			int	len = 0;
 
 			/* append security attributes */
-			if (append_secattr(&secinfo, &len, aclcnt,
-				(char *)aclp, UFSD_ACL) == -1) {
+			if (append_secattr(&secinfo, &len, aclp) == -1) {
 				msg(ERR,
 				    "can create security information");
 			}
@@ -2328,8 +2324,7 @@ data_out(void)
 		int	len = 0;
 
 		/* append security attributes */
-		if ((append_secattr(&secinfo, &len, aclcnt, (char *)aclp,
-		    UFSD_ACL)) == -1)
+		if ((append_secattr(&secinfo, &len, aclp)) == -1)
 			msg(ERR, "can create security information");
 
 		/* call append_secattr() if more than one */
@@ -2926,8 +2921,7 @@ file_out(void)
 			int	len = 0;
 
 			/* append security attributes */
-			if ((append_secattr(&secinfo, &len, aclcnt,
-			    (char *)aclp, UFSD_ACL)) == -1)
+			if ((append_secattr(&secinfo, &len, aclp)) == -1)
 				msg(ERR, "can create security information");
 
 			/* call append_secattr() if more than one */
@@ -3248,6 +3242,8 @@ gethdr(void)
 	char *preptr;
 	int k = 0;
 	int j;
+	int error;
+	int aclcnt;
 
 	Gen.g_nam_p = Nam_p;
 	do { /* hit == NONE && (Args & OCk) && Buffr.b_cnt > 0 */
@@ -3629,20 +3625,29 @@ gethdr(void)
 				attr = (struct sec_attr *)tp;
 				switch (attr->attr_type) {
 				case UFSD_ACL:
+				case ACE_ACL:
 					(void) sscanf(attr->attr_len, "%7lo",
 					    (ulong_t *)&aclcnt);
-				/* header is 8 */
+					/* header is 8 */
 					attrsize = 8 +
 					    strlen(&attr->attr_info[0])
 					    + 1;
-					aclp = aclfromtext(&attr->attr_info[0],
-					    &cnt);
-					if (aclp == NULL) {
-						msg(ERR, "aclfromtext failed");
+
+					error =
+					    acl_fromtext(&attr->attr_info[0],
+					    &aclp);
+
+					if (error != 0) {
+						msg(ERR,
+						    "aclfromtext failed: %s",
+						    acl_strerror(error));
+						bytes -= attrsize;
 						break;
 					}
-					if (aclcnt != cnt) {
+
+					if (aclcnt != acl_cnt(aclp)) {
 						msg(ERR, "acl count error");
+						bytes -= attrsize;
 						break;
 					}
 					bytes -= attrsize;
@@ -3909,21 +3914,10 @@ getname(void)
 	 * standard permissions, i.e. ACL count < 4
 	 */
 	if ((SrcSt.st_mode & Ftype) != S_IFLNK && Pflag) {
-		if ((aclcnt = acl(Gen.g_nam_p, GETACLCNT, 0, NULL)) < 0)
+		if (acl_get(Gen.g_nam_p, ACL_NO_TRIVIAL, &aclp) != 0)
 			msg(ERRN, "Error with acl() of \"%s\"", Gen.g_nam_p);
-		if (aclcnt > MIN_ACL_ENTRIES) {
-			aclp = e_zalloc(E_EXIT, sizeof (aclent_t) * aclcnt);
-
-			if (acl(Gen.g_nam_p, GETACL, aclcnt, aclp) < 0) {
-				msg(ERRN,
-				    "Error with getacl() of \"%s\"",
-				    Gen.g_nam_p);
-				free(aclp);
-				aclp = NULL;
-			}
-		}
-	/* else: only traditional permissions, so proceed as usual */
 	}
+	/* else: only traditional permissions, so proceed as usual */
 	if (creat_hdr())
 		return (1);
 	else return (2);
@@ -4332,17 +4326,16 @@ openout(int dirfd)
 			if ((result = openat(dirfd, get_component(nam_p),
 			    O_CREAT|O_RDWR|O_TRUNC, (int)G_p->g_mode)) >= 0) {
 				/* acl support */
-				acl_set = 0;
+				acl_is_set = 0;
 				if (Pflag && aclp != NULL) {
-					if (facl(result, SETACL, aclcnt, aclp)
-					    < 0) {
+					if (facl_set(result, aclp) < 0) {
 						msg(ERRN,
 						    "\"%s\": failed to set acl",
 						    nam_p);
 					} else {
-						acl_set = 1;
+						acl_is_set = 1;
 					}
-					free(aclp);
+					acl_free(aclp);
 					aclp = NULL;
 				}
 				cnt = 0;
@@ -4879,7 +4872,7 @@ rstfiles(int over, int dirfd)
 		mode_t orig_mask, new_mask;
 		struct stat sbuf;
 
-		if (!(Pflag && acl_set)) {
+		if (!(Pflag && acl_is_set)) {
 			/* Acl was not set, so we must chmod */
 			if (LSTAT(dirfd, G_p->g_nam_p, &sbuf) == 0) {
 				if ((sbuf.st_mode & Ftype) != S_IFLNK) {
@@ -4927,7 +4920,7 @@ rstfiles(int over, int dirfd)
 			set_tym(dirfd, get_component(onam_p),
 			    G_p->g_mtime, G_p->g_mtime);
 		}
-		if (!acl_set) {
+		if (!acl_is_set) {
 			if (G_p->g_attrnam_p != (char *)NULL) {
 				error = fchmod(Ofile, (int)G_p->g_mode);
 			} else {
@@ -6636,11 +6629,9 @@ is_floppy(int fd)
  */
 static int
 append_secattr(
-	char	**secinfo,	/* existing security info */
-	int	*secinfo_len,	/* length of existing security info */
-	int	size,		/* new attribute size: unit depends on type */
-	char	*attrp,		/* new attribute data pointer */
-	char	attr_type)	/* new attribute type */
+	char		**secinfo,	/* existing security info */
+	int		*secinfo_len,	/* length of existing security info */
+	acl_t		*aclp) 	/* new attribute data pointer */
 {
 	char	*new_secinfo;
 	char	*attrtext;
@@ -6648,14 +6639,15 @@ append_secattr(
 	int	oldsize;
 
 	/* no need to add */
-	if (attrp == (char *)NULL) {
+	if (aclp == NULL) {
 		return (0);
 	}
 
-	switch (attr_type) {
-	case UFSD_ACL:
+	switch (acl_type(aclp)) {
+	case ACLENT_T:
+	case ACE_T:
 		/* LINTED alignment */
-		attrtext = acltotext((aclent_t *)attrp, size);
+		attrtext = acl_totext(aclp);
 		if (attrtext == NULL) {
 			(void) fprintf(stderr, "acltotext failed\n");
 			return (-1);
@@ -6667,9 +6659,10 @@ append_secattr(
 			(void) fprintf(stderr, "can't allocate memory\n");
 			return (-1);
 		}
-		attr->attr_type = '1';		/* UFSD_ACL */
+		attr->attr_type = (acl_type(aclp) == ACLENT_T) ?
+		    UFSD_ACL : ACE_ACL;
 		/* acl entry count */
-		(void) sprintf(attr->attr_len, "%06o", size);
+		(void) sprintf(attr->attr_len, "%06o", acl_cnt(aclp));
 		(void) strcpy((char *)&attr->attr_info[0], attrtext);
 		free(attrtext);
 		break;
@@ -6853,11 +6846,10 @@ xattrs_out(int (*func)())
 	 * If aclp still exists then free it since it is was set when base
 	 * file was extracted.
 	 */
-	if (aclp != (aclent_t *)NULL) {
-		free(aclp);
-		aclcnt = 0;
+	if (aclp != NULL) {
+		acl_free(aclp);
 		aclp = NULL;
-		acl_set = 0;
+		acl_is_set = 0;
 	}
 
 	Gen.g_dirfd = attropen(G_p->g_nam_p, ".", O_RDONLY);
@@ -6943,24 +6935,11 @@ xattrs_out(int (*func)())
 				free(namep);
 				continue;
 			}
-			if ((aclcnt = facl(filefd, GETACLCNT,
-			    0, NULL)) < 0) {
+			if (facl_get(filefd, ACL_NO_TRIVIAL, &aclp) != 0) {
 				msg(ERRN,
 				    "Error with acl() on %s",
 				    Gen.g_nam_p);
 			}
-			if (aclcnt > MIN_ACL_ENTRIES) {
-				aclp = e_zalloc(E_EXIT,
-					sizeof (aclent_t) * aclcnt);
-
-				if (facl(filefd, GETACL, aclcnt, aclp) < 0) {
-					msg(ERRN,
-					    "Error with getacl() on %s",
-					    Gen.g_nam_p);
-					free(aclp);
-					aclp = NULL;
-				}
-			}
 			(void) close(filefd);
 		}
 		(void) creat_hdr();
@@ -6973,11 +6952,10 @@ xattrs_out(int (*func)())
 		Gen.g_attrfnam_p = (char *)NULL;
 		Gen.g_linktoattrfnam_p = (char *)NULL;
 		Gen.g_linktoattrnam_p = (char *)NULL;
-		if (aclp != (aclent_t *)NULL) {
-			free(aclp);
-			aclcnt = 0;
+		if (aclp != NULL) {
+			acl_free(aclp);
 			aclp = NULL;
-			acl_set = 0;
+			acl_is_set = 0;
 		}
 		free(namep);
 	}
diff --git a/usr/src/cmd/devfsadm/Makefile.com b/usr/src/cmd/devfsadm/Makefile.com
index 4d7268b042d9..41acb0192e79 100644
--- a/usr/src/cmd/devfsadm/Makefile.com
+++ b/usr/src/cmd/devfsadm/Makefile.com
@@ -62,6 +62,7 @@ LINK_SRCS =			\
 	$(COMMON)/sgen_link.c	\
 	$(COMMON)/md_link.c	\
 	$(COMMON)/dtrace_link.c	\
+	$(COMMON)/zfs_link.c	\
 	$(MISC_LINK_ISA).c
 
 LINT_MODULES = $(LINK_SRCS:.c=.ln)
@@ -82,6 +83,7 @@ LINK_OBJS =			\
 	sgen_link.o		\
 	md_link.o		\
 	dtrace_link.o		\
+	zfs_link.o		\
 	$(MISC_LINK_ISA).o
 
 LINK_MODS =			\
@@ -100,6 +102,7 @@ LINK_MODS =			\
 	SUNW_sgen_link.so	\
 	SUNW_md_link.so		\
 	SUNW_dtrace_link.so	\
+	SUNW_zfs_link.so	\
 	SUNW_$(MISC_LINK_ISA).so
 
 DEVLINKTAB = devlink.tab
diff --git a/usr/src/cmd/devfsadm/zfs_link.c b/usr/src/cmd/devfsadm/zfs_link.c
new file mode 100644
index 000000000000..50148f3ae6db
--- /dev/null
+++ b/usr/src/cmd/devfsadm/zfs_link.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <regex.h>
+#include <devfsadm.h>
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <sys/mkdev.h>
+#include <sys/fs/zfs.h>
+
+/* zfs and zvol name info */
+
+#define	ZVOL_LINK_RE_DEVICES	"zvol/r?dsk/.*/.*$"
+
+static int zfs(di_minor_t minor, di_node_t node);
+
+/*
+ * devfs create callback register
+ */
+static devfsadm_create_t zfs_create_cbt[] = {
+	{ "pseudo", "ddi_pseudo", ZFS_DRIVER,
+	    TYPE_EXACT | DRV_EXACT, ILEVEL_0, zfs,
+	},
+};
+DEVFSADM_CREATE_INIT_V0(zfs_create_cbt);
+
+/*
+ * devfs cleanup register
+ */
+static devfsadm_remove_t zfs_remove_cbt[] = {
+	{ "pseudo", ZVOL_LINK_RE_DEVICES, RM_HOT | RM_POST,
+	    ILEVEL_0, devfsadm_rm_all },
+};
+DEVFSADM_REMOVE_INIT_V0(zfs_remove_cbt);
+
+/*
+ * Create the /dev links for one minor node of the zfs driver.
+ *
+ * For the zfs control node (minor name ZFS_DRIVER):
+ *	/dev/zfs -> /devices/pseudo/zfs@0:zfs
+ * For zvols (minor names "<minor>c" and "<minor>c,raw"), the public name
+ * comes from the ZVOL_PROP_NAME property:
+ *	ZVOL_DEV_DIR/<name>  -> block node, e.g. /devices/pseudo/zfs@0:1c
+ *	ZVOL_RDEV_DIR/<name> -> raw node, e.g. /devices/pseudo/zfs@0:1c,raw
+ * Minors without a usable name property, or with any other minor name,
+ * get no link.  Always returns DEVFSADM_CONTINUE.
+ */
+static int
+zfs(di_minor_t minor, di_node_t node)
+{
+	dev_t	dev;
+	int	err;
+	char *mn;
+	char blkname[MAXNAMELEN + 1];
+	char rawname[MAXNAMELEN + 1];
+	char path[PATH_MAX + 1];
+	char *name;
+
+	/*
+	 * Use the minor name in place rather than strcpy()ing it into a
+	 * fixed-size buffer, which was unbounded.
+	 */
+	mn = di_minor_name(minor);
+
+	if (strcmp(mn, ZFS_DRIVER) == 0) {
+		(void) devfsadm_mklink(ZFS_DRIVER, node, minor, 0);
+	} else {
+		dev = di_minor_devt(minor);
+		err = di_prop_lookup_strings(dev, node, ZVOL_PROP_NAME, &name);
+		if (err <= 0) {
+			/*
+			 * Property missing, or it carries no string value
+			 * (name is then not valid), so can't do anything.
+			 */
+			return (DEVFSADM_CONTINUE);
+		}
+		(void) snprintf(blkname, sizeof (blkname), "%dc",
+		    (int)minor(dev));
+		(void) snprintf(rawname, sizeof (rawname), "%dc,raw",
+		    (int)minor(dev));
+
+		/*
+		 * This is where the actual public name gets constructed.
+		 * Change the snprintf format to change the public
+		 * path that gets constructed.
+		 */
+		if (strcmp(mn, blkname) == 0) {
+			(void) snprintf(path, sizeof (path), "%s/%s",
+			    ZVOL_DEV_DIR, name);
+		} else if (strcmp(mn, rawname) == 0) {
+			(void) snprintf(path, sizeof (path), "%s/%s",
+			    ZVOL_RDEV_DIR, name);
+		} else {
+			return (DEVFSADM_CONTINUE);
+		}
+
+		(void) devfsadm_mklink(path, node, minor, 0);
+	}
+	return (DEVFSADM_CONTINUE);
+}
diff --git a/usr/src/cmd/dfs.cmds/shareall/shareall.sh b/usr/src/cmd/dfs.cmds/shareall/shareall.sh
index 0e6534e41b0f..87645837787a 100644
--- a/usr/src/cmd/dfs.cmds/shareall/shareall.sh
+++ b/usr/src/cmd/dfs.cmds/shareall/shareall.sh
@@ -88,3 +88,10 @@ then
 else			# for every file system ...
 	cat $infile|/sbin/sh
 fi
+
+# If the user has ZFS installed, invoke 'zfs share -a' to share all ZFS
+# filesystems.
+if [ -x /usr/sbin/zfs ]
+then
+	/usr/sbin/zfs share -a
+fi
diff --git a/usr/src/cmd/find/Makefile b/usr/src/cmd/find/Makefile
index 78d41d0b4933..d43075a5a6cb 100644
--- a/usr/src/cmd/find/Makefile
+++ b/usr/src/cmd/find/Makefile
@@ -22,8 +22,8 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright (c) 1989,1996 by Sun Microsystems, Inc.
-# All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
 #
 
 PROG= find
@@ -34,6 +34,8 @@ include ../Makefile.cmd
 $(XPG4) := CFLAGS += -DXPG4
 CPPFLAGS += -D_FILE_OFFSET_BITS=64
 
+LDLIBS += -lsec
+
 .KEEP_STATE:
 
 all: $(PROG) $(XPG4)
diff --git a/usr/src/cmd/find/find.c b/usr/src/cmd/find/find.c
index dd8fab87ad3e..e6944d0031dd 100644
--- a/usr/src/cmd/find/find.c
+++ b/usr/src/cmd/find/find.c
@@ -60,7 +60,6 @@
 #include <langinfo.h>
 #include <ftw.h>
 
-
 #define	A_DAY		(long)(60*60*24)	/* a day full of seconds */
 #define	A_MIN		(long)(60)
 #define	BLKSIZ		512
@@ -964,8 +963,7 @@ struct FTW *state;
 			 * nftw()) of the file
 			 */
 			filename = gettail(name);
-			val = (acl(filename, GETACLCNT, 0, NULL) >
-			    MIN_ACL_ENTRIES);
+			val = (acl_trivial(filename) == 1);
 			break;
 		}
 		/*
 		/*
@@ -1352,7 +1350,7 @@ list(file, stp)
 	struct stat *stp;
 {
 	char pmode[32], uname[32], gname[32], fsize[32], ftime[32];
-
+	int trivial;
 
 /*
  * Each line below contains the relevant permission (column 1) and character
@@ -1456,7 +1454,11 @@ list(file, stp)
 
 	tailname = gettail(file);
 
-	if (acl(tailname, GETACLCNT, 0, NULL) > MIN_ACL_ENTRIES)
+	trivial = acl_trivial(tailname);
+	if (trivial == -1)
+		trivial =  0;
+
+	if (trivial == 1)
 		pmode[permoffset(who) + 1] = '+';
 	else
 		pmode[permoffset(who) + 1] = ' ';
diff --git a/usr/src/cmd/fm/dicts/Makefile b/usr/src/cmd/fm/dicts/Makefile
index 4cd171fd1cce..d2ac4d192b91 100644
--- a/usr/src/cmd/fm/dicts/Makefile
+++ b/usr/src/cmd/fm/dicts/Makefile
@@ -31,7 +31,8 @@ common_DCNAMES = \
 	FMD \
 	SMF \
 	SUNOS \
-	PCI
+	PCI \
+	ZFS
 
 sparc_DCNAMES = \
 	SUN4U \
diff --git a/usr/src/cmd/fm/dicts/ZFS.dict b/usr/src/cmd/fm/dicts/ZFS.dict
new file mode 100644
index 000000000000..0166183535ad
--- /dev/null
+++ b/usr/src/cmd/fm/dicts/ZFS.dict
@@ -0,0 +1,41 @@
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# DO NOT EDIT -- this file is generated by the Event Registry.
+#
+
+FMDICT: name=ZFS version=1 maxkey=1
+
+ereport.fs.zfs.pool.corrupt_cache=1
+ereport.fs.zfs.device.missing_r=2
+ereport.fs.zfs.device.missing_nr=3
+ereport.fs.zfs.device.corrupt_label_r=4
+ereport.fs.zfs.device.corrupt_label_nr=5
+ereport.fs.zfs.pool.bad_guid_sum=6
+ereport.fs.zfs.pool.corrupt_pool=7
+ereport.fs.zfs.object.corrupt_data=8
+ereport.fs.zfs.device.failing=9
+ereport.fs.zfs.device.version_mismatch=10
diff --git a/usr/src/cmd/fm/dicts/ZFS.po b/usr/src/cmd/fm/dicts/ZFS.po
new file mode 100644
index 000000000000..ea5a9c619548
--- /dev/null
+++ b/usr/src/cmd/fm/dicts/ZFS.po
@@ -0,0 +1,188 @@
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# DO NOT EDIT -- this file is generated by the Event Registry.
+#
+#
+# code: ZFS-8000-14
+# keys: ereport.fs.zfs.pool.corrupt_cache
+#
+msgid "ZFS-8000-14.type"
+msgstr "Error"
+msgid "ZFS-8000-14.severity"
+msgstr "Critical"
+msgid "ZFS-8000-14.description"
+msgstr "The ZFS cache file is corrupted.  Refer to %s for more information."
+msgid "ZFS-8000-14.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-14.impact"
+msgstr "ZFS filesystems are not available"
+msgid "ZFS-8000-14.action"
+msgstr "\nTo determine which pools are available for import, run the 'zpool import'\ncommand:\n\n\n# zpool import\n  pool: test\n    id: 12743384782310107047\n state: ONLINE\naction: The pool can be imported using its name or numeric identifier.\nconfig:\n\n        test              ONLINE\n          c0t0d0          ONLINE\n#\n\n\nThis will automatically scan /dev/dsk for any devices\npart of a pool.  If you previously had storage pools with devices in a\ndifferent directory, use the '-d' option to 'zpool import' to scan alternate\nlocations.\n\nOnce you have determined which pools are available for import, you can\nimport the pool explicitly by specifying the name or numeric identifier:\n\n\n# zpool import test\n#\n\n\nAlternately, you can import all available pools by specifying the '-a'\noption.  Once a pool has been imported, the ZFS cache will be repaired so\nthat the pool will appear normally in the future.\n	"
+#
+# code: ZFS-8000-2Q
+# keys: ereport.fs.zfs.device.missing_r
+#
+msgid "ZFS-8000-2Q.type"
+msgstr "Error"
+msgid "ZFS-8000-2Q.severity"
+msgstr "Major"
+msgid "ZFS-8000-2Q.description"
+msgstr "A device in a replicated configuration could not be\n	    opened.  Refer to %s for more information."
+msgid "ZFS-8000-2Q.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-2Q.impact"
+msgstr "The pool is no longer providing the configured level of\n	    replication."
+msgid "ZFS-8000-2Q.action"
+msgstr "\nIf this error was encountered while running 'zpool import', please see the\nsection below.  Otherwise, run 'zpool status -x' to determine which pool has\nexperienced a failure:\n\n\n# zpool status -x\n  pool: test\n state: DEGRADED\nstatus: One or more devices could not be opened.  Sufficient replicas exist for\n        the pool to continue functioning in a degraded state.\naction: Attach the missing device and online it using 'zpool online'.\n   see: http://www.sun.com/msg/ZFS-8000-2Q\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  DEGRADED     0     0     0\n          mirror              DEGRADED     0     0     0\n            c0t0d0            ONLINE       0     0     0\n            c0t0d1            FAULTED      0     0     0  cannot open\n#\n\n\nDetermine which device failed to open by looking for a FAULTED device with\nan additional \"cannot open\" message.  If this device has been inadvertently\nremoved from the system, attach the device and bring it online with 'zpool\nonline':\n\n\n# zpool online test c0t0d1\nBringing device 'c0t0d1' online\n#\n\n\nIf the device is no longer available, the device can be replaced using the\n'zpool replace' command:\n\n\n# zpool replace test c0t0d1 c0t0d2\n#\n\n\nExisting data will be resilvered to the new device.  Once the resilvering\ncompletes, the device will be removed from the pool.\n\n\nIf this error is encountered during a 'zpool import', it means that one of\nthe devices is not attached to the system:\n\n\n# zpool import\n  pool: test\n    id: 10121266328238932306\n state: DEGRADED\nstatus: One or more devices are missing from the system.\naction: The pool can be imported despite missing or damaged devices.  
The\n        fault tolerance of the pool may be compromised if imported.\n   see: http://www.sun.com/msg/ZFS-8000-2Q\nconfig:\n\n        test              DEGRADED\n          mirror          DEGRADED\n            c0t0d0        ONLINE\n            c0t0d1        FAULTED   cannot open\n\n\nUnlike when the pool is active on the system, the device cannot be replaced\nwhile the pool is exported.  If the device can be attached to the system,\nattach the device and run 'zpool import' again.\n\nAlternatively, the pool can be imported as-is, though it will be placed in\nthe DEGRADED state due to a missing device.  Once the pool has been\nimported, the missing device can be replaced as described above.\n	"
+#
+# code: ZFS-8000-3C
+# keys: ereport.fs.zfs.device.missing_nr
+#
+msgid "ZFS-8000-3C.type"
+msgstr "Error"
+msgid "ZFS-8000-3C.severity"
+msgstr "Critical"
+msgid "ZFS-8000-3C.description"
+msgstr "A device could not be opened and no replicas are available.  Refer to %s for more information."
+msgid "ZFS-8000-3C.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-3C.impact"
+msgstr "The pool is no longer available"
+msgid "ZFS-8000-3C.action"
+msgstr "\nIf this error was encountered while running 'zpool import', please see the\nsection below.  Otherwise, run 'zpool status -x' to determine which pool\nhas experienced a failure:\n\n\n# zpool status -x\n  pool: test\n state: FAULTED\nstatus: One or more devices could not be opened.  There are insufficient\n	replicas for the pool to continue functioning.\naction: Attach the missing device and online it using 'zpool online'.\n   see: http://www.sun.com/msg/ZFS-8000-3C\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  FAULTED      0     0     0  insufficient replicas\n          c0t0d0              ONLINE       0     0     0\n          c0t0d1              FAULTED      0     0     0  cannot open\n#\n\n\nAttach the device to the system and run 'zpool status' again.  The pool\nshould automatically detect the newly attached device and resume\nfunctioning.  You may have to mount the filesystems in the pool explicitly\nusing 'zfs mount'.\n\nIf the device is no longer available and cannot be reattached to the system,\nthen the pool must be destroyed and re-created from a backup source.\n\n\nIf this error is encountered during a 'zpool import', it means that one of\nthe devices is not attached to the system:\n\n\n# zpool import\n  pool: test\n    id: 10121266328238932306\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported.  Attach the missing devices and\n	try again.\n   see: http://www.sun.com/msg/ZFS-8000-3C\nconfig:\n\n        test              FAULTED   insufficient replicas\n          c0t0d0          ONLINE\n          c0t0d1          FAULTED   cannot open\n\n\nThe pool cannot be imported until the missing device is attached to the\nsystem.  If the device has been made available in an alternate location, you\ncan use the '-d' option to 'zpool import' to search for devices in a\ndifferent directory.\n	"
+#
+# code: ZFS-8000-4J
+# keys: ereport.fs.zfs.device.corrupt_label_r
+#
+msgid "ZFS-8000-4J.type"
+msgstr "Error"
+msgid "ZFS-8000-4J.severity"
+msgstr "Major"
+msgid "ZFS-8000-4J.description"
+msgstr "A device could not be opened due to a missing or invalid\n	    device label.  Refer to %s for more information."
+msgid "ZFS-8000-4J.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-4J.impact"
+msgstr "The pool is no longer providing the configured level of\n	    replication."
+msgid "ZFS-8000-4J.action"
+msgstr "\nIf this error is encountered while running 'zpool import', see the section\nbelow.  Otherwise, run 'zpool status -x' to determine which pool has the\ndamaged device:\n\n\n# zpool status -x\n  pool: test\n state: DEGRADED\nstatus: One or more devices could not be used because the label is missing or\n        invalid.  Sufficient replicas exist for the pool to continue\n        functioning in a degraded state.\naction: Replace the device using 'zpool replace'.\n   see: http://www.sun.com/msg/ZFS-8000-4J\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  DEGRADED     0     0     0\n          mirror              DEGRADED     0     0     0\n            c0t0d0            ONLINE       0     0     0\n            c0t0d1            FAULTED      0     0     0  corrupted data\n\n\nDetermine which device is damaged by locating the FAULTED device showing\n\"corrupted data\".  This indicates that the device label was corrupt.  Because\nZFS could not identify the device as the one expected, no automatic resilvering\nwill take place.\n\nThe device can be resilvered by issuing 'zpool replace':\n\n\n# zpool replace test c0t0d1\n\n\nThis will replace the device in situ.  To replace the device with another,\ndifferent, device, run 'zpool replace' with an additional argument specifying\nthe new device:\n\n\n# zpool replace test c0t0d1 c0t0d2\n\n\nZFS will being migrating data to the new device as soon as the replace is\nissued.  Once the resilvering completes, the original device (if different from\nthe replacement) will be removed, and the pool will be restored to the ONLINE\nstate.\n\n\nIf this error is encountered while running 'zpool import', the pool can be still\nbe imported despite the failure:\n\n\n# zpool import\n  pool: test\n    id: 5187963178597328409\n state: DEGRADED\nstatus: One or more devices contains corrupted data.  
The fault tolerance of\n	the pool may be compromised if imported.\naction: The pool can be imported using its name or numeric identifier.\n   see: http://www.sun.com/msg/ZFS-8000-4J\nconfig:\n\n        test              DEGRADED\n          mirror          DEGRADED\n            /disk/a       ONLINE\n            /disk/b       FAULTED   corrupted data\n\n\nTo import the pool, run 'zpool import':\n\n\n# zpool import test\n\n\nOnce the pool has been imported, the damaged device can be replaced according to\nthe above procedure.\n	"
+#
+# code: ZFS-8000-5E
+# keys: ereport.fs.zfs.device.corrupt_label_nr
+#
+msgid "ZFS-8000-5E.type"
+msgstr "Error"
+msgid "ZFS-8000-5E.severity"
+msgstr "Critical"
+msgid "ZFS-8000-5E.description"
+msgstr "A device could not be opened due to a missing or invalid\n	    device label and no replicas are available.  Refer to %s for more information."
+msgid "ZFS-8000-5E.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-5E.impact"
+msgstr "The pool is no longer available"
+msgid "ZFS-8000-5E.action"
+msgstr "\nIf this error is encountered during 'zpool import', see the section below.\nOtherwise, run 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n  pool: test\n state: FAULTED\nstatus: One or more devices could not be used because the label is missing \n        or invalid.  There are insufficient replicas for the pool to continue\n        functioning.\naction: Destroy and re-create the pool from a backup source.\n   see: http://www.sun.com/msg/ZFS-8000-5E\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  FAULTED      0     0     0  insufficient replicas\n          c0t0d0              FAULTED      0     0     0  corrupted data\n          c0t0d1              ONLINE       0     0     0\n\n\nThe device listed as FAULTED with \"corrupted data\" cannot be opened due to a\ncorrupt label.  ZFS will be unable to use the pool, and all data within the pool\nis irrevocably lost.  The pool must be destroyed and recreated from an\nappropriate backup source.  Using replicated configurations will prevent this\nfrom happening in the future.\n\n\nIf this error is encountered during 'zpool import', the action is the same.  The\npool cannot be imported - all data is lost and must be restored from an\nappropriate backup source.\n	"
+#
+# code: ZFS-8000-6X
+# keys: ereport.fs.zfs.pool.bad_guid_sum
+#
+msgid "ZFS-8000-6X.type"
+msgstr "Error"
+msgid "ZFS-8000-6X.severity"
+msgstr "Critical"
+msgid "ZFS-8000-6X.description"
+msgstr "One or more top level devices are missing.  Refer to %s for more information."
+msgid "ZFS-8000-6X.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-6X.impact"
+msgstr "The pool cannot be imported"
+msgid "ZFS-8000-6X.action"
+msgstr "\nRun 'zpool import' to list which pool cannot be imported:\n\n\n# zpool import\n  pool: test\n    id: 13783646421373024673\n state: FAULTED\nstatus: One or more devices are missing from the system.\naction: The pool cannot be imported.  Attach the missing\n	devices and try again.\n   see: http://www.sun.com/msg/ZFS-8000-6X\nconfig:\n\n        test              FAULTED   missing device\n          c0t0d0          ONLINE\n\n        Additional devices are known to be part of this pool, though their\n        exact configuration cannot be determined.\n\n\nZFS attempts to store enough configuration data on the devices such that the\nconfiguration is recoverable from any subset of devices.  In some cases,\nparticularly when an entire toplevel virtual device is not attached to the\nsystem, ZFS will be unable to determine the complete configuration.  It will\nalways detect that these devices are missing, even if it cannot identify all of\nthe devices.\n\nThe unknown missing devices must be attached to the system, at which point\n'zpool import' can be used to import the pool.\n	"
+#
+# code: ZFS-8000-72
+# keys: ereport.fs.zfs.pool.corrupt_pool
+#
+msgid "ZFS-8000-72.type"
+msgstr "Error"
+msgid "ZFS-8000-72.severity"
+msgstr "Critical"
+msgid "ZFS-8000-72.description"
+msgstr "The metadata required to open the pool is corrupt.  Refer to %s for more information."
+msgid "ZFS-8000-72.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-72.impact"
+msgstr "The pool is no longer available"
+msgid "ZFS-8000-72.action"
+msgstr "\nIf this error is encountered during 'zpool import', see the section below.\nOtherwise, run 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n# zpool import\n  pool: test\n    id: 13783646421373024673\n state: FAULTED\nstatus: The pool metadata is corrupted and cannot be opened.\naction: Destroy the pool and restore from backup.\n   see: http://www.sun.com/msg/ZFS-8000-72\nconfig:\n\n        test              FAULTED   corrupted data\n          c0t0d0          ONLINE\n	  c0t0d1	  ONLINE\n\n\nEven though all the devices are available, the on-disk data has been corrupted\nsuch that the pool cannot be opened.  All data within the pool is lost, and the\npool must be destroyed and restored from an appropriate backup source.\n\n\nIf this error is encountered during 'zpool import', the pool is unrecoverable\nand cannot be imported.  The pool must be restored from an appropriate backup\nsource.\n	"
+#
+# code: ZFS-8000-8A
+# keys: ereport.fs.zfs.object.corrupt_data
+#
+msgid "ZFS-8000-8A.type"
+msgstr "Error"
+msgid "ZFS-8000-8A.severity"
+msgstr "Critical"
+msgid "ZFS-8000-8A.description"
+msgstr "A file or directory could not be read due to corrupt data.  Refer to %s for more information."
+msgid "ZFS-8000-8A.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-8A.impact"
+msgstr "The file or directory is unavailable."
+msgid "ZFS-8000-8A.action"
+msgstr "\nRun 'zpool status -x' to determine which pool is damaged:\n\n\n# zpool status -x\n  pool: test\n state: ONLINE\nstatus: One or more devices has experienced an error and no valid replicas\n        are available.  Some filesystem data is corrupt, and applications\n        may have been affected.\naction: Destroy the pool and restore from backup.\n   see: http://www.sun.com/msg/ZFS-8000-8A\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  ONLINE       0     0     2\n          c0t0d0              ONLINE       0     0     2\n          c0t0d1              ONLINE       0     0     0\n\n\nUnfortunately, the data cannot be repaired, and the only choice to repair the\ndata is to restore the pool from backup.  Applications attempting to access the\ncorrupted data will get an error (EIO), and data may be permanently lost.\n	"
+#
+# code: ZFS-8000-9P
+# keys: ereport.fs.zfs.device.failing
+#
+msgid "ZFS-8000-9P.type"
+msgstr "Error"
+msgid "ZFS-8000-9P.severity"
+msgstr "Minor"
+msgid "ZFS-8000-9P.description"
+msgstr "A device has experienced uncorrectable errors in a\n	    replicated configuration.  Refer to %s for more information."
+msgid "ZFS-8000-9P.response"
+msgstr "ZFS has attempted to repair the affected data."
+msgid "ZFS-8000-9P.impact"
+msgstr "The system is unaffected.  The detected errors may\n	    indicate future failure."
+msgid "ZFS-8000-9P.action"
+msgstr "\nRun 'zpool status -x' to determine which pool has experienced errors:\n\n\n# zpool status\n  pool: test\n state: ONLINE\nstatus: One or more devices has experienced an unrecoverable error.  An\n        attempt was made to correct the error.  Applications are unaffected.\naction: Determine if the device needs to be replaced, and clear the errors\n        using 'zpool online' or replace the device with 'zpool replace'.\n   see: http://www.sun.com/msg/ZFS-8000-9P\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  ONLINE       0     0     0\n          mirror              ONLINE       0     0     0\n            c0t0d0            ONLINE       0     0     2\n            c0t0d1            ONLINE       0     0     0\n\n\nFind the device with a non-zero error count for READ, WRITE, or CKSUM.  This\nindicates that the device has experienced a read I/O error, write I/O error, or\nchecksum validation error.  Because the device is part of a mirror or RAID-Z\ndevice, ZFS was able to recover from the error and subsequently repair the\ndamaged data.\n\nThese error counts may or may not indicate that the device needs replacement.\nIt depends on how the errors were caused, which the administrator needs to\ndetermine.  For example, the following cases will all produce errors that do not\nindicate potential device failure:\n\n\nA network attached device lost connectivity but has now\nrecovered\nA device suffered from a bit flip, and expected event over long\nperiods of time\nAn adminstrator accidentally wrote over a portion of the disk using\nanother program\n\n\nIn these cases, the presence of errors does not indicate that the device is\nlikely to fail in the future, and therefore does not need to be replaced.  
If\nthis is the case, then the device errors should be cleared using 'zpool online':\n\n\n# zpool online test c0t0d0\n\n\nOn the other hand, errors may very well indicate that the device has failed or\nis about to fail.  If there are continual I/O errors to a device that is\notherwise attached and functioning on the system, it most likely needs to be\nreplaced.   The administrator should check the system log for any driver\nmessages that may indicate hardware failure.  If it is determined that the\ndevice needs to be replaced, then the 'zpool replace' command should be used:\n\n\n# zpool replace test c0t0d0 c0t0d2\n\n\nThis will attach the new device to the pool and begin resilvering data to it.\nOnce the resilvering process is complete, the old device will automatically be\nremoved from the pool, at which point it can safely be removed from the system.\nIf the device needs to be replaced in-place (because there are no available\nspare devices), the original device can be removed and replaced with a new\ndevice, at which point a different form of 'zpool replace' can be used:\n\n\n# zpool replace test c0t0d0\n\n\nThis assumes that the original device at 'c0t0d0' has been replaced with a new\ndevice under the same path, and will be replaced appropriately\n\nYou can monitor the progress of the resilvering operation by using the 'zpool\nstatus -x' command:\n\n\n# zpool status -x\n  pool: test\n state: DEGRADED\nstatus: One or more devices is currently being replaced.  
The pool may not be\n	providing the necessary level of replication.\naction: Wait for the resilvering operation to complete\n scrub: resilver in progress, 0.14% done, 0h0m to go\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  ONLINE       0     0     0\n          mirror              ONLINE       0     0     0\n            replacing         ONLINE       0     0     0\n              c0t0d0          ONLINE       0     0     3\n              c0t0d2          ONLINE       0     0     0  58.5K resilvered\n            c0t0d1            ONLINE       0     0     0\n\n	"
+#
+# code: ZFS-8000-A5
+# keys: ereport.fs.zfs.device.version_mismatch
+#
+msgid "ZFS-8000-A5.type"
+msgstr "Error"
+msgid "ZFS-8000-A5.severity"
+msgstr "Major"
+msgid "ZFS-8000-A5.description"
+msgstr "The on-disk version is not compatible with the running\n	    system.  Refer to %s for more information."
+msgid "ZFS-8000-A5.response"
+msgstr "No automated response will occur."
+msgid "ZFS-8000-A5.impact"
+msgstr "The pool is unavailable"
+msgid "ZFS-8000-A5.action"
+msgstr "\nIf this error is seen during 'zpool import', see the section below.  Otherwise,\nrun 'zpool status -x' to determine which pool is faulted:\n\n\n# zpool status -x\n  pool: test\n state: FAULTED\nstatus: The ZFS version for the pool is incompatible with the software running\n        on this system.\naction: Destroy and re-create the pool.\n scrub: none requested\nconfig:\n\n        NAME                  STATE     READ WRITE CKSUM\n        test                  FAULTED      0     0     0  incompatible version\n          mirror              ONLINE       0     0     0\n            c0t0d0            ONLINE       0     0     0\n            c0t0d1            ONLINE       0     0     0\n\n\nThe pool cannot be used on this system.  Either move the disks to the system\nwhere they were originally created, or destroy the pool and re-create it from\nbackup.\n\n\nIf this error is seen during import, the pool cannot be imported on the current\nsystem.  The disks must be attached to the system which originally created the\npool, and imported there.\n	"
diff --git a/usr/src/cmd/fs.d/Makefile b/usr/src/cmd/fs.d/Makefile
index 38039de5ab51..b0653b513851 100644
--- a/usr/src/cmd/fs.d/Makefile
+++ b/usr/src/cmd/fs.d/Makefile
@@ -46,7 +46,8 @@ DEFAULTFILES=	fs.dfl
 include ../Makefile.cmd
 
 SUBDIR1= lofs
-SUBDIR2= fd pcfs nfs hsfs proc ctfs udfs ufs tmpfs cachefs autofs mntfs objfs
+SUBDIR2= zfs fd pcfs nfs hsfs proc ctfs udfs ufs tmpfs cachefs autofs mntfs \
+		objfs
 i386_SUBDIRS= xmemfs
 i386_I18NDIRS= xmemfs
 SUBDIRS= $(SUBDIR1) $(SUBDIR2) $($(MACH)_SUBDIRS)
diff --git a/usr/src/cmd/fs.d/df.c b/usr/src/cmd/fs.d/df.c
index 0d55abfcf5ac..6852626a68f2 100644
--- a/usr/src/cmd/fs.d/df.c
+++ b/usr/src/cmd/fs.d/df.c
@@ -24,13 +24,14 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+#include <dlfcn.h>
 #include <stdio.h>
 #include <stdarg.h>
 #include <string.h>
@@ -51,6 +52,7 @@
 #include <sys/mkdev.h>
 #include <sys/int_limits.h>
 #include <sys/zone.h>
+#include <libzfs.h>
 
 #include "fslib.h"
 
@@ -171,6 +173,7 @@ struct df_request {
 
 #define	DFR_MOUNT_POINT(dfrp)	(dfrp)->dfr_mte->mte_mount->mnt_mountp
 #define	DFR_SPECIAL(dfrp)	(dfrp)->dfr_mte->mte_mount->mnt_special
+#define	DFR_FSTYPE(dfrp)	(dfrp)->dfr_mte->mte_mount->mnt_fstype
 #define	DFR_ISMOUNTEDFS(dfrp)	((dfrp)->dfr_mte != NULL)
 
 #define	DFRP(p)			((struct df_request *)(p))
@@ -236,9 +239,23 @@ static void parse_options(int, char **);
 static char *basename(char *);
 
 
+/* ARGSUSED */
+static void
+dummy_error_handler(const char *fmt, va_list ap)
+{
+	/* Deliberately empty: the message passed in is simply discarded. */
+}
+
+static zfs_handle_t *(*_zfs_open)(const char *, int);
+static void (*_zfs_close)(zfs_handle_t *);
+static uint64_t (*_zfs_prop_get_int)(zfs_handle_t *, zfs_prop_t);
+static void (*_zfs_set_error_handler)(void (*)(const char *, va_list));
+
 void
 main(int argc, char *argv[])
 {
+	void *hdl;
+
 	(void) setlocale(LC_ALL, "");
 
 #if !defined(TEXT_DOMAIN)		/* Should be defined by cc -D */
@@ -252,6 +269,32 @@ main(int argc, char *argv[])
 	sysv3_set = getenv("SYSV3");
 #endif	/* _iBCS2 */
 
+	/*
+	 * Dynamically check for libzfs, in case the user hasn't installed the
+	 * SUNWzfs packages.  A basic utility such as df shouldn't depend on
+	 * optional filesystems.
+	 */
+	if ((hdl = dlopen("libzfs.so", RTLD_LAZY)) != NULL) {
+		_zfs_set_error_handler = (void (*)())
+		    dlsym(hdl, "zfs_set_error_handler");
+		_zfs_open = (zfs_handle_t *(*)())dlsym(hdl, "zfs_open");
+		_zfs_close = (void (*)())dlsym(hdl, "zfs_close");
+		_zfs_prop_get_int = (uint64_t (*)())
+		    dlsym(hdl, "zfs_prop_get_int");
+
+		if (_zfs_set_error_handler != NULL) {
+			assert(_zfs_open != NULL);
+			assert(_zfs_close != NULL);
+			assert(_zfs_prop_get_int != NULL);
+
+			/*
+			 * Disable ZFS error reporting, so we don't get messages
+			 * like "can't open ..." under race conditions.
+			 */
+			_zfs_set_error_handler(dummy_error_handler);
+		}
+	}
+
 	if (EQ(program_name, DEVNM_CMD))
 		do_devnm(argc, argv);
 
@@ -1169,6 +1212,68 @@ number_to_scaled_string(
 	return (buf);
 }
 
+/*
+ * statvfs() reports only total and free blocks, and 'used = total - free'
+ * does not hold for ZFS pooled storage; the ZFS values make 'used' and
+ * 'available' correct, but not 'total'.  For a ZFS mount with libzfs loaded,
+ * set *total to the real size in 'blocksize' units: the nearest nonzero quota
+ * walking up from the dataset, else used + available at the top level.  If
+ * strdup() or the open fails, or the mount is not ZFS, *total is unchanged.
+ */
+static void
+adjust_total_blocks(struct df_request *dfrp, fsblkcnt64_t *total,
+    uint64_t blocksize)
+{
+	zfs_handle_t	*zhp;
+	char *dataset, *slash;
+	uint64_t quota;
+
+	if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 ||
+	    _zfs_open == NULL)
+		return;
+
+	/*
+	 * We want to get the total size for this filesystem as bounded by any
+	 * quotas. In order to do this, we start at the current filesystem and
+	 * work upwards until we find a dataset with a quota.  If we reach the
+	 * pool itself, then the total space is the amount used plus the amount
+	 * available.
+	 */
+	if ((dataset = strdup(DFR_SPECIAL(dfrp))) == NULL)
+		return;
+
+	/* Start at the NUL so the first pass opens the full dataset name. */
+	slash = dataset + strlen(dataset);
+	do {
+		*slash = '\0';
+
+		if ((zhp = _zfs_open(dataset, ZFS_TYPE_ANY)) == NULL) {
+			free(dataset);
+			return;
+		}
+
+		if ((quota = _zfs_prop_get_int(zhp, ZFS_PROP_QUOTA)) != 0) {
+			*total = quota / blocksize;
+			_zfs_close(zhp);
+			free(dataset);
+			return;
+		}
+
+		_zfs_close(zhp);
+
+	} while ((slash = strrchr(dataset, '/')) != NULL);
+
+	/* No quota found: report the top-level dataset's used + available. */
+
+	if ((zhp = _zfs_open(dataset, ZFS_TYPE_ANY)) == NULL) {
+		free(dataset);
+		return;
+	}
+
+	*total = (_zfs_prop_get_int(zhp, ZFS_PROP_USED) +
+	    _zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE)) / blocksize;
+
+	_zfs_close(zhp);
+	free(dataset);
+}
 
 /*
  * The output will appear properly columnized regardless of the names of
@@ -1178,6 +1283,7 @@ static void
 g_output(struct df_request *dfrp, struct statvfs64 *fsp)
 {
 	fsblkcnt64_t	available_blocks	= fsp->f_bavail;
+	fsblkcnt64_t	total_blocks = fsp->f_blocks;
 	numbuf_t	total_blocks_buf;
 	numbuf_t	total_files_buf;
 	numbuf_t	free_blocks_buf;
@@ -1258,9 +1364,11 @@ g_output(struct df_request *dfrp, struct statvfs64 *fsp)
 	if ((long long)available_blocks < (long long)0)
 		available_blocks = (fsblkcnt64_t)0;
 
+	adjust_total_blocks(dfrp, &total_blocks, fsp->f_frsize);
+
 	(void) printf("%*s %-*s %*s %-*s %*s %-*s %*s %-*s\n",
 		NCOL1_WIDTH, number_to_string(total_blocks_buf,
-					fsp->f_blocks, fsp->f_frsize, 512),
+					total_blocks, fsp->f_frsize, 512),
 			SCOL1_WIDTH, total_blocks_str,
 		NCOL2_WIDTH, number_to_string(free_blocks_buf,
 					fsp->f_bfree, fsp->f_frsize, 512),
@@ -1346,6 +1454,8 @@ k_output(struct df_request *dfrp, struct statvfs64 *fsp)
 		file_system = "";
 	}
 
+	adjust_total_blocks(dfrp, &total_blocks, fsp->f_frsize);
+
 	if (use_scaling) { /* comes from the -h option */
 	(void) printf("%-*s %*s %*s %*s %-*s %-s\n",
 		FILESYSTEM_WIDTH, file_system,
@@ -1428,6 +1538,7 @@ strings_init()
 static void
 t_output(struct df_request *dfrp, struct statvfs64 *fsp)
 {
+	fsblkcnt64_t	total_blocks = fsp->f_blocks;
 	numbuf_t	total_blocks_buf;
 	numbuf_t	total_files_buf;
 	numbuf_t	free_blocks_buf;
@@ -1435,6 +1546,8 @@ t_output(struct df_request *dfrp, struct statvfs64 *fsp)
 
 	STRINGS_INIT();
 
+	adjust_total_blocks(dfrp, &total_blocks, fsp->f_frsize);
+
 	(void) printf("%-*s(%-*s): %*s %s %*s %s\n",
 		MOUNT_POINT_WIDTH, DFR_MOUNT_POINT(dfrp),
 		SPECIAL_DEVICE_WIDTH, DFR_SPECIAL(dfrp),
@@ -1456,7 +1569,7 @@ t_output(struct df_request *dfrp, struct statvfs64 *fsp)
 	(void) printf("%*s: %*s %s %*s %s\n",
 		MNT_SPEC_WIDTH, total_str,
 		BLOCK_WIDTH, number_to_string(total_blocks_buf,
-				fsp->f_blocks, fsp->f_frsize, 512),
+				total_blocks, fsp->f_frsize, 512),
 		blocks_str,
 		NFILES_WIDTH, number_to_string(total_files_buf,
 				fsp->f_files, 1, 1),
diff --git a/usr/src/cmd/fs.d/nfs/svc/nfs-server b/usr/src/cmd/fs.d/nfs/svc/nfs-server
index 36cf2cb3ad5c..dc3339e3d147 100644
--- a/usr/src/cmd/fs.d/nfs/svc/nfs-server
+++ b/usr/src/cmd/fs.d/nfs/svc/nfs-server
@@ -42,8 +42,7 @@ case "$1" in
 	fi
 
 	# If /etc/dfs/dfstab exists and has non-blank or non-commented-out
-	# lines, then run shareall to export them, and then start up mountd
-	# and nfsd if anything is exported.
+	# lines, then run shareall to export them.
 
 	startnfsd=0
 	if [ -f /etc/dfs/dfstab ] && /usr/bin/egrep -v '^[	 ]*(#|$)' \
@@ -52,6 +51,14 @@ case "$1" in
 		/usr/sbin/shareall -F nfs
 	fi
 
+	# Share any ZFS filesystems marked for sharing.
+
+	if [ -x /usr/sbin/zfs ]; then
+		/usr/sbin/zfs share -a
+	fi
+
+	# Start up mountd and nfsd if anything is exported.
+
 	if /usr/bin/grep -s nfs /etc/dfs/sharetab >/dev/null; then
 		startnfsd=1
 	fi
@@ -88,6 +95,14 @@ case "$1" in
 'stop')
 	/usr/bin/pkill -x -u 0,1 -z $zone '(nfsd|mountd)'
 
+	# Unshare shared ZFS filesystems.
+
+	if [ -x /usr/sbin/zfs ]; then
+		/usr/sbin/zfs unshare -a
+	fi
+
+	# Unshare remaining shared filesystems.
+
 	if /usr/bin/grep -s nfs /etc/dfs/sharetab >/dev/null; then
 		/usr/sbin/unshareall -F nfs
 	fi
diff --git a/usr/src/cmd/fs.d/zfs/Makefile b/usr/src/cmd/fs.d/zfs/Makefile
new file mode 100644
index 000000000000..39187b544dc0
--- /dev/null
+++ b/usr/src/cmd/fs.d/zfs/Makefile
@@ -0,0 +1,58 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+SUBDIRS=	fstyp
+
+all:=		TARGET= all
+install:=	TARGET= install
+clean:=		TARGET= clean
+clobber:=	TARGET= clobber
+lint:=		TARGET= lint
+catalog:=       TARGET= catalog
+
+# for messaging catalog
+#
+POFILE= zfs.po
+POFILES= $(SUBDIRS:%=%/%.po)
+
+.KEEP_STATE:
+
+.PARALLEL:	$(SUBDIRS)
+
+all install clean clobber lint: $(SUBDIRS)
+
+catalog:        $(POFILE)
+
+$(POFILE):      $(SUBDIRS)
+	$(RM) 	$@
+	cat     $(POFILES)      > $@
+
+$(SUBDIRS):	FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/cmd/fs.d/zfs/fstyp/Makefile b/usr/src/cmd/fs.d/zfs/fstyp/Makefile
new file mode 100644
index 000000000000..8bad1c0f15d1
--- /dev/null
+++ b/usr/src/cmd/fs.d/zfs/fstyp/Makefile
@@ -0,0 +1,41 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+FSTYPE=		zfs
+LIBPROG=	fstyp
+
+include		../../Makefile.fstype
+
+POFILE=		fstyp.po
+
+catalog:	$(POFILE)
+
+LDLIBS += -lzfs -lnvpair
+
+lint :=		PROG = $(LIBPROG)
+lint:		lint_PROG
diff --git a/usr/src/cmd/fs.d/zfs/fstyp/fstyp.c b/usr/src/cmd/fs.d/zfs/fstyp/fstyp.c
new file mode 100644
index 000000000000..90de2fe5a692
--- /dev/null
+++ b/usr/src/cmd/fs.d/zfs/fstyp/fstyp.c
@@ -0,0 +1,178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <libintl.h>
+#include <locale.h>
+#include <string.h>
+#include <libzfs.h>
+#include <errno.h>
+
+/*
+ * Print the command synopsis on stderr and exit with status 1.
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr, gettext("Usage: fstyp [-v] <device>\n"));
+	exit(1);
+}
+
+/*
+ * Print every pair in 'list' on stdout, one per line, each line preceded by
+ * 'indent' spaces.  Strings are printed as name='value' and uint64s as
+ * name=value.  A nested nvlist is printed as its name followed by its own
+ * pairs indented 4 further; each member of an nvlist array is printed as
+ * name[index] followed by its pairs indented 8 further.  Any other pair type
+ * is reported with an unindented "bad config type" line.
+ */
+static void
+dump_nvlist(nvlist_t *list, int indent)
+{
+	nvpair_t *elem = NULL;
+
+	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_STRING:
+			{
+				char *value;
+
+				verify(nvpair_value_string(elem, &value) == 0);
+				(void) printf("%*s%s='%s'\n", indent, "",
+				    nvpair_name(elem), value);
+			}
+			break;
+
+		case DATA_TYPE_UINT64:
+			{
+				uint64_t value;
+
+				verify(nvpair_value_uint64(elem, &value) == 0);
+				(void) printf("%*s%s=%llu\n", indent, "",
+				    nvpair_name(elem), (u_longlong_t)value);
+			}
+			break;
+
+		case DATA_TYPE_NVLIST:
+			{
+				nvlist_t *value;
+
+				verify(nvpair_value_nvlist(elem, &value) == 0);
+				(void) printf("%*s%s\n", indent, "",
+				    nvpair_name(elem));
+				dump_nvlist(value, indent + 4);
+			}
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY:
+			{
+				nvlist_t **value;
+				uint_t c, count;
+
+				verify(nvpair_value_nvlist_array(elem, &value,
+				    &count) == 0);
+
+				for (c = 0; c < count; c++) {
+					(void) printf("%*s%s[%u]\n", indent, "",
+					    nvpair_name(elem), c);
+					dump_nvlist(value[c], indent + 8);
+				}
+			}
+			break;
+
+		default:
+
+			(void) printf("bad config type %d for %s\n",
+			    nvpair_type(elem), nvpair_name(elem));
+		}
+	}
+}
+
+/*
+ * fstyp [-v] <device>
+ *
+ * Open <device> read-only and try to read a label from it with
+ * zpool_read_label().  If a label is found, print "zfs" (followed by the
+ * label's nvlist when -v is given) and exit 0.  Exit 1 if the device cannot
+ * be opened or no label is found.
+ */
+int
+main(int argc, char **argv)
+{
+	int c, fd;
+	int verbose = 0;
+	nvlist_t *config;
+
+	(void) setlocale(LC_ALL, "");
+
+#if !defined(TEXT_DOMAIN)
+#define	TEXT_DOMAIN "SYS_TEST"
+#endif
+	(void) textdomain(TEXT_DOMAIN);
+
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = 1;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	if (argc != 1)
+		usage();
+
+	if ((fd = open64(argv[0], O_RDONLY)) < 0) {
+		perror("open64");
+		return (1);
+	}
+
+	if ((config = zpool_read_label(fd)) == NULL) {
+		(void) close(fd);
+		return (1);
+	}
+
+	(void) printf("zfs\n");
+
+	if (verbose)
+		dump_nvlist(config, 4);
+
+	nvlist_free(config);
+	(void) close(fd);
+
+	return (0);
+}
diff --git a/usr/src/cmd/getfacl/getfacl.c b/usr/src/cmd/getfacl/getfacl.c
index 2c05291fbcae..b2e56fdd3a49 100644
--- a/usr/src/cmd/getfacl/getfacl.c
+++ b/usr/src/cmd/getfacl/getfacl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -112,6 +112,14 @@ main(int argc, char *argv[])
 		/* Get ACL info of the files */
 		errno = 0;
 		if ((aclcnt = acl(filep, GETACLCNT, 0, NULL)) < 0) {
+			if (errno == ENOSYS) {
+				(void) fprintf(stderr,
+				    gettext("File system doesn't support "
+				    "aclent_t style ACL's.\n"
+				    "See acl(5) for more information on "
+				    "Solaris ACL support.\n"));
+				exit(2);
+			}
 			perror(filep);
 			exit(2);
 		}
@@ -301,7 +309,7 @@ pruname(uid_t uid)
 	static char	uidp[10];	/* big enough */
 
 	passwdp = getpwuid(uid);
-	if (passwdp == (struct passwd *) NULL) {
+	if (passwdp == (struct passwd *)NULL) {
 		/* could not get passwd information: display uid instead */
 		(void) sprintf(uidp, "%ld", (long)uid);
 		return (uidp);
@@ -316,7 +324,7 @@ prgname(gid_t gid)
 	static char	gidp[10];	/* big enough */
 
 	groupp = getgrgid(gid);
-	if (groupp == (struct group *) NULL) {
+	if (groupp == (struct group *)NULL) {
 		/* could not get group information: display gid instead */
 		(void) sprintf(gidp, "%ld", (long)gid);
 		return (gidp);
diff --git a/usr/src/cmd/ls/Makefile.com b/usr/src/cmd/ls/Makefile.com
index e91d7aaf745a..6d695f3cd504 100644
--- a/usr/src/cmd/ls/Makefile.com
+++ b/usr/src/cmd/ls/Makefile.com
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -36,6 +36,7 @@ SRCS=           $(OBJS:%.o=../%.c)
 
 include ../../Makefile.cmd
 
+LDLIBS += -lsec
 CFLAGS	+=	$(CCVERBOSE)
 $(XPG4) := CFLAGS += -DXPG4
 
diff --git a/usr/src/cmd/ls/ls.c b/usr/src/cmd/ls/ls.c
index e27311335ac1..540be3b04627 100644
--- a/usr/src/cmd/ls/ls.c
+++ b/usr/src/cmd/ls/ls.c
@@ -61,7 +61,7 @@
 #include <unistd.h>
 #include <libgen.h>
 #include <errno.h>
-#include <libcmdutils.h>
+#include <aclutils.h>
 
 #ifndef STANDALONE
 #define	TERMINFO
@@ -139,6 +139,7 @@ struct	lbuf	{
 	char 	acl;		/* indicate there are additional acl entries */
 	int	cycle;		/* cycle detected flag */
 	struct ditem *ancinfo;	/* maintains ancestor info */
+	acl_t *aclp;		/* ACL if present */
 };
 
 struct dchain {
@@ -219,6 +220,7 @@ static int		Hflg;
 static int		Lflg;
 static int		Rflg;
 static int		Sflg;
+static int		vflg;
 static long		hscale;
 static mode_t		flags;
 static int		err = 0;	/* Contains return code */
@@ -284,9 +286,8 @@ main(int argc, char *argv[])
 		mflg = 0;
 	}
 
-
 	while ((c = getopt(argc, argv,
-	    "aAbcCdeEfFghHilLmnopqrRsStux1@")) != EOF)
+	    "aAbcCdeEfFghHilLmnopqrRsStux1@v")) != EOF)
 		switch (c) {
 		case 'a':
 			aflg++;
@@ -415,6 +416,18 @@ main(int argc, char *argv[])
 			cflg = 0;
 			uflg++;
 			continue;
+		case 'v':
+			vflg++;
+#if !defined(XPG4)
+			if (lflg)
+				continue;
+#endif
+			lflg++;
+			statreq++;
+			Cflg = 0;
+			xflg = 0;
+			mflg = 0;
+			continue;
 		case 'x':
 			xflg = 1;
 			Cflg = 1;
@@ -447,7 +460,7 @@ main(int argc, char *argv[])
 		}
 	if (opterr) {
 		(void) fprintf(stderr, gettext(
-		    "usage: ls -aAbcCdeEfFghHilLmnopqrRsStux1@ [files]\n"));
+		    "usage: ls -aAbcCdeEfFghHilLmnopqrRsStuxv1@ [files]\n"));
 		exit(2);
 	}
 
@@ -851,6 +864,13 @@ pentry(struct lbuf *ap)
 			curcol += strcol((unsigned char *)dmark);
 		}
 	}
+
+	if (vflg) {
+		new_line();
+		if (p->aclp) {
+			acl_printacl(p->aclp, num_cols);
+		}
+	}
 }
 
 /* print various r,w,x permissions */
@@ -1100,7 +1120,7 @@ gstat(char *file, int argfl, struct ditem *myparent)
 	ssize_t cc;
 	int (*statf)() = ((Lflg) || (Hflg && argfl)) ? stat : lstat;
 	int aclcnt;
-	aclent_t *aclp;
+	int error;
 	aclent_t *tp;
 	o_mode_t groupperm, mask;
 	int grouppermfound, maskfound;
@@ -1285,75 +1305,77 @@ gstat(char *file, int argfl, struct ditem *myparent)
 
 		/* ACL: check acl entries count */
 		if (doacl) {
-			rep->acl = ' ';
-			if ((aclcnt = acl(file, GETACLCNT, 0, NULL)) >
-			    MIN_ACL_ENTRIES) {
 
-				/* this file has a non-trivial acl */
+			error = acl_get(file, 0, &rep->aclp);
+			if (error) {
+				(void) fprintf(stderr,
+				    gettext("ls: can't read ACL on %s: %s\n"),
+				    file, acl_strerror(error));
+				return (NULL);
+			}
 
-				rep->acl = '+';
+			rep->acl = ' ';
 
+			if (rep->aclp &&
+			    ((acl_flags(rep->aclp) & ACL_IS_TRIVIAL) == 0)) {
+				rep->acl = '+';
 				/*
-				 * For files with non-trivial acls, the
-				 * effective group permissions are the
-				 * intersection of the GROUP_OBJ value and
-				 * the CLASS_OBJ (acl mask) value. Determine
-				 * both the GROUP_OBJ and CLASS_OBJ for this
-				 * file and insert the logical AND of those
-				 * two values in the group permissions field
-				 * of the lflags value for this file.
+				 * Special handling for ufs aka aclent_t ACL's
 				 */
+				if (rep->aclp &&
+				    acl_type(rep->aclp) == ACLENT_T) {
+					/*
+					 * For files with non-trivial acls, the
+					 * effective group permissions are the
+					 * intersection of the GROUP_OBJ value
+					 * and the CLASS_OBJ (acl mask) value.
+					 * Determine both the GROUP_OBJ and
+					 * CLASS_OBJ for this file and insert
+					 * the logical AND of those two values
+					 * in the group permissions field
+					 * of the lflags value for this file.
+					 */
 
-				if ((aclp = (aclent_t *)malloc(
-				    (sizeof (aclent_t)) * aclcnt)) == NULL) {
-					perror("ls");
-					exit(2);
-				}
-
-				if (acl(file, GETACL, aclcnt, aclp) < 0) {
-					free(aclp);
-					(void) fprintf(stderr, "ls: ");
-					perror(file);
-					nfiles--;
-					err = 2;
-					return (NULL);
-				}
-
-				/*
-				 * Until found in acl list, assume maximum
-				 * permissions for both group and mask.  (Just
-				 * in case the acl lacks either value for
-				 * some reason.)
-				 */
-				groupperm = 07;
-				mask = 07;
-				grouppermfound = 0;
-				maskfound = 0;
-				for (tp = aclp; aclcnt--; tp++) {
-					if (tp->a_type == GROUP_OBJ) {
-						groupperm = tp->a_perm;
-						grouppermfound = 1;
-						continue;
-					}
-					if (tp->a_type == CLASS_OBJ) {
-						mask = tp->a_perm;
-						maskfound = 1;
+					/*
+					 * Until found in acl list, assume
+					 * maximum permissions for both group
+					 * and mask.  (Just in case the acl
+					 * lacks either value for some reason.)
+					 */
+					groupperm = 07;
+					mask = 07;
+					grouppermfound = 0;
+					maskfound = 0;
+					aclcnt = acl_cnt(rep->aclp);
+					for (tp =
+					    (aclent_t *)acl_data(rep->aclp);
+					    aclcnt--; tp++) {
+						if (tp->a_type == GROUP_OBJ) {
+							groupperm = tp->a_perm;
+							grouppermfound = 1;
+							continue;
+						}
+						if (tp->a_type == CLASS_OBJ) {
+							mask = tp->a_perm;
+							maskfound = 1;
+						}
+						if (grouppermfound && maskfound)
+							break;
 					}
-					if (grouppermfound && maskfound)
-						break;
-				}
 
-				free(aclp);
 
-				/* reset all the group bits */
-				rep->lflags &= ~S_IRWXG;
+					/* reset all the group bits */
+					rep->lflags &= ~S_IRWXG;
 
-				/*
-				 * Now set them to the logical AND of the
-				 * GROUP_OBJ permissions and the acl mask.
-				 */
+					/*
+					 * Now set them to the logical AND of
+					 * the GROUP_OBJ permissions and the
+					 * acl mask.
+					 */
 
-				rep->lflags |= (groupperm & mask) << 3;
+					rep->lflags |= (groupperm & mask) << 3;
+
+				}
 			}
 
 			if (atflg && pathconf(file, _PC_XATTR_EXISTS) == 1)
diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common
index c28b9c84fbd8..7befb7e3a208 100644
--- a/usr/src/cmd/mdb/Makefile.common
+++ b/usr/src/cmd/mdb/Makefile.common
@@ -30,11 +30,13 @@
 # 
 COMMON_MODULES_PROC = \
 	dof \
+	libavl \
 	libc \
 	libnvpair \
 	libsysevent \
 	libumem \
 	libuutil \
+	libzpool \
 	mdb_ds \
 	mdb_test
 
@@ -72,4 +74,5 @@ COMMON_MODULES_KVM = \
 	sppp \
 	ufs \
 	ufs_log \
-	usba
+	usba \
+	zfs
diff --git a/usr/src/cmd/mdb/common/modules/genunix/avl.c b/usr/src/cmd/mdb/common/modules/genunix/avl.c
new file mode 100644
index 000000000000..b10856cfc3cd
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/genunix/avl.c
@@ -0,0 +1,217 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/avl.h>
+
+#include <mdb/mdb_modapi.h>
+
+struct aw_info {
+	void *aw_buff;		/* local copy of the element being visited */
+	avl_tree_t aw_tree;	/* local copy of the avl_tree_t being walked */
+};
+
+/*
+ * Follow left children from the avl_node_t at 'addr' and return the address
+ * of the leftmost element (its node address less 'offset'), or -1 on failure.
+ */
+static uintptr_t
+avl_leftmostchild(uintptr_t addr, void * buff, size_t offset, size_t size)
+{
+	avl_node_t *node = (avl_node_t *)((uintptr_t)buff + offset);
+
+	for (;;) {
+		addr -= offset;
+		if (mdb_vread(buff, size, addr) == -1) {
+			mdb_warn("read of avl_node_t failed: %p", addr);
+			return ((uintptr_t)-1L);
+		}
+		if (node->avl_child[0] == NULL)
+			break;
+		addr = (uintptr_t)node->avl_child[0];
+	}
+	return (addr);
+}
+
+/*
+ * Initialize a forward walk of the avl_tree_t at wsp->walk_addr.
+ */
+int
+avl_walk_init(mdb_walk_state_t *wsp)
+{
+	struct aw_info *aw;
+	avl_tree_t *tree;
+	uintptr_t addr;
+
+	/*
+	 * allocate the AVL walk data
+	 */
+	wsp->walk_data = aw = mdb_zalloc(sizeof (struct aw_info), UM_SLEEP);
+
+	/*
+	 * get an mdb copy of the avl_tree_t being walked
+	 */
+	tree = &aw->aw_tree;
+	if (mdb_vread(tree, sizeof (avl_tree_t), wsp->walk_addr) == -1) {
+		mdb_warn("read of avl_tree_t failed: %p", wsp->walk_addr);
+		goto error;
+	}
+	if (tree->avl_size < tree->avl_offset + sizeof (avl_node_t)) {
+		mdb_warn("invalid avl_tree_t at %p, avl_size:%d, avl_offset:%d",
+		    wsp->walk_addr, tree->avl_size, tree->avl_offset);
+		goto error;
+	}
+
+	/*
+	 * allocate a buffer to hold the mdb copy of tree's structs
+	 * (avl_size bytes; the error path must free it with that same size)
+	 */
+	aw->aw_buff = mdb_zalloc(tree->avl_size, UM_SLEEP);
+
+	/*
+	 * get the first avl_node_t address, use same algorithm
+	 * as avl_start() -- leftmost child in tree from root
+	 */
+	addr = (uintptr_t)tree->avl_root;
+	if (addr == NULL) {
+		wsp->walk_addr = NULL;
+		return (WALK_NEXT);
+	}
+	addr = avl_leftmostchild(addr, aw->aw_buff, tree->avl_offset,
+	    tree->avl_size);
+	if (addr == (uintptr_t)-1L)
+		goto error;
+
+	wsp->walk_addr = addr;
+	return (WALK_NEXT);
+
+error:
+	if (aw->aw_buff != NULL)
+		mdb_free(aw->aw_buff, tree->avl_size);
+	mdb_free(aw, sizeof (struct aw_info));
+	return (WALK_ERR);
+}
+
+/*
+ * At each step, visit (callback) the current node, then move to the next
+ * in the AVL tree.  Uses the same algorithm as avl_walk().
+ */
+int
+avl_walk_step(mdb_walk_state_t *wsp)
+{
+	struct aw_info *aw;
+	size_t offset;
+	size_t size;
+	uintptr_t addr;
+	avl_node_t *node;
+	int status;
+	int was_child;
+
+	/*
+	 * a NULL walk_addr means the walk has run off the end of the tree
+	 */
+	addr = wsp->walk_addr;
+	if (addr == NULL)
+		return (WALK_DONE);
+
+	aw = (struct aw_info *)wsp->walk_data;
+	size = aw->aw_tree.avl_size;
+	offset = aw->aw_tree.avl_offset;
+	node = (avl_node_t *)((uintptr_t)aw->aw_buff + offset);
+
+	/*
+	 * must read the current node for the call back to use
+	 */
+	if (mdb_vread(aw->aw_buff, size, addr) == -1) {
+		mdb_warn("read of avl_node_t failed: %p", addr);
+		return (WALK_ERR);
+	}
+
+	/*
+	 * do the call back
+	 */
+	status = wsp->walk_callback(addr, aw->aw_buff, wsp->walk_cbdata);
+	if (status != WALK_NEXT)
+		return (status);
+
+	/*
+	 * move to the next node....
+	 * every node is read into the same buffer, so "node" never changes
+	 */
+
+	/*
+	 * if the node has a right child then go to it and then all the way
+	 * through as many left children as possible
+	 */
+	addr = (uintptr_t)node->avl_child[1];
+	if (addr != NULL) {
+		addr = avl_leftmostchild(addr, aw->aw_buff, offset, size);
+		if (addr == (uintptr_t)-1L)
+			return (WALK_ERR);
+
+	/*
+	 * otherwise return to parent nodes, stopping if we ever return from
+	 * a left child
+	 */
+	} else {
+		for (;;) {
+			was_child = AVL_XCHILD(node);
+			addr = (uintptr_t)AVL_XPARENT(node);
+			if (addr == NULL)
+				break;
+			addr -= offset;
+			if (was_child == 0) /* stop on return from left child */
+				break;
+			if (mdb_vread(aw->aw_buff, size, addr) == -1) {
+				mdb_warn("read of avl_node_t failed: %p", addr);
+				return (WALK_ERR);
+			}
+		}
+	}
+
+	wsp->walk_addr = addr;
+	return (WALK_NEXT);
+}
+
+/*
+ * Free the element buffer and the walk state stored in wsp->walk_data.
+ */
+void
+avl_walk_fini(mdb_walk_state_t *wsp)
+{
+	struct aw_info *aw;
+
+	aw = (struct aw_info *)wsp->walk_data;
+
+	if (aw == NULL)
+		return;
+
+	if (aw->aw_buff != NULL)
+		mdb_free(aw->aw_buff, aw->aw_tree.avl_size);
+
+	mdb_free(aw, sizeof (struct aw_info));
+}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/avl.h b/usr/src/cmd/mdb/common/modules/genunix/avl.h
new file mode 100644
index 000000000000..1d2e9dcb88ea
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/genunix/avl.h
@@ -0,0 +1,48 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_MDB_AVL_H
+#define	_MDB_AVL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	AVL_WALK_NAME	"avl"
+#define	AVL_WALK_DESC	"given any avl_tree_t *, forward walk all " \
+			"entries in tree"
+
+extern int avl_walk_init(mdb_walk_state_t *);
+extern int avl_walk_step(mdb_walk_state_t *);
+extern void avl_walk_fini(mdb_walk_state_t *wsp);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _MDB_AVL_H */
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index 2a3b26ea8c11..5db8641cbe91 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -69,6 +69,7 @@
 #include <regex.h>
 #include <sys/port_impl.h>
 
+#include "avl.h"
 #include "contract.h"
 #include "cpupart_mdb.h"
 #include "devinfo.h"
@@ -1868,193 +1869,6 @@ generic_walk_step(mdb_walk_state_t *wsp)
 	    wsp->walk_cbdata));
 }
 
-struct aw_info {
-	void *aw_buff;		/* buffer to hold the tree's data structure */
-	avl_tree_t aw_tree;	/* copy of avl_tree_t being walked */
-};
-
-/*
- * common code used to find the addr of the the leftmost child below
- * an AVL node
- */
-static uintptr_t
-avl_leftmostchild(uintptr_t addr, void * buff, size_t offset, size_t size)
-{
-	avl_node_t *node = (avl_node_t *)((uintptr_t)buff + offset);
-
-	for (;;) {
-		addr -= offset;
-		if (mdb_vread(buff, size, addr) == -1) {
-			mdb_warn("read of avl_node_t failed: %p", addr);
-			return ((uintptr_t)-1L);
-		}
-		if (node->avl_child[0] == NULL)
-			break;
-		addr = (uintptr_t)node->avl_child[0];
-	}
-	return (addr);
-}
-
-/*
- * initialize a forward walk thru an avl tree.
- */
-int
-avl_walk_init(mdb_walk_state_t *wsp)
-{
-	struct aw_info *aw;
-	avl_tree_t *tree;
-	uintptr_t addr;
-
-	/*
-	 * allocate the AVL walk data
-	 */
-	wsp->walk_data = aw = mdb_zalloc(sizeof (struct aw_info), UM_SLEEP);
-
-	/*
-	 * get an mdb copy of the avl_tree_t being walked
-	 */
-	tree = &aw->aw_tree;
-	if (mdb_vread(tree, sizeof (avl_tree_t), wsp->walk_addr) == -1) {
-		mdb_warn("read of avl_tree_t failed: %p", wsp->walk_addr);
-		goto error;
-	}
-	if (tree->avl_size < tree->avl_offset + sizeof (avl_node_t)) {
-		mdb_warn("invalid avl_tree_t at %p, avl_size:%d, avl_offset:%d",
-		    wsp->walk_addr, tree->avl_size, tree->avl_offset);
-		goto error;
-	}
-
-	/*
-	 * allocate a buffer to hold the mdb copy of tree's structs
-	 * "node" always points at the avl_node_t field inside the struct
-	 */
-	aw->aw_buff = mdb_zalloc(tree->avl_size, UM_SLEEP);
-
-	/*
-	 * get the first avl_node_t address, use same algorithm
-	 * as avl_start() -- leftmost child in tree from root
-	 */
-	addr = (uintptr_t)tree->avl_root;
-	if (addr == NULL) {
-		wsp->walk_addr = NULL;
-		return (WALK_NEXT);
-	}
-	addr = avl_leftmostchild(addr, aw->aw_buff, tree->avl_offset,
-	    tree->avl_size);
-	if (addr == (uintptr_t)-1L)
-		goto error;
-
-	wsp->walk_addr = addr;
-	return (WALK_NEXT);
-
-error:
-	if (aw->aw_buff != NULL)
-		mdb_free(aw->aw_buff, sizeof (tree->avl_size));
-	mdb_free(aw, sizeof (struct aw_info));
-	return (WALK_ERR);
-}
-
-/*
- * At each step, visit (callback) the current node, then move to the next
- * in the AVL tree.  Uses the same algorithm as avl_walk().
- */
-int
-avl_walk_step(mdb_walk_state_t *wsp)
-{
-	struct aw_info *aw;
-	size_t offset;
-	size_t size;
-	uintptr_t addr;
-	avl_node_t *node;
-	int status;
-	int was_child;
-
-	/*
-	 * don't walk past the end of the tree!
-	 */
-	addr = wsp->walk_addr;
-	if (addr == NULL)
-		return (WALK_DONE);
-
-	aw = (struct aw_info *)wsp->walk_data;
-	size = aw->aw_tree.avl_size;
-	offset = aw->aw_tree.avl_offset;
-	node = (avl_node_t *)((uintptr_t)aw->aw_buff + offset);
-
-	/*
-	 * must read the current node for the call back to use
-	 */
-	if (mdb_vread(aw->aw_buff, size, addr) == -1) {
-		mdb_warn("read of avl_node_t failed: %p", addr);
-		return (WALK_ERR);
-	}
-
-	/*
-	 * do the call back
-	 */
-	status = wsp->walk_callback(addr, aw->aw_buff, wsp->walk_cbdata);
-	if (status != WALK_NEXT)
-		return (status);
-
-	/*
-	 * move to the next node....
-	 * note we read in new nodes, so the pointer to the buffer is fixed
-	 */
-
-	/*
-	 * if the node has a right child then go to it and then all the way
-	 * thru as many left children as possible
-	 */
-	addr = (uintptr_t)node->avl_child[1];
-	if (addr != NULL) {
-		addr = avl_leftmostchild(addr, aw->aw_buff, offset, size);
-		if (addr == (uintptr_t)-1L)
-			return (WALK_ERR);
-
-	/*
-	 * othewise return to parent nodes, stopping if we ever return from
-	 * a left child
-	 */
-	} else {
-		for (;;) {
-			was_child = AVL_XCHILD(node);
-			addr = (uintptr_t)AVL_XPARENT(node);
-			if (addr == NULL)
-				break;
-			addr -= offset;
-			if (was_child == 0) /* stop on return from left child */
-				break;
-			if (mdb_vread(aw->aw_buff, size, addr) == -1) {
-				mdb_warn("read of avl_node_t failed: %p", addr);
-				return (WALK_ERR);
-			}
-		}
-	}
-
-	wsp->walk_addr = addr;
-	return (WALK_NEXT);
-}
-
-/*
- * Release the memory allocated for the walk
- */
-void
-avl_walk_fini(mdb_walk_state_t *wsp)
-{
-	struct aw_info *aw;
-
-	aw = (struct aw_info *)wsp->walk_data;
-
-	if (aw == NULL)
-		return;
-
-	if (aw->aw_buff != NULL)
-		mdb_free(aw->aw_buff, aw->aw_tree.avl_size);
-
-	mdb_free(aw, sizeof (struct aw_info));
-}
-
-
 int
 seg_walk_init(mdb_walk_state_t *wsp)
 {
@@ -3575,6 +3389,8 @@ static const mdb_dcmd_t dcmds[] = {
 	/* from nvpair.c */
 	{ NVPAIR_DCMD_NAME, NVPAIR_DCMD_USAGE, NVPAIR_DCMD_DESCR,
 		nvpair_print },
+	{ NVLIST_DCMD_NAME, NVLIST_DCMD_USAGE, NVLIST_DCMD_DESCR,
+		nvlist_print },
 
 	/* from rctl.c */
 	{ "rctl_dict", "?", "print systemwide default rctl definitions",
@@ -3654,8 +3470,6 @@ static const mdb_dcmd_t dcmds[] = {
 static const mdb_walker_t walkers[] = {
 
 	/* from genunix.c */
-	{ "avl", "given any avl_tree_t *, forward walk all entries in tree",
-		avl_walk_init, avl_walk_step, avl_walk_fini },
 	{ "anon", "given an amp, list of anon structures",
 		anon_walk_init, anon_walk_step, anon_walk_fini },
 	{ "cpu", "walk cpu structures", cpu_walk_init, cpu_walk_step },
@@ -3702,6 +3516,10 @@ static const mdb_walker_t walkers[] = {
 	{ "taskq_entry", "given a taskq_t*, list all taskq_ent_t in the list",
 		taskq_walk_init, taskq_walk_step, NULL, NULL },
 
+	/* from avl.c */
+	{ AVL_WALK_NAME, AVL_WALK_DESC,
+		avl_walk_init, avl_walk_step, avl_walk_fini },
+
 	/* from zone.c */
 	{ "zone", "walk a list of kernel zones",
 		zone_walk_init, zone_walk_step, NULL },
@@ -3842,7 +3660,7 @@ static const mdb_walker_t walkers[] = {
 		lgrp_walk_init, lgrp_walk_step, NULL },
 
 	/* from list.c */
-	{ "list", "walk a linked list",
+	{ LIST_WALK_NAME, LIST_WALK_DESC,
 		list_walk_init, list_walk_step, list_walk_fini },
 
 	/* from memory.c */
diff --git a/usr/src/cmd/mdb/common/modules/genunix/list.h b/usr/src/cmd/mdb/common/modules/genunix/list.h
index 04d02da2c75e..10581cc90023 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/list.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/list.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -35,6 +35,9 @@
 extern "C" {
 #endif
 
+#define	LIST_WALK_NAME	"list"
+#define	LIST_WALK_DESC	"walk a linked list"
+
 int list_walk_init(mdb_walk_state_t *wsp);
 int list_walk_step(mdb_walk_state_t *wsp);
 void list_walk_fini(mdb_walk_state_t *wsp);
diff --git a/usr/src/cmd/mdb/common/modules/genunix/nvpair.c b/usr/src/cmd/mdb/common/modules/genunix/nvpair.c
index d9025fa6be75..463f226a2c78 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/nvpair.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/nvpair.c
@@ -91,9 +91,45 @@ nvpair_walk_step(mdb_walk_state_t *wsp)
 	return (status);
 }
 
+/*
+ * ::nvlist [-v]
+ *
+ * Print out an entire nvlist.  This is shorthand for '::walk nvpair |
+ * ::nvpair -rq'.  The '-v' option invokes '::nvpair' without the "-q" option.
+ */
+/*ARGSUSED*/
+int
+nvlist_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	uint_t verbose = FALSE;	/* MDB_OPT_SETBITS stores through a uint_t * */
+	mdb_arg_t v;
+
+	if (!(flags & DCMD_ADDRSPEC))
+		return (DCMD_USAGE);
+
+	if (mdb_getopts(argc, argv,
+	    'v', MDB_OPT_SETBITS, TRUE, &verbose,
+	    NULL) != argc)
+		return (DCMD_USAGE);
+
+	v.a_type = MDB_TYPE_STRING;
+	if (verbose)
+		v.a_un.a_str = "-r";
+	else
+		v.a_un.a_str = "-rq";
+
+	return (mdb_pwalk_dcmd("nvpair", "nvpair", 1, &v, addr));
+}
 
 /*
- * nvpair dcmd
+ * ::nvpair [-rq]
+ *
+ * 	-r	Recursively print any nvlist elements
+ * 	-q	Quiet mode; print members only as "name=value"
+ *
+ * Prints out a single nvpair.  By default, all information is printed.  When
+ * given the '-q' option, the type of elements is hidden, and elements are
+ * instead printed simply as 'name=value'.
  */
 typedef struct {
 	data_type_t	type;
@@ -136,7 +172,6 @@ nvpair_print_value(char *data, int32_t elem_size, int32_t nelem,
 {
 	int32_t i;
 
-	mdb_printf("value=");
 	if (elem_size == 0) {
 		char *p = data;
 
@@ -186,8 +221,16 @@ nvpair_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 	char		*data = NULL, *data_end = NULL;
 	char		*type_name = NULL;
 	data_type_t	type = DATA_TYPE_UNKNOWN;
+	int		quiet = FALSE;
+	int		recurse = FALSE;
+
+	if (!(flags & DCMD_ADDRSPEC))
+		return (DCMD_USAGE);
 
-	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+	if (mdb_getopts(argc, argv,
+	    'r', MDB_OPT_SETBITS, TRUE, &recurse,
+	    'q', MDB_OPT_SETBITS, TRUE, &quiet,
+	    NULL) != argc)
 		return (DCMD_USAGE);
 
 	/* read in the nvpair header so we can get the size */
@@ -218,19 +261,30 @@ nvpair_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 			break;
 		}
 	}
-	/* print out the first line of nvpair info */
-	mdb_printf("name='%s'", NVP_NAME(nvpair));
-	if (type_name != NULL) {
-		mdb_printf(" type=%s", type_name);
+
+	if (quiet) {
+		mdb_printf("%s", NVP_NAME(nvpair));
 	} else {
-		/* if the nvpair type is unknown we print the type number */
-		mdb_printf(" type=0x%x", type);
+		/* print out the first line of nvpair info */
+		mdb_printf("name='%s'", NVP_NAME(nvpair));
+		if (type_name != NULL) {
+			mdb_printf(" type=%s", type_name);
+		} else {
+			/*
+			 * If the nvpair type is unknown we print the type
+			 * number
+			 */
+			mdb_printf(" type=0x%x", type);
+		}
+		mdb_printf(" items=%d\n", nelem);
 	}
-	mdb_printf(" items=%d\n", nelem);
 
 	/* if there is no data and the type is known then we're done */
-	if ((nelem == 0) && (type_name != NULL))
+	if ((nelem == 0) && (type_name != NULL)) {
+		if (quiet)
+			mdb_printf("(unknown)\n");
 		return (DCMD_OK);
+	}
 
 	/* get pointers to the data to print out */
 	data = (char *)NVP_VALUE(nvpair);
@@ -249,20 +303,54 @@ nvpair_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 	 */
 	if (type == DATA_TYPE_NVLIST) {
 		char *p = (char *)addr + (data - (char *)nvpair);
-		mdb_inc_indent(NVPAIR_VALUE_INDENT);
-		mdb_printf("value=%p\n", p);
-		mdb_dec_indent(NVPAIR_VALUE_INDENT);
+		if (recurse) {
+			if (quiet)
+				mdb_printf("\n");
+			mdb_inc_indent(NVPAIR_VALUE_INDENT);
+			if (mdb_pwalk_dcmd("nvpair", "nvpair", argc, argv,
+			    (uintptr_t)p) != DCMD_OK)
+				return (DCMD_ERR);
+			mdb_dec_indent(NVPAIR_VALUE_INDENT);
+		} else {
+			if (!quiet) {
+				mdb_inc_indent(NVPAIR_VALUE_INDENT);
+				mdb_printf("value", p);
+			}
+			mdb_printf("=%p\n", p);
+			if (!quiet)
+				mdb_dec_indent(NVPAIR_VALUE_INDENT);
+		}
 		return (DCMD_OK);
 
 	} else if (type == DATA_TYPE_NVLIST_ARRAY) {
-		mdb_inc_indent(NVPAIR_VALUE_INDENT);
-		mdb_printf("value=");
-		for (i = 0; i < nelem; i++, data += sizeof (nvlist_t *)) {
-			nvlist_t **nl = (nvlist_t **)(void *)data;
-			mdb_printf("%c%p", " "[i == 0], *nl);
+		if (recurse) {
+			for (i = 0; i < nelem; i++,
+			    data += sizeof (nvlist_t *)) {
+				nvlist_t **nl = (nvlist_t **)(void *)data;
+				if (quiet && i != 0)
+					mdb_printf("%s", NVP_NAME(nvpair));
+				mdb_printf("[%d]\n", i);
+				mdb_inc_indent(NVPAIR_VALUE_INDENT);
+				if (mdb_pwalk_dcmd("nvpair", "nvpair", argc,
+				    argv, (uintptr_t)*nl) != DCMD_OK)
+					return (DCMD_ERR);
+				mdb_dec_indent(NVPAIR_VALUE_INDENT);
+			}
+		} else {
+			if (!quiet) {
+				mdb_inc_indent(NVPAIR_VALUE_INDENT);
+				mdb_printf("value");
+			}
+			mdb_printf("=");
+			for (i = 0; i < nelem; i++,
+			    data += sizeof (nvlist_t *)) {
+				nvlist_t **nl = (nvlist_t **)(void *)data;
+				mdb_printf("%c%p", " "[i == 0], *nl);
+			}
+			mdb_printf("\n");
+			if (!quiet)
+				mdb_dec_indent(NVPAIR_VALUE_INDENT);
 		}
-		mdb_printf("\n");
-		mdb_dec_indent(NVPAIR_VALUE_INDENT);
 		return (DCMD_OK);
 	}
 
@@ -298,9 +386,15 @@ nvpair_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 		}
 	}
 
-	mdb_inc_indent(NVPAIR_VALUE_INDENT);
+	if (!quiet) {
+		mdb_inc_indent(NVPAIR_VALUE_INDENT);
+		mdb_printf("value=");
+	} else {
+		mdb_printf("=");
+	}
 	nvpair_print_value(data, elem_size, nelem, type);
-	mdb_dec_indent(NVPAIR_VALUE_INDENT);
+	if (!quiet)
+		mdb_dec_indent(NVPAIR_VALUE_INDENT);
 
 	return (DCMD_OK);
 }
diff --git a/usr/src/cmd/mdb/common/modules/genunix/nvpair.h b/usr/src/cmd/mdb/common/modules/genunix/nvpair.h
index 7f5210ec3e13..071f90116dec 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/nvpair.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/nvpair.h
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef	_NVPAIR_H
@@ -34,9 +34,13 @@ extern "C" {
 #endif
 
 #define	NVPAIR_DCMD_NAME    "nvpair"
-#define	NVPAIR_DCMD_USAGE   ":"
+#define	NVPAIR_DCMD_USAGE   ":[-rq]"
 #define	NVPAIR_DCMD_DESCR   "print out an nvpair"
 
+#define	NVLIST_DCMD_NAME	"nvlist"
+#define	NVLIST_DCMD_USAGE	":[-v]"
+#define	NVLIST_DCMD_DESCR	"print out an nvlist"
+
 #define	NVPAIR_WALKER_NAME  "nvpair"
 #define	NVPAIR_WALKER_DESCR "walk through the nvpairs in an unpacked nvlist"
 
@@ -52,7 +56,9 @@ extern "C" {
 extern int nvpair_walk_init(mdb_walk_state_t *wsp);
 extern int nvpair_walk_step(mdb_walk_state_t *wsp);
 extern int nvpair_print(uintptr_t addr, uint_t flags,
-			int argc, const mdb_arg_t *argv);
+    int argc, const mdb_arg_t *argv);
+extern int nvlist_print(uintptr_t addr, uint_t flags,
+    int argc, const mdb_arg_t *argv);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/cmd/mdb/common/modules/libavl/libavl.c b/usr/src/cmd/mdb/common/modules/libavl/libavl.c
new file mode 100644
index 000000000000..e35fd049a9df
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/libavl/libavl.c
@@ -0,0 +1,47 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <mdb/mdb_modapi.h>
+
+#include "../genunix/avl.h"
+
+static const mdb_walker_t walkers[] = {
+	{ AVL_WALK_NAME, AVL_WALK_DESC,
+		avl_walk_init, avl_walk_step, avl_walk_fini },
+	{ NULL }
+};
+/* Module description: the walker table above and no dcmd table. */
+static const mdb_modinfo_t modinfo = {
+	MDB_API_VERSION, NULL, walkers
+};
+/* Returns the address of this module's description. */
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+	return (&modinfo);
+}
diff --git a/usr/src/cmd/mdb/common/modules/libnvpair/libnvpair.c b/usr/src/cmd/mdb/common/modules/libnvpair/libnvpair.c
index 7a9c87be8aa6..c2462e2d1c64 100644
--- a/usr/src/cmd/mdb/common/modules/libnvpair/libnvpair.c
+++ b/usr/src/cmd/mdb/common/modules/libnvpair/libnvpair.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -33,6 +33,8 @@
 static const mdb_dcmd_t dcmds[] = {
 	{ NVPAIR_DCMD_NAME, NVPAIR_DCMD_USAGE, NVPAIR_DCMD_DESCR,
 		nvpair_print },
+	{ NVLIST_DCMD_NAME, NVLIST_DCMD_USAGE, NVLIST_DCMD_DESCR,
+		nvlist_print },
 	{ NULL }
 };
 
diff --git a/usr/src/cmd/mdb/common/modules/zfs/inc.flg b/usr/src/cmd/mdb/common/modules/zfs/inc.flg
new file mode 100644
index 000000000000..bb65300ccae9
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/zfs/inc.flg
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+find_files "s.*" usr/src/uts/common/fs/zfs/sys
+echo_file usr/src/uts/common/sys/fs/zfs.h
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
new file mode 100644
index 000000000000..d34f71f5df32
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -0,0 +1,1594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <mdb/mdb_ctf.h>
+#include <sys/zfs_context.h>
+#include <sys/mdb_modapi.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/list.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_compress.h>
+
+#ifndef _KERNEL
+#include "../genunix/list.h"
+#endif
+
+#ifdef _KERNEL
+#define	ZFS_OBJ_NAME	"zfs"
+#else
+#define	ZFS_OBJ_NAME	"libzpool.so.1"
+#endif
+
+static char *
+local_strdup(const char *s)
+{
+	size_t nbytes = strlen(s) + 1;	/* string plus terminating NUL */
+	char *copy = mdb_alloc(nbytes, UM_SLEEP);
+	(void) strcpy(copy, s);
+	return (copy);
+}
+
+static int
+getmember(uintptr_t addr, const char *type, mdb_ctf_id_t *idp,
+    const char *member, int len, void *buf)
+{
+	mdb_ctf_id_t id;
+	ulong_t off;
+	char name[64];
+	/* Resolve the type by name unless the caller supplied its CTF id. */
+	if (idp == NULL) {
+		if (mdb_ctf_lookup_by_name(type, &id) == -1) {
+			mdb_warn("couldn't find type %s", type);
+			return (DCMD_ERR);
+		}
+		idp = &id;
+	} else {
+		type = name;
+		mdb_ctf_type_name(*idp, name, sizeof (name));
+	}
+	/* The offset is in bits; only byte-aligned members are handled. */
+	if (mdb_ctf_offsetof(*idp, member, &off) == -1) {
+		mdb_warn("couldn't find member %s of type %s\n", member, type);
+		return (DCMD_ERR);
+	}
+	if (off % 8 != 0) {
+		mdb_warn("member %s of type %s is unsupported bitfield\n",
+		    member, type);
+		return (DCMD_ERR);
+	}
+	off /= 8;
+	/* Copy len bytes of the member from the target into buf. */
+	if (mdb_vread(buf, len, addr + off) == -1) {
+		mdb_warn("failed to read %s from %s at %p",
+		    member, type, addr + off);
+		return (DCMD_ERR);
+	}
+	/* Success: callers test the result for non-zero to detect failure. */
+
+	return (0);
+}
+/* Read sizeof (dest) bytes of member "member" of named type at addr. */
+#define	GETMEMB(addr, type, member, dest) \
+	getmember(addr, #type, NULL, #member, sizeof (dest), &(dest))
+/* As GETMEMB, but the type is given as a pointer to its CTF id. */
+#define	GETMEMBID(addr, ctfid, member, dest) \
+	getmember(addr, NULL, ctfid, #member, sizeof (dest), &(dest))
+
+static int
+getrefcount(uintptr_t addr, mdb_ctf_id_t *id,
+    const char *member, uint64_t *rc)
+{
+	static mdb_ctf_id_t rc_id;	/* CTF id of struct refcount */
+	static int rc_id_valid;		/* set once rc_id has been resolved */
+	ulong_t memb_off;
+
+	if (!rc_id_valid) {
+		if (mdb_ctf_lookup_by_name("struct refcount", &rc_id) == -1) {
+			mdb_warn("couldn't find struct refcount");
+			return (DCMD_ERR);
+		}
+		rc_id_valid = TRUE;
+	}
+	/* Locate the embedded refcount inside the containing structure. */
+	if (mdb_ctf_offsetof(*id, member, &memb_off) == -1) {
+		char name[64];
+		mdb_ctf_type_name(*id, name, sizeof (name));
+		mdb_warn("couldn't find member %s of type %s\n", member, name);
+		return (DCMD_ERR);
+	}
+	memb_off /= 8;
+	/* Read rc_count out of the refcount found at that offset. */
+	return (GETMEMBID(addr + memb_off, &rc_id, rc_count, *rc));
+}
+
+static int
+read_symbol(char *sym_name, void **bufp)
+{
+	GElf_Sym sym;
+	/* On DCMD_OK, *bufp is a local copy of the symbol's st_size bytes. */
+	if (mdb_lookup_by_obj(MDB_TGT_OBJ_EVERY, sym_name, &sym)) {
+		mdb_warn("can't find symbol %s", sym_name);
+		return (DCMD_ERR);
+	}
+
+	*bufp = mdb_alloc(sym.st_size, UM_SLEEP);
+	if (mdb_vread(*bufp, sym.st_size, sym.st_value) == -1) {
+		mdb_warn("can't read data for symbol %s", sym_name);
+		mdb_free(*bufp, sym.st_size);
+		*bufp = NULL;	/* never hand back a freed pointer */
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
+static int verbose;	/* nonzero: freelist walk also prints raw entries */
+
+static int
+freelist_walk_init(mdb_walk_state_t *wsp)
+{
+	/* The walk needs the address of the first entry to start from. */
+	if (wsp->walk_addr != NULL) {
+		wsp->walk_data = 0;  /* Index into the freelist */
+		return (WALK_NEXT);
+	}
+	mdb_warn("must supply starting address\n");
+	return (WALK_ERR);
+}
+
+static int
+freelist_walk_step(mdb_walk_state_t *wsp)
+{
+	uint64_t entry;
+	uintptr_t number = (uintptr_t)wsp->walk_data;
+	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
+	int mapshift = SPA_MINBLOCKSHIFT;
+	/* Read one 64-bit entry, then advance the address and the index. */
+	if (mdb_vread(&entry, sizeof (entry), wsp->walk_addr) == -1) {
+		mdb_warn("failed to read freelist entry %p", wsp->walk_addr);
+		return (WALK_DONE);
+	}
+	wsp->walk_addr += sizeof (entry);
+	wsp->walk_data = (void *)(number + 1);
+	/* Debug entries print action/txg/pass; others print a decoded run. */
+	if (SM_DEBUG_DECODE(entry)) {
+		mdb_printf("DEBUG: %3u  %10s: txg=%llu  pass=%llu\n",
+		    number,
+		    ddata[SM_DEBUG_ACTION_DECODE(entry)],
+		    SM_DEBUG_TXG_DECODE(entry),
+		    SM_DEBUG_SYNCPASS_DECODE(entry));
+	} else {
+		mdb_printf("Entry: %3u  offsets=%08llx-%08llx  type=%c  "
+		    "size=%06llx", number,
+		    SM_OFFSET_DECODE(entry) << mapshift,
+		    (SM_OFFSET_DECODE(entry) + SM_RUN_DECODE(entry)) <<
+		    mapshift,
+		    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
+		    SM_RUN_DECODE(entry) << mapshift);
+		if (verbose)
+			mdb_printf("      (raw=%012llx)\n", entry);
+		mdb_printf("\n");
+	}
+	return (WALK_NEXT);	/* the walk only ends when a read fails */
+}
+/* Nothing to release: freelist_walk_init() allocates no walk state. */
+/* ARGSUSED */
+static void
+freelist_walk_fini(mdb_walk_state_t *wsp)
+{
+}
+
+typedef struct dbuf_walk_data {
+	dbuf_hash_table_t ht;	/* local copy of target's dbuf_hash_table */
+	int64_t bucket;		/* bucket being walked; -1 before the first */
+	uintptr_t dbp;		/* target addr of next dbuf; 0 = chain done */
+	dmu_buf_impl_t db;	/* local copy of the dbuf last read */
+} dbuf_walk_data_t;
+
+static int
+dbuf_walk_init(mdb_walk_state_t *wsp)
+{
+	dbuf_walk_data_t *dwd;
+
+	if (wsp->walk_addr != NULL) {
+		mdb_warn("dbuf walk only supports global walks\n");
+		return (WALK_ERR);
+	}
+	/* Snapshot the hash table header; buckets are read step by step. */
+	dwd = mdb_alloc(sizeof (dbuf_walk_data_t), UM_SLEEP);
+
+	if (mdb_readvar(&dwd->ht, "dbuf_hash_table") == -1) {
+		mdb_warn("failed to read 'dbuf_hash_table'");
+		mdb_free(dwd, sizeof (dbuf_walk_data_t));
+		return (WALK_ERR);
+	}
+	dwd->bucket = -1;	/* the step function pre-increments to 0 */
+	dwd->dbp = 0;
+	wsp->walk_data = dwd;
+	return (WALK_NEXT);
+}
+
+static int
+dbuf_walk_step(mdb_walk_state_t *wsp)
+{
+	int status;
+	dbuf_walk_data_t *dwd = wsp->walk_data;
+	/* Current chain exhausted: advance to the next non-empty bucket. */
+	while (dwd->dbp == 0) {
+		dwd->bucket++;
+		if (dwd->bucket == dwd->ht.hash_table_mask+1)
+			return (WALK_DONE);
+
+		if (mdb_vread(&dwd->dbp, sizeof (void *),
+		    (uintptr_t)(dwd->ht.hash_table+dwd->bucket)) == -1) {
+			mdb_warn("failed to read hash bucket %lld at %p",
+			    dwd->bucket, dwd->ht.hash_table+dwd->bucket);
+			return (WALK_DONE);
+		}
+	}
+	/* Read the dbuf and hand our local copy to the callback. */
+	wsp->walk_addr = dwd->dbp;
+	if (mdb_vread(&dwd->db, sizeof (dmu_buf_impl_t),
+	    wsp->walk_addr) == -1) {
+		mdb_warn("failed to read dbuf at %p", wsp->walk_addr);
+		return (WALK_DONE);
+	}
+	status = wsp->walk_callback(wsp->walk_addr, &dwd->db, wsp->walk_cbdata);
+	/* Follow the hash chain; 0 makes the next step change buckets. */
+	dwd->dbp = (uintptr_t)dwd->db.db_hash_next;
+	return (status);
+}
+
+static void
+dbuf_walk_fini(mdb_walk_state_t *wsp)
+{
+	/* Release the walk state that dbuf_walk_init() allocated. */
+	mdb_free(wsp->walk_data, sizeof (dbuf_walk_data_t));
+}
+
+static int
+dataset_name(uintptr_t addr, char *buf)
+{
+	static int gotid;
+	static mdb_ctf_id_t dd_id;
+	uintptr_t dd_parent;
+	char dd_myname[MAXNAMELEN];
+	/* Look up struct dsl_dir once; the id is cached across calls. */
+	if (!gotid) {
+		if (mdb_ctf_lookup_by_name("struct dsl_dir",
+		    &dd_id) == -1) {
+			mdb_warn("couldn't find struct dsl_dir");
+			return (DCMD_ERR);
+		}
+		gotid = TRUE;
+	}
+	if (GETMEMBID(addr, &dd_id, dd_parent, dd_parent) ||
+	    GETMEMBID(addr, &dd_id, dd_myname, dd_myname)) {
+		return (DCMD_ERR);
+	}
+	/* Appends to buf: recurse first so ancestors precede this name. */
+	if (dd_parent) {
+		if (dataset_name(dd_parent, buf))
+			return (DCMD_ERR);
+		strcat(buf, "/");
+	}
+	/* NOTE(review): strcat() is unbounded and buf's size is not known. */
+	if (dd_myname[0])
+		strcat(buf, dd_myname);
+	else
+		strcat(buf, "???");
+
+	return (0);
+}
+
+static int
+objset_name(uintptr_t addr, char *buf)
+{
+	static mdb_ctf_id_t osi_id, ds_id;
+	static int ids_cached;
+	char snapname[MAXNAMELEN];
+	uintptr_t ds_addr, dir_addr;
+
+	/* Start from an empty string; everything below only appends. */
+	buf[0] = '\0';
+	if (!ids_cached) {
+		if (mdb_ctf_lookup_by_name("struct objset_impl",
+		    &osi_id) == -1) {
+			mdb_warn("couldn't find struct objset_impl");
+			return (DCMD_ERR);
+		}
+		if (mdb_ctf_lookup_by_name("struct dsl_dataset",
+		    &ds_id) == -1) {
+			mdb_warn("couldn't find struct dsl_dataset");
+			return (DCMD_ERR);
+		}
+		ids_cached = TRUE;
+	}
+
+	if (GETMEMBID(addr, &osi_id, os_dsl_dataset, ds_addr) != 0)
+		return (DCMD_ERR);
+
+	/* An objset with no dataset pointer is reported as "mos". */
+	if (ds_addr == 0) {
+		strcat(buf, "mos");
+		return (0);
+	}
+
+	if (GETMEMBID(ds_addr, &ds_id, ds_snapname, snapname) != 0)
+		return (DCMD_ERR);
+	if (GETMEMBID(ds_addr, &ds_id, ds_dir, dir_addr) != 0)
+		return (DCMD_ERR);
+
+	if (dir_addr != 0 && dataset_name(dir_addr, buf) != 0)
+		return (DCMD_ERR);
+
+	/* A non-empty snapshot name is appended after an '@'. */
+	if (snapname[0] != '\0') {
+		strcat(buf, "@");
+		strcat(buf, snapname);
+	}
+	return (0);
+}
+
+static void
+enum_lookup(char *out, size_t size, mdb_ctf_id_t id, int val,
+    const char *prefix)
+{
+	const char *cp;
+	size_t len = strlen(prefix);
+	/* Known enumerator: its name minus prefix; otherwise the raw value. */
+	if ((cp = mdb_ctf_enum_name(id, val)) != NULL) {
+		if (strncmp(cp, prefix, len) == 0)
+			cp += len;
+		(void) mdb_snprintf(out, size, "%s", cp);
+	} else {
+		mdb_snprintf(out, size, "? (%d)", val);
+	}
+}
+
+/* ARGSUSED */
+static int
+zio_pipeline(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	mdb_ctf_id_t stages;
+	char name[1024];
+	int bit;
+
+	if (mdb_ctf_lookup_by_name("enum zio_stage", &stages) == -1) {
+		mdb_warn("Could not find enum zio_stage");
+		return (DCMD_ERR);
+	}
+
+	/* addr is treated as a bitmask: name each of its low 32 bits set. */
+	for (bit = 0; bit < 32; bit++) {
+		if ((addr & (1U << bit)) == 0)
+			continue;
+		enum_lookup(name, sizeof (name), stages, bit, "ZIO_STAGE_");
+		mdb_printf("    %s\n", name);
+	}
+
+	return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	blkptr_t bp;
+	dva_t *dva;
+	dmu_object_type_info_t *doti;
+	zio_compress_info_t *zct;
+	zio_checksum_info_t *zci;
+	int i;
+	char buf[MAXPATHLEN];
+	/* NOTE(review): addr is used without checking DCMD_ADDRSPEC. */
+	if (mdb_vread(&bp, sizeof (blkptr_t), addr) == -1) {
+		mdb_warn("failed to read blkptr_t");
+		return (DCMD_ERR);
+	}
+	/* Copy each name table locally and re-point its names at copies. */
+	if (read_symbol("dmu_ot", (void **)&doti) != DCMD_OK)
+		return (DCMD_ERR);
+	for (i = 0; i < DMU_OT_NUMTYPES; i++) {
+		mdb_readstr(buf, sizeof (buf), (uintptr_t)doti[i].ot_name);
+		doti[i].ot_name = local_strdup(buf);
+	}
+	/* NOTE(review): the tables and duplicated names are never freed. */
+	if (read_symbol("zio_checksum_table", (void **)&zci) != DCMD_OK)
+		return (DCMD_ERR);
+	for (i = 0; i < ZIO_CHECKSUM_FUNCTIONS; i++) {
+		mdb_readstr(buf, sizeof (buf), (uintptr_t)zci[i].ci_name);
+		zci[i].ci_name = local_strdup(buf);
+	}
+	/* NOTE(review): table indices taken from bp below are unchecked. */
+	if (read_symbol("zio_compress_table", (void **)&zct) != DCMD_OK)
+		return (DCMD_ERR);
+	for (i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
+		mdb_readstr(buf, sizeof (buf), (uintptr_t)zct[i].ci_name);
+		zct[i].ci_name = local_strdup(buf);
+	}
+	/* After this loop dva still points at the last DVA (GANG uses it). */
+	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+		dva = &bp.blk_dva[i];
+		mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
+		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
+		mdb_printf("DVA[%d]:                    GRID:  %04x\t"
+		    "ASIZE: %llx\n", i, DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+	}
+	mdb_printf("LSIZE:  %-16llx\t\tPSIZE: %llx\n",
+	    BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
+	mdb_printf("ENDIAN: %6s             GANG:  %-5s\tTYPE:  %s\n",
+	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
+	    DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
+	    doti[BP_GET_TYPE(&bp)].ot_name);
+	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
+	    bp.blk_birth, BP_GET_LEVEL(&bp), bp.blk_fill);
+	mdb_printf("CKFUNC: %-16s\t\tCOMP:  %s\n",
+	    zci[BP_GET_CHECKSUM(&bp)].ci_name,
+	    zct[BP_GET_COMPRESS(&bp)].ci_name);
+	mdb_printf("CKSUM:  %llx:%llx:%llx:%llx\n",
+	    bp.blk_cksum.zc_word[0],
+	    bp.blk_cksum.zc_word[1],
+	    bp.blk_cksum.zc_word[2],
+	    bp.blk_cksum.zc_word[3]);
+
+	return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+dbuf(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	mdb_ctf_id_t id;
+	dmu_buf_t db;
+	uintptr_t objset;
+	uint8_t level;
+	uint64_t blkid;
+	uint64_t holds;
+	char objectname[32];
+	char blkidname[32];
+	char path[MAXNAMELEN];
+
+	if (DCMD_HDRSPEC(flags)) {
+		mdb_printf("        addr object lvl blkid holds os\n");
+	}
+
+	if (mdb_ctf_lookup_by_name("struct dmu_buf_impl", &id) == -1) {
+		mdb_warn("couldn't find struct dmu_buf_impl");
+		return (DCMD_ERR);
+	}
+	/* Read the fields we print; as a dcmd, fail with DCMD_ERR. */
+	if (GETMEMBID(addr, &id, db_objset, objset) ||
+	    GETMEMBID(addr, &id, db, db) ||
+	    GETMEMBID(addr, &id, db_level, level) ||
+	    GETMEMBID(addr, &id, db_blkid, blkid)) {
+		return (DCMD_ERR);
+	}
+
+	if (getrefcount(addr, &id, "db_holds", &holds)) {
+		return (DCMD_ERR);
+	}
+	/* The meta-dnode object and the bonus block id print as names. */
+	if (db.db_object == DMU_META_DNODE_OBJECT)
+		(void) strcpy(objectname, "mdn");
+	else
+		(void) mdb_snprintf(objectname, sizeof (objectname), "%llx",
+		    (u_longlong_t)db.db_object);
+
+	if (blkid == DB_BONUS_BLKID)
+		(void) strcpy(blkidname, "bonus");
+	else
+		(void) mdb_snprintf(blkidname, sizeof (blkidname), "%llx",
+		    (u_longlong_t)blkid);
+
+	if (objset_name(objset, path)) {
+		return (DCMD_ERR);
+	}
+
+	mdb_printf("%p %8s %1u %9s %2llu %s\n",
+	    addr, objectname, level, blkidname, holds, path);
+
+	return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+dbuf_stats(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+#define	HISTOSZ 32
+	uintptr_t dbp;
+	dmu_buf_impl_t db;
+	dbuf_hash_table_t ht;
+	uint64_t bucket, ndbufs;
+	uint64_t histo[HISTOSZ];	/* buckets, by chain length */
+	uint64_t histo2[HISTOSZ];	/* dbufs at chain depth >= index */
+	int i, maxidx;
+
+	if (mdb_readvar(&ht, "dbuf_hash_table") == -1) {
+		mdb_warn("failed to read 'dbuf_hash_table'");
+		return (DCMD_ERR);
+	}
+
+	for (i = 0; i < HISTOSZ; i++) {
+		histo[i] = 0;
+		histo2[i] = 0;
+	}
+	/* Walk every bucket's chain, tallying chain lengths and depths. */
+	ndbufs = 0;
+	for (bucket = 0; bucket < ht.hash_table_mask+1; bucket++) {
+		int len;
+
+		if (mdb_vread(&dbp, sizeof (void *),
+		    (uintptr_t)(ht.hash_table+bucket)) == -1) {
+			mdb_warn("failed to read hash bucket %llu at %p",
+			    bucket, ht.hash_table+bucket);
+			return (DCMD_ERR);
+		}
+
+		len = 0;
+		while (dbp != 0) {
+			if (mdb_vread(&db, sizeof (dmu_buf_impl_t),
+			    dbp) == -1) {
+				mdb_warn("failed to read dbuf at %p", dbp);
+				return (DCMD_ERR);
+			}
+			dbp = (uintptr_t)db.db_hash_next;
+			for (i = MIN(len, HISTOSZ - 1); i >= 0; i--)
+				histo2[i]++;
+			len++;
+			ndbufs++;
+		}
+		/* Chains too long for the histogram land in its last slot. */
+		if (len >= HISTOSZ)
+			len = HISTOSZ-1;
+		histo[len]++;
+	}
+	/* Guard both ratios below: an empty table has ndbufs == 0. */
+	mdb_printf("hash table has %llu buckets, %llu dbufs "
+	    "(avg %llu buckets/dbuf)\n",
+	    ht.hash_table_mask+1, ndbufs,
+	    ndbufs ? (ht.hash_table_mask+1)/ndbufs : 0);
+
+	mdb_printf("\n");
+	maxidx = 0;
+	for (i = 0; i < HISTOSZ; i++)
+		if (histo[i] > 0)
+			maxidx = i;
+	mdb_printf("hash chain length	number of buckets\n");
+	for (i = 0; i <= maxidx; i++)
+		mdb_printf("%u			%llu\n", i, histo[i]);
+
+	mdb_printf("\n");
+	maxidx = 0;
+	for (i = 0; i < HISTOSZ; i++)
+		if (histo2[i] > 0)
+			maxidx = i;
+	mdb_printf("hash chain depth	number of dbufs\n");
+	for (i = 0; i <= maxidx; i++)
+		mdb_printf("%u or more		%llu	%llu%%\n",
+		    i, histo2[i], ndbufs ? histo2[i]*100/ndbufs : 0);
+
+
+	return (DCMD_OK);
+}
+
+typedef struct dbufs_data {
+	mdb_ctf_id_t id;	/* CTF id of struct dmu_buf_impl */
+	uint64_t objset;	/* -O filter, or DBUFS_UNSET */
+	uint64_t object;	/* -o filter, or DBUFS_UNSET */
+	uint64_t level;		/* -l filter, or DBUFS_UNSET */
+	uint64_t blkid;		/* -b filter, or DBUFS_UNSET */
+	char *osname;		/* -n filter, or NULL */
+} dbufs_data_t;
+/* Sentinel meaning "no filter given" for the uint64_t fields above. */
+#define	DBUFS_UNSET	(0xbaddcafedeadbeefULL)
+
+/* ARGSUSED */
+static int
+dbufs_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	dbufs_data_t *filt = arg;
+	char osname[MAXNAMELEN];
+	uintptr_t objset;
+	uint64_t blkid;
+	uint8_t level;
+	dmu_buf_t db;
+	if (GETMEMBID(addr, &filt->id, db_objset, objset) ||
+	    GETMEMBID(addr, &filt->id, db, db) ||
+	    GETMEMBID(addr, &filt->id, db_level, level) ||
+	    GETMEMBID(addr, &filt->id, db_blkid, blkid))
+		return (WALK_ERR);
+	/* Skip (but keep walking past) any dbuf that a set filter rejects. */
+	if (filt->objset != DBUFS_UNSET && filt->objset != objset)
+		return (WALK_NEXT);
+	if (filt->osname != NULL && (objset_name(objset, osname) != 0 ||
+	    strcmp(filt->osname, osname) != 0))
+		return (WALK_NEXT);
+	if ((filt->object != DBUFS_UNSET && filt->object != db.db_object) ||
+	    (filt->level != DBUFS_UNSET && filt->level != level) ||
+	    (filt->blkid != DBUFS_UNSET && filt->blkid != blkid))
+		return (WALK_NEXT);
+	mdb_printf("%#lr\n", addr);
+	return (WALK_NEXT);
+}
+
+/* ARGSUSED */
+static int
+dbufs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	dbufs_data_t data;
+	char *object = NULL;
+	char *blkid = NULL;
+
+	data.objset = data.object = data.level = data.blkid = DBUFS_UNSET;
+	data.osname = NULL;
+	/* The option list passed to mdb_getopts() must end with NULL. */
+	if (mdb_getopts(argc, argv,
+	    'O', MDB_OPT_UINT64, &data.objset,
+	    'n', MDB_OPT_STR, &data.osname,
+	    'o', MDB_OPT_STR, &object,
+	    'l', MDB_OPT_UINT64, &data.level,
+	    'b', MDB_OPT_STR, &blkid, NULL) != argc) {
+		return (DCMD_USAGE);
+	}
+	/* "mdn" and "bonus" are accepted as names for the special ids. */
+	if (object) {
+		if (strcmp(object, "mdn") == 0) {
+			data.object = DMU_META_DNODE_OBJECT;
+		} else {
+			data.object = mdb_strtoull(object);
+		}
+	}
+
+	if (blkid) {
+		if (strcmp(blkid, "bonus") == 0) {
+			data.blkid = DB_BONUS_BLKID;
+		} else {
+			data.blkid = mdb_strtoull(blkid);
+		}
+	}
+
+	if (mdb_ctf_lookup_by_name("struct dmu_buf_impl", &data.id) == -1) {
+		mdb_warn("couldn't find struct dmu_buf_impl");
+		return (DCMD_ERR);
+	}
+	/* dbufs_cb() prints the address of each dbuf passing the filters. */
+	if (mdb_pwalk("dbufs", dbufs_cb, &data, 0) != 0) {
+		mdb_warn("can't walk dbufs");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
+typedef struct abuf_find_data {
+	dva_t dva;		/* DVA words to match against each b_dva */
+	mdb_ctf_id_t id;	/* CTF id of struct arc_buf_hdr */
+} abuf_find_data_t;
+
+/* ARGSUSED */
+static int
+abuf_find_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	abuf_find_data_t *want = arg;
+	dva_t have;
+
+	if (GETMEMBID(addr, &want->id, b_dva, have) != 0)
+		return (WALK_ERR);
+
+	/* Print the address only when both DVA words match. */
+	if (have.dva_word[0] != want->dva.dva_word[0] ||
+	    have.dva_word[1] != want->dva.dva_word[1])
+		return (WALK_NEXT);
+	mdb_printf("%#lr\n", addr);
+	return (WALK_NEXT);
+}
+
+/* ARGSUSED */
+static int
+abuf_find(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	const char *lists[] = {
+		"ARC_mru_top",
+		"ARC_mru_bot",
+		"ARC_mfu_top",
+		"ARC_mfu_bot",
+	};
+	const int nlists = sizeof (lists) / sizeof (lists[0]);
+	abuf_find_data_t data;
+	GElf_Sym sym;
+	int w, n;
+
+	if (argc != 2)
+		return (DCMD_USAGE);
+
+	/* The two arguments are the DVA's words, as strings or immediates. */
+	for (w = 0; w < 2; w++) {
+		const mdb_arg_t *arg = &argv[w];
+		if (arg->a_type == MDB_TYPE_STRING)
+			data.dva.dva_word[w] = mdb_strtoull(arg->a_un.a_str);
+		else if (arg->a_type == MDB_TYPE_IMMEDIATE)
+			data.dva.dva_word[w] = arg->a_un.a_val;
+		else
+			return (DCMD_USAGE);
+	}
+
+	if (mdb_ctf_lookup_by_name("struct arc_buf_hdr", &data.id) == -1) {
+		mdb_warn("couldn't find struct arc_buf_hdr");
+		return (DCMD_ERR);
+	}
+
+	/* Walk each list named above; abuf_find_cb() prints the matches. */
+	for (n = 0; n < nlists; n++) {
+		if (mdb_lookup_by_name(lists[n], &sym)) {
+			mdb_warn("can't find symbol %s", lists[n]);
+			return (DCMD_ERR);
+		}
+
+		if (mdb_pwalk("list", abuf_find_cb, &data, sym.st_value) != 0) {
+			mdb_warn("can't walk %s", lists[n]);
+			return (DCMD_ERR);
+		}
+	}
+
+	return (DCMD_OK);
+}
+/* Prints the usage line for ::abuf_find. */
+void
+abuf_help(void)
+{
+	mdb_printf("::abuf_find dva_word[0] dva_word[1]\n");
+}
+
+/*
+ * ::spa
+ *
+ * 	-c	Print configuration information as well
+ * 	-v	Print vdev state
+ * 	-e	Print vdev error stats
+ *
+ * Print a summarized spa_t.  When given no arguments, prints out a table of all
+ * active pools on the system.
+ */
+/* ARGSUSED */
+static int
+spa_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	spa_t spa;
+	char poolname[MAXNAMELEN];
+	const char *statetab[] = { "ACTIVE", "EXPORTED", "DESTROYED",
+		"UNINIT", "UNAVAIL" };
+	const char *state;
+	int config = FALSE;
+	int vdevs = FALSE;
+	int errors = FALSE;
+
+	if (mdb_getopts(argc, argv,
+	    'c', MDB_OPT_SETBITS, TRUE, &config,
+	    'v', MDB_OPT_SETBITS, TRUE, &vdevs,
+	    'e', MDB_OPT_SETBITS, TRUE, &errors,
+	    NULL) != argc)
+		return (DCMD_USAGE);
+	/* With no address, apply this dcmd to every pool via the spa walker. */
+	if (!(flags & DCMD_ADDRSPEC)) {
+		if (mdb_walk_dcmd("spa", "spa", argc, argv) == -1) {
+			mdb_warn("can't walk spa");
+			return (DCMD_ERR);
+		}
+
+		return (DCMD_OK);
+	}
+	/* When feeding a pipeline, emit only the address. */
+	if (flags & DCMD_PIPE_OUT) {
+		mdb_printf("%#lr\n", addr);
+		return (DCMD_OK);
+	}
+
+	if (DCMD_HDRSPEC(flags))
+		mdb_printf("%<u>%-?s %9s %-*s%</u>\n", "ADDR", "STATE",
+		    sizeof (uintptr_t) == 4 ? 60 : 52, "NAME");
+
+	if (mdb_vread(&spa, sizeof (spa), addr) == -1) {
+		mdb_warn("failed to read spa_t at %p", addr);
+		return (DCMD_ERR);
+	}
+
+	if (mdb_readstr(poolname, sizeof (poolname), (uintptr_t)spa.spa_name)
+	    == -1) {
+		mdb_warn("failed to read pool name at %p", spa.spa_name);
+		return (DCMD_ERR);
+	}
+	/* Map the state onto statetab; anything out of range is unknown. */
+	if (spa.spa_state < 0 || spa.spa_state > POOL_STATE_UNAVAIL)
+		state = "UNKNOWN";
+	else
+		state = statetab[spa.spa_state];
+
+	mdb_printf("%0?p %9s %s\n", addr, state, poolname);
+
+	if (config) {
+		mdb_printf("\n");
+		mdb_inc_indent(4);
+		if (mdb_call_dcmd("spa_config", addr, flags, 0,
+		    NULL) != DCMD_OK)
+			return (DCMD_ERR);
+		mdb_dec_indent(4);
+	}
+	/* -v and -e both defer to ::spa_vdevs; only -e is passed through. */
+	if (vdevs || errors) {
+		mdb_arg_t v;
+
+		v.a_type = MDB_TYPE_STRING;
+		v.a_un.a_str = "-e";
+
+		mdb_printf("\n");
+		mdb_inc_indent(4);
+		if (mdb_call_dcmd("spa_vdevs", addr, flags, errors ? 1 : 0,
+		    &v) != DCMD_OK)
+			return (DCMD_ERR);
+		mdb_dec_indent(4);
+	}
+
+	return (DCMD_OK);
+}
+
+/*
+ * ::spa_config
+ *
+ * Given a spa_t, print the configuration information stored in spa_config.
+ * Since it's just an nvlist, format it as an indented list of name=value pairs.
+ * We simply read the value of spa_config and pass off to ::nvlist.
+ */
+/* ARGSUSED */
+static int
+spa_print_config(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	spa_t pool;
+
+	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_vread(&pool, sizeof (pool), addr) == -1) {
+		mdb_warn("failed to read spa_t at %p", addr);
+		return (DCMD_ERR);
+	}
+
+	/* Hand a non-NULL config to ::nvlist; otherwise say there is none. */
+	if (pool.spa_config != NULL) {
+		return (mdb_call_dcmd("nvlist", (uintptr_t)pool.spa_config,
+		    flags, 0, NULL));
+	}
+	mdb_printf("(none)\n");
+	return (DCMD_OK);
+}
+
+void
+vdev_help(void)
+{
+	mdb_printf("[vdev_t*]::vdev [-qr]\n");
+	mdb_printf("\t-> -q display vdev_queue parameters\n");
+	mdb_printf("\t-> -r recursive (visit all children)\n");
+}
+
+/*
+ * ::vdev
+ *
+ * Print out a summarized vdev_t, in the following form:
+ *
+ * ADDR             STATE	AUX            DESC
+ * fffffffbcde23df0 HEALTHY	-              /dev/dsk/c0t0d0
+ *
+ * or with "-q" to print out a vdev_t's vdev_queue parameters:
+ *
+ *  vdev_t: c26ae4c0
+ *     c26ae73c min pending         0x2
+ *     c26ae744 max pending         0x23
+ *     c26ae74c agg limit           0x20000
+ *     c26ae754 time shift          0x4
+ *     c26ae75c ramp rate           0x2
+ *
+ * If '-r' is specified, recursively visit all children.
+ *
+ * With '-e', the statistics associated with the vdev are printed as well.
+ */
+static int
+do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats,
+    int recursive)
+{
+	vdev_t vdev;
+	char desc[MAXNAMELEN];
+	int c, children;
+	uintptr_t *child;
+	const char *state, *aux;
+
+	/*
+	 * addr is the target address of the vdev_t; depth is the number of
+	 * columns by which the description is indented.  queue, stats and
+	 * recursive are booleans selecting the optional output sections and
+	 * whether the children are visited.
+	 */
+	if (mdb_vread(&vdev, sizeof (vdev), (uintptr_t)addr) == -1) {
+		mdb_warn("failed to read vdev_t at %p\n", (uintptr_t)addr);
+		return (DCMD_ERR);
+	}
+
+	if (flags & DCMD_PIPE_OUT) {
+		/*
+		 * Terminate each address with a newline.  Without it, a
+		 * recursive traversal would emit all addresses back to back
+		 * and the next dcmd in the pipeline could not separate them.
+		 */
+		mdb_printf("%#lr\n", addr);
+	} else {
+		/*
+		 * Describe the vdev by its path if it has one, otherwise by
+		 * the type name recorded in its ops vector.
+		 */
+		if (vdev.vdev_path != NULL) {
+			if (mdb_readstr(desc, sizeof (desc),
+			    (uintptr_t)vdev.vdev_path) == -1) {
+				mdb_warn("failed to read vdev_path at %p\n",
+				    vdev.vdev_path);
+				return (DCMD_ERR);
+			}
+		} else if (vdev.vdev_ops != NULL) {
+			vdev_ops_t ops;
+			if (mdb_vread(&ops, sizeof (ops),
+			    (uintptr_t)vdev.vdev_ops) == -1) {
+				mdb_warn("failed to read vdev_ops at %p\n",
+				    vdev.vdev_ops);
+				return (DCMD_ERR);
+			}
+			(void) strcpy(desc, ops.vdev_op_type);
+		} else {
+			(void) strcpy(desc, "<unknown>");
+		}
+
+		/* Only the top of the tree prints the column header. */
+		if (depth == 0 && DCMD_HDRSPEC(flags))
+			mdb_printf("%<u>%-?s %-9s %-12s %-*s%</u>\n",
+			    "ADDR", "STATE", "AUX",
+			    sizeof (uintptr_t) == 4 ? 43 : 35,
+			    "DESCRIPTION");
+
+		mdb_printf("%0?p ", addr);
+
+		switch (vdev.vdev_state) {
+		case VDEV_STATE_CLOSED:
+			state = "CLOSED";
+			break;
+		case VDEV_STATE_OFFLINE:
+			state = "OFFLINE";
+			break;
+		case VDEV_STATE_CANT_OPEN:
+			state = "CANT_OPEN";
+			break;
+		case VDEV_STATE_DEGRADED:
+			state = "DEGRADED";
+			break;
+		case VDEV_STATE_HEALTHY:
+			state = "HEALTHY";
+			break;
+		default:
+			state = "UNKNOWN";
+			break;
+		}
+
+		switch (vdev.vdev_stat.vs_aux) {
+		case VDEV_AUX_NONE:
+			aux = "-";
+			break;
+		case VDEV_AUX_OPEN_FAILED:
+			aux = "OPEN_FAILED";
+			break;
+		case VDEV_AUX_CORRUPT_DATA:
+			aux = "CORRUPT_DATA";
+			break;
+		case VDEV_AUX_NO_REPLICAS:
+			aux = "NO_REPLICAS";
+			break;
+		case VDEV_AUX_BAD_GUID_SUM:
+			aux = "BAD_GUID_SUM";
+			break;
+		case VDEV_AUX_TOO_SMALL:
+			aux = "TOO_SMALL";
+			break;
+		case VDEV_AUX_BAD_LABEL:
+			aux = "BAD_LABEL";
+			break;
+		default:
+			aux = "UNKNOWN";
+			break;
+		}
+
+		/* The empty string padded to 'depth' columns is the indent. */
+		mdb_printf("%-9s %-12s %*s%s\n", state, aux, depth, "", desc);
+
+		/*
+		 * Queue tunables: each line shows the target address of the
+		 * field followed by its current value.
+		 */
+		if (queue) {
+			mdb_inc_indent(4);
+			mdb_printf("\n");
+			mdb_printf("%p min pending		0x%llx\n",
+			    (uintptr_t)(addr + offsetof(vdev_t,
+			    vdev_queue.vq_min_pending)),
+			    vdev.vdev_queue.vq_min_pending);
+			mdb_printf("%p max pending		0x%llx\n",
+			    (uintptr_t)(addr + offsetof(vdev_t,
+			    vdev_queue.vq_max_pending)),
+			    vdev.vdev_queue.vq_max_pending);
+			mdb_printf("%p agg limit		0x%llx\n",
+			    (uintptr_t)(addr + offsetof(vdev_t,
+			    vdev_queue.vq_agg_limit)),
+			    vdev.vdev_queue.vq_agg_limit);
+			mdb_printf("%p time shift		0x%llx\n",
+			    (uintptr_t)(addr + offsetof(vdev_t,
+			    vdev_queue.vq_time_shift)),
+			    vdev.vdev_queue.vq_time_shift);
+			mdb_printf("%p ramp rate 		0x%llx\n",
+			    (uintptr_t)(addr + offsetof(vdev_t,
+			    vdev_queue.vq_ramp_rate)),
+			    vdev.vdev_queue.vq_ramp_rate);
+			mdb_dec_indent(4);
+		}
+
+		if (stats) {
+			vdev_stat_t *vs = &vdev.vdev_stat;
+			int i;
+
+			mdb_inc_indent(4);
+			mdb_printf("\n");
+			mdb_printf("%<u>       %12s %12s %12s %12s "
+			    "%12s%</u>\n", "READ", "WRITE", "FREE", "CLAIM",
+			    "IOCTL");
+			/* Index 0 is skipped; the columns start at index 1. */
+			mdb_printf("OPS     ");
+			for (i = 1; i < ZIO_TYPES; i++)
+				mdb_printf("%11#llx%s", vs->vs_ops[i],
+				    i == ZIO_TYPES - 1 ? "" : "  ");
+			mdb_printf("\n");
+			mdb_printf("BYTES   ");
+			for (i = 1; i < ZIO_TYPES; i++)
+				mdb_printf("%11#llx%s", vs->vs_bytes[i],
+				    i == ZIO_TYPES - 1 ? "" : "  ");
+
+
+			mdb_printf("\n");
+			mdb_printf("EREAD    %10#llx\n", vs->vs_read_errors);
+			mdb_printf("EWRITE   %10#llx\n", vs->vs_write_errors);
+			mdb_printf("ECKSUM   %10#llx\n",
+			    vs->vs_checksum_errors);
+			mdb_dec_indent(4);
+		}
+
+		if (queue || stats)
+			mdb_printf("\n");
+	}
+
+	children = vdev.vdev_children;
+
+	if (children == 0 || !recursive)
+		return (DCMD_OK);
+
+	/* Pull in the array of child pointers, then visit each child. */
+	child = mdb_alloc(children * sizeof (void *), UM_SLEEP | UM_GC);
+	if (mdb_vread(child, children * sizeof (void *),
+	    (uintptr_t)vdev.vdev_child) == -1) {
+		mdb_warn("failed to read vdev children at %p", vdev.vdev_child);
+		return (DCMD_ERR);
+	}
+
+	for (c = 0; c < children; c++) {
+		if (do_print_vdev(child[c], flags, depth + 2, queue, stats,
+		    recursive))
+			return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
+static int
+vdev_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	int want_stats = FALSE;
+	int want_queue = FALSE;
+	int descend = FALSE;
+	int nparsed;
+
+	/* Every argument must be one of the recognized option letters. */
+	nparsed = mdb_getopts(argc, argv,
+	    'q', MDB_OPT_SETBITS, TRUE, &want_queue,
+	    'r', MDB_OPT_SETBITS, TRUE, &descend,
+	    'e', MDB_OPT_SETBITS, TRUE, &want_stats,
+	    NULL);
+	if (nparsed != argc)
+		return (DCMD_USAGE);
+
+	if ((flags & DCMD_ADDRSPEC) == 0) {
+		mdb_warn("no vdev_t address given\n");
+		return (DCMD_ERR);
+	}
+
+	/* Depth 0: this vdev is the top of the printed tree. */
+	return (do_print_vdev(addr, flags, 0, want_queue, want_stats,
+	    descend));
+}
+
+/*
+ * The members of struct spa that ::spa_space fetches individually with
+ * GETMEMB(); both are held as target addresses.
+ */
+typedef struct mdb_spa {
+	uintptr_t spa_dsl_pool;
+	uintptr_t spa_root_vdev;
+} mdb_spa_t;
+
+/*
+ * The members of struct dsl_dir that ::spa_space fetches with GETMEMB().
+ */
+typedef struct mdb_dsl_dir {
+	uintptr_t dd_phys;		/* target address, read further below */
+	uint64_t dd_used_bytes;
+	int64_t dd_space_towrite[TXG_SIZE];
+} mdb_dsl_dir_t;
+
+/*
+ * The members of struct dsl_dir_phys that ::spa_space fetches with GETMEMB().
+ * ::spa_space treats a failure to read the two compression counters as
+ * non-fatal.
+ */
+typedef struct mdb_dsl_dir_phys {
+	uint64_t dd_used_bytes;
+	uint64_t dd_compressed_bytes;
+	uint64_t dd_uncompressed_bytes;
+} mdb_dsl_dir_phys_t;
+
+/*
+ * The members of struct vdev that ::spa_space fetches with GETMEMB() for
+ * each child of the root vdev.
+ */
+typedef struct mdb_vdev {
+	uintptr_t vdev_parent;		/* target address */
+	uintptr_t vdev_ms;		/* target address of metaslab array */
+	uint64_t vdev_ms_count;		/* number of entries in vdev_ms */
+	vdev_stat_t vdev_stat;
+} mdb_vdev_t;
+
+/*
+ * The members of struct metaslab that ::spa_space fetches with GETMEMB();
+ * it sums the sm_space of each map and ms_usable_space over all metaslabs.
+ */
+typedef struct mdb_metaslab {
+	space_map_t ms_allocmap[TXG_SIZE];
+	space_map_t ms_freemap[TXG_SIZE];
+	space_map_t ms_map;
+	uint64_t ms_usable_space;
+} mdb_metaslab_t;
+
+/*
+ * ::spa_space [-b]
+ *
+ * Given a spa_t, print out its on-disk space usage and in-core
+ * estimates of future usage.  If -b is given, print space in bytes.
+ * Otherwise print in megabytes.
+ */
+/* ARGSUSED */
+static int
+spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	mdb_spa_t spa;
+	uintptr_t dp_root_dir;
+	mdb_dsl_dir_t dd;
+	mdb_dsl_dir_phys_t dsp;
+	uint64_t children;
+	uintptr_t childaddr;
+	uintptr_t *child;
+	/* Running totals accumulated over every metaslab visited below. */
+	uint64_t ms_allocmap[TXG_SIZE] = {0, 0, 0, 0};
+	uint64_t ms_freemap[TXG_SIZE] = {0, 0, 0, 0};
+	uint64_t ms_map = 0;
+	uint64_t ms_usable_space = 0;
+	int i, j;
+	int havecompressed = TRUE;
+	/* Default output unit is megabytes: values are shifted right by 20. */
+	int shift = 20;
+	char *suffix = "M";
+	int bits = FALSE;	/* set by -b: print unscaled values */
+
+	if (mdb_getopts(argc, argv, 'b', MDB_OPT_SETBITS, TRUE, &bits, NULL) !=
+	    argc)
+		return (DCMD_USAGE);
+	if (!(flags & DCMD_ADDRSPEC))
+		return (DCMD_USAGE);
+
+	if (bits) {
+		shift = 0;
+		suffix = "";
+	}
+
+	/*
+	 * Fetch the required members one at a time, following the pointers
+	 * spa -> root vdev, spa -> dsl_pool -> root dsl_dir -> dd_phys.
+	 * Any failure here aborts the dcmd.
+	 */
+	if (GETMEMB(addr, struct spa, spa_dsl_pool, spa.spa_dsl_pool) ||
+	    GETMEMB(addr, struct spa, spa_root_vdev, spa.spa_root_vdev) ||
+	    GETMEMB(spa.spa_root_vdev, struct vdev, vdev_children, children) ||
+	    GETMEMB(spa.spa_root_vdev, struct vdev, vdev_child, childaddr) ||
+	    GETMEMB(spa.spa_dsl_pool, struct dsl_pool,
+	    dp_root_dir, dp_root_dir) ||
+	    GETMEMB(dp_root_dir, struct dsl_dir, dd_phys, dd.dd_phys) ||
+	    GETMEMB(dp_root_dir, struct dsl_dir,
+	    dd_used_bytes, dd.dd_used_bytes) ||
+	    GETMEMB(dp_root_dir, struct dsl_dir,
+	    dd_space_towrite, dd.dd_space_towrite) ||
+	    GETMEMB(dd.dd_phys, struct dsl_dir_phys,
+	    dd_used_bytes, dsp.dd_used_bytes)) {
+		return (DCMD_ERR);
+	}
+
+	/*
+	 * The compression counters are optional: if they cannot be fetched,
+	 * the corresponding output lines are simply omitted.
+	 */
+	if (GETMEMB(dd.dd_phys, struct dsl_dir_phys,
+	    dd_compressed_bytes, dsp.dd_compressed_bytes) ||
+	    GETMEMB(dd.dd_phys, struct dsl_dir_phys,
+	    dd_uncompressed_bytes, dsp.dd_uncompressed_bytes)) {
+		havecompressed = FALSE;
+	}
+
+	/* Read the root vdev's array of child vdev pointers. */
+	child = mdb_alloc(children * sizeof (void *), UM_SLEEP | UM_GC);
+	if (mdb_vread(child, children * sizeof (void *), childaddr) == -1) {
+		mdb_warn("failed to read root vdev children at %p", childaddr);
+		return (DCMD_ERR);
+	}
+
+	mdb_printf("dd_space_towrite = %llu%s %llu%s %llu%s %llu%s\n",
+	    dd.dd_space_towrite[0] >> shift, suffix,
+	    dd.dd_space_towrite[1] >> shift, suffix,
+	    dd.dd_space_towrite[2] >> shift, suffix,
+	    dd.dd_space_towrite[3] >> shift, suffix);
+	mdb_printf("dd_used_bytes = %llu%s\n",
+	    dd.dd_used_bytes >> shift, suffix);
+
+	mdb_printf("dd_phys.dd_used_bytes = %llu%s\n",
+	    dsp.dd_used_bytes >> shift, suffix);
+	if (havecompressed) {
+		mdb_printf("dd_phys.dd_compressed_bytes = %llu%s\n",
+		    dsp.dd_compressed_bytes >> shift, suffix);
+		mdb_printf("dd_phys.dd_uncompressed_bytes = %llu%s\n",
+		    dsp.dd_uncompressed_bytes >> shift, suffix);
+	}
+
+	for (i = 0; i < children; i++) {
+		mdb_vdev_t vd;
+		uintptr_t *vdev_ms;
+
+		if (GETMEMB(child[i], struct vdev,
+		    vdev_parent, vd.vdev_parent) ||
+		    GETMEMB(child[i], struct vdev,
+		    vdev_stat, vd.vdev_stat) ||
+		    GETMEMB(child[i], struct vdev, vdev_ms, vd.vdev_ms) ||
+		    GETMEMB(child[i], struct vdev,
+		    vdev_ms_count, vd.vdev_ms_count)) {
+			return (DCMD_ERR);
+		}
+
+		/*
+		 * If this is the root vdev, its stats are the pool-wide stats.
+		 *
+		 * NOTE(review): child[] holds the children of the root vdev,
+		 * so a NULL parent is presumably never seen here and the
+		 * pool_alloc/pool_space lines would not be printed -- confirm.
+		 */
+		if (vd.vdev_parent == NULL) {
+			mdb_printf("pool_alloc = %llu%s\n",
+			    vd.vdev_stat.vs_alloc >> shift, suffix);
+			mdb_printf("pool_space = %llu%s\n",
+			    vd.vdev_stat.vs_space >> shift, suffix);
+		}
+
+		/*
+		 * If this is not a top-level vdev, it doesn't have space.
+		 */
+		if (vd.vdev_parent != spa.spa_root_vdev)
+			continue;
+
+		/* Read this vdev's array of metaslab pointers. */
+		vdev_ms = mdb_alloc(vd.vdev_ms_count * sizeof (void*),
+		    UM_SLEEP | UM_GC);
+		if (mdb_vread(vdev_ms, vd.vdev_ms_count * sizeof (void*),
+		    (uintptr_t)vd.vdev_ms) == -1) {
+			mdb_warn("failed to read vdev_ms at %p", vd.vdev_ms);
+			return (DCMD_ERR);
+		}
+
+		/* Add each metaslab's space figures to the running totals. */
+		for (j = 0; j < vd.vdev_ms_count; j++) {
+			mdb_metaslab_t ms;
+
+			if (GETMEMB(vdev_ms[j], struct metaslab,
+			    ms_allocmap, ms.ms_allocmap) ||
+			    GETMEMB(vdev_ms[j], struct metaslab,
+			    ms_freemap, ms.ms_freemap) ||
+			    GETMEMB(vdev_ms[j], struct metaslab,
+			    ms_map, ms.ms_map) ||
+			    GETMEMB(vdev_ms[j], struct metaslab,
+			    ms_usable_space, ms.ms_usable_space)) {
+				return (DCMD_ERR);
+			}
+
+			ms_allocmap[0] += ms.ms_allocmap[0].sm_space;
+			ms_allocmap[1] += ms.ms_allocmap[1].sm_space;
+			ms_allocmap[2] += ms.ms_allocmap[2].sm_space;
+			ms_allocmap[3] += ms.ms_allocmap[3].sm_space;
+			ms_freemap[0] += ms.ms_freemap[0].sm_space;
+			ms_freemap[1] += ms.ms_freemap[1].sm_space;
+			ms_freemap[2] += ms.ms_freemap[2].sm_space;
+			ms_freemap[3] += ms.ms_freemap[3].sm_space;
+			ms_map += ms.ms_map.sm_space;
+			ms_usable_space += ms.ms_usable_space;
+		}
+	}
+
+	mdb_printf("ms_allocmap = %llu%s %llu%s %llu%s %llu%s\n",
+	    ms_allocmap[0] >> shift, suffix,
+	    ms_allocmap[1] >> shift, suffix,
+	    ms_allocmap[2] >> shift, suffix,
+	    ms_allocmap[3] >> shift, suffix);
+	mdb_printf("ms_freemap = %llu%s %llu%s %llu%s %llu%s\n",
+	    ms_freemap[0] >> shift, suffix,
+	    ms_freemap[1] >> shift, suffix,
+	    ms_freemap[2] >> shift, suffix,
+	    ms_freemap[3] >> shift, suffix);
+	mdb_printf("ms_map = %llu%s\n", ms_map >> shift, suffix);
+	mdb_printf("ms_usable_space = %llu%s\n",
+	    ms_usable_space >> shift, suffix);
+
+	return (DCMD_OK);
+}
+
+/*
+ * ::spa_verify
+ *
+ * Given a spa_t, verify that the pool is self-consistent.
+ * Currently, it only checks to make sure that the vdev tree exists.
+ */
+/* ARGSUSED */
+static int
+spa_verify(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	spa_t pool;
+
+	/* Takes no arguments and requires an explicit address. */
+	if (argc != 0)
+		return (DCMD_USAGE);
+	if ((flags & DCMD_ADDRSPEC) == 0)
+		return (DCMD_USAGE);
+
+	if (mdb_vread(&pool, sizeof (pool), addr) == -1) {
+		mdb_warn("failed to read spa_t at %p", addr);
+		return (DCMD_ERR);
+	}
+
+	/* The single check performed: report a pool with no vdev tree. */
+	if (pool.spa_root_vdev == NULL)
+		mdb_printf("no vdev tree present\n");
+
+	return (DCMD_OK);
+}
+
+/*
+ * ::spa_vdevs
+ *
+ * 	-e	Include error stats
+ *
+ * Print out a summarized list of vdevs for the given spa_t.
+ * This is accomplished by invoking "::vdev -r" on the root vdev, or
+ * "::vdev -re" when -e is given.
+ */
+/* ARGSUSED */
+static int
+spa_vdevs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	spa_t spa;
+	mdb_arg_t v;
+	int errors = FALSE;
+
+	/* The only option accepted is -e; anything else is a usage error. */
+	if (mdb_getopts(argc, argv,
+	    'e', MDB_OPT_SETBITS, TRUE, &errors,
+	    NULL) != argc)
+		return (DCMD_USAGE);
+
+	if (!(flags & DCMD_ADDRSPEC))
+		return (DCMD_USAGE);
+
+	if (mdb_vread(&spa, sizeof (spa), addr) == -1) {
+		mdb_warn("failed to read spa_t at %p", addr);
+		return (DCMD_ERR);
+	}
+
+	/*
+	 * A pool without a root vdev has nothing to summarize.  Say so here
+	 * rather than handing a NULL address to ::vdev, which would only
+	 * fail trying to read a vdev_t from address 0.
+	 */
+	if (spa.spa_root_vdev == NULL) {
+		mdb_printf("no associated vdevs\n");
+		return (DCMD_OK);
+	}
+
+	/* Build the single string argument passed on to ::vdev. */
+	v.a_type = MDB_TYPE_STRING;
+	v.a_un.a_str = errors ? "-re" : "-r";
+
+	return (mdb_call_dcmd("vdev", (uintptr_t)spa.spa_root_vdev,
+	    flags, 1, &v));
+}
+
+/*
+ * Private state for the txg_list walkers, set up by
+ * txg_list_walk_init_common() and consumed by txg_list_walk_step().
+ */
+typedef struct txg_list_walk_data {
+	uintptr_t lw_head[TXG_SIZE];	/* copy of the list's tl_head[] */
+	int	lw_txgoff;		/* tl_head[] index being walked */
+	int	lw_maxoff;		/* last index the walk may reach */
+	size_t	lw_offset;		/* copy of the list's tl_offset */
+	void	*lw_obj;		/* buffer for the current element */
+} txg_list_walk_data_t;
+
+static int
+txg_list_walk_init_common(mdb_walk_state_t *wsp, int txg, int maxoff)
+{
+	txg_list_walk_data_t *state;
+	txg_list_t tl;
+	int slot;
+
+	state = mdb_alloc(sizeof (*state), UM_SLEEP | UM_GC);
+
+	if (mdb_vread(&tl, sizeof (tl), wsp->walk_addr) == -1) {
+		mdb_warn("failed to read txg_list_t at %#lx", wsp->walk_addr);
+		return (WALK_ERR);
+	}
+
+	/* Snapshot the per-txg list heads and the node offset. */
+	state->lw_offset = tl.tl_offset;
+	for (slot = 0; slot < TXG_SIZE; slot++)
+		state->lw_head[slot] = (uintptr_t)tl.tl_head[slot];
+
+	/* The walk covers slots txg through maxoff, inclusive. */
+	state->lw_txgoff = txg;
+	state->lw_maxoff = maxoff;
+
+	/* Scratch buffer holding one element up to the end of its node. */
+	state->lw_obj = mdb_alloc(state->lw_offset + sizeof (txg_node_t),
+	    UM_SLEEP | UM_GC);
+
+	wsp->walk_data = state;
+	wsp->walk_addr = state->lw_head[txg];
+
+	return (WALK_NEXT);
+}
+
+static int
+txg_list_walk_init(mdb_walk_state_t *wsp)
+{
+	/* Begin with slot 0 and continue through the final slot. */
+	const int first = 0;
+	const int last = TXG_SIZE - 1;
+
+	return (txg_list_walk_init_common(wsp, first, last));
+}
+
+static int
+txg_list0_walk_init(mdb_walk_state_t *wsp)
+{
+	const int slot = 0;	/* the walk is confined to this one slot */
+
+	return (txg_list_walk_init_common(wsp, slot, slot));
+}
+
+static int
+txg_list1_walk_init(mdb_walk_state_t *wsp)
+{
+	const int slot = 1;	/* the walk is confined to this one slot */
+
+	return (txg_list_walk_init_common(wsp, slot, slot));
+}
+
+static int
+txg_list2_walk_init(mdb_walk_state_t *wsp)
+{
+	const int slot = 2;	/* the walk is confined to this one slot */
+
+	return (txg_list_walk_init_common(wsp, slot, slot));
+}
+
+static int
+txg_list3_walk_init(mdb_walk_state_t *wsp)
+{
+	const int slot = 3;	/* the walk is confined to this one slot */
+
+	return (txg_list_walk_init_common(wsp, slot, slot));
+}
+
+static int
+txg_list_walk_step(mdb_walk_state_t *wsp)
+{
+	txg_list_walk_data_t *state = wsp->walk_data;
+	size_t objsize = state->lw_offset + sizeof (txg_node_t);
+	uintptr_t objaddr;
+	txg_node_t *link;
+	int rv;
+
+	/* When the current list runs out, advance to the next slot. */
+	for (;;) {
+		if (wsp->walk_addr != NULL)
+			break;
+		if (state->lw_txgoff >= state->lw_maxoff)
+			return (WALK_DONE);
+		state->lw_txgoff++;
+		wsp->walk_addr = state->lw_head[state->lw_txgoff];
+	}
+
+	/* walk_addr is the embedded node; back up to the object's start. */
+	objaddr = wsp->walk_addr - state->lw_offset;
+
+	if (mdb_vread(state->lw_obj, objsize, objaddr) == -1) {
+		mdb_warn("failed to read list element at %#lx", objaddr);
+		return (WALK_ERR);
+	}
+
+	rv = wsp->walk_callback(objaddr, state->lw_obj, wsp->walk_cbdata);
+
+	/* Follow the next pointer belonging to the slot being walked. */
+	link = (txg_node_t *)((uintptr_t)state->lw_obj + state->lw_offset);
+	wsp->walk_addr = (uintptr_t)link->tn_next[state->lw_txgoff];
+
+	return (rv);
+}
+
+/* ARGSUSED */
+static void
+txg_list_walk_fini(mdb_walk_state_t *wsp)
+{
+	/*
+	 * No explicit cleanup is done here; the walk data and element buffer
+	 * are allocated with UM_GC in txg_list_walk_init_common().
+	 */
+}
+
+/*
+ * ::walk spa
+ *
+ * Walk all named spa_t structures in the namespace.  This is nothing more than
+ * a layered avl walk.
+ */
+static int
+spa_walk_init(mdb_walk_state_t *wsp)
+{
+	GElf_Sym avl_sym;
+
+	/* A caller-supplied starting address is rejected. */
+	if (wsp->walk_addr != NULL) {
+		mdb_warn("spa walk only supports global walks\n");
+		return (WALK_ERR);
+	}
+
+	if (mdb_lookup_by_obj(ZFS_OBJ_NAME, "spa_namespace_avl",
+	    &avl_sym) == -1) {
+		mdb_warn("failed to find symbol 'spa_namespace_avl'");
+		return (WALK_ERR);
+	}
+
+	/* Layer on top of the "avl" walker, starting at the symbol. */
+	wsp->walk_addr = (uintptr_t)avl_sym.st_value;
+	if (mdb_layered_walk("avl", wsp) == -1) {
+		mdb_warn("failed to walk 'avl'\n");
+		return (WALK_ERR);
+	}
+
+	return (WALK_NEXT);
+}
+
+static int
+spa_walk_step(mdb_walk_state_t *wsp)
+{
+	uintptr_t spa_addr = wsp->walk_addr;
+	spa_t pool;
+	int rv;
+
+	/* Read a local copy of the spa_t to hand to the callback. */
+	if (mdb_vread(&pool, sizeof (pool), spa_addr) == -1) {
+		mdb_warn("failed to read spa_t at %p", spa_addr);
+		return (WALK_ERR);
+	}
+
+	rv = wsp->walk_callback(spa_addr, &pool, wsp->walk_cbdata);
+	return (rv);
+}
+
+/*
+ * MDB module linkage information:
+ *
+ * We declare a list of structures describing our dcmds, and a function
+ * named _mdb_init to return a pointer to our module information.
+ */
+
+/*
+ * Each entry is { name, usage string, description, implementing function };
+ * the table is terminated by an entry whose name is NULL.
+ */
+static const mdb_dcmd_t dcmds[] = {
+	{ "blkptr", ":", "print blkptr_t", blkptr },
+	{ "dbuf", ":", "print dmu_buf_impl_t", dbuf },
+	{ "dbuf_stats", ":", "dbuf stats", dbuf_stats },
+	{ "dbufs",
+	"\t[-O objset_t*] [-n objset_name | \"mos\"] [-o object | \"mdn\"] \n"
+	"\t[-l level] [-b blkid | \"bonus\"]",
+	"find dmu_buf_impl_t's that meet criterion", dbufs },
+	{ "abuf_find", "dva_word[0] dva_word[1]",
+	"find arc_buf_hdr_t of a specified DVA",
+	abuf_find },
+	{ "spa", "?[-cv]", "spa_t summary", spa_print },
+	{ "spa_config", ":", "print spa_t configuration", spa_print_config },
+	{ "spa_verify", ":", "verify spa_t consistency", spa_verify },
+	{ "spa_space", ":[-b]", "print spa_t on-disk space usage", spa_space },
+	/* spa_vdevs() accepts -e, so the usage string advertises it. */
+	{ "spa_vdevs", ":[-e]", "given a spa_t, print vdev summary",
+	spa_vdevs },
+	{ "vdev", ":[-qre]", "vdev_t summary", vdev_print },
+	{ "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline },
+	{ NULL }
+};
+
+/*
+ * Walkers exported by this module.  Each entry is { name, description,
+ * init function, step function, fini function }; the table is terminated
+ * by an entry whose name is NULL.  All txg_list* walkers share one step and
+ * one fini function and differ only in their init function.
+ */
+static const mdb_walker_t walkers[] = {
+	/*
+	 * In userland, there is no generic provider of list_t walkers, so we
+	 * need to add it.
+	 */
+#ifndef _KERNEL
+	{ LIST_WALK_NAME, LIST_WALK_DESC,
+		list_walk_init, list_walk_step, list_walk_fini },
+#endif
+	{ "dbufs", "walk cached ZFS dbufs",
+		dbuf_walk_init, dbuf_walk_step, dbuf_walk_fini },
+	{ "zms_freelist", "walk ZFS metaslab freelist",
+		freelist_walk_init, freelist_walk_step, freelist_walk_fini },
+	{ "txg_list", "given any txg_list_t *, walk all entries in all txgs",
+		txg_list_walk_init, txg_list_walk_step, txg_list_walk_fini },
+	{ "txg_list0", "given any txg_list_t *, walk all entries in txg 0",
+		txg_list0_walk_init, txg_list_walk_step, txg_list_walk_fini },
+	{ "txg_list1", "given any txg_list_t *, walk all entries in txg 1",
+		txg_list1_walk_init, txg_list_walk_step, txg_list_walk_fini },
+	{ "txg_list2", "given any txg_list_t *, walk all entries in txg 2",
+		txg_list2_walk_init, txg_list_walk_step, txg_list_walk_fini },
+	{ "txg_list3", "given any txg_list_t *, walk all entries in txg 3",
+		txg_list3_walk_init, txg_list_walk_step, txg_list_walk_fini },
+	/* The spa walker supplies no fini function. */
+	{ "spa", "walk all spa_t entries in the namespace",
+		spa_walk_init, spa_walk_step, NULL },
+	{ NULL }
+};
+
+/* Module description returned by _mdb_init(): API version, dcmds, walkers. */
+static const mdb_modinfo_t modinfo = {
+	MDB_API_VERSION, dcmds, walkers
+};
+
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+	/* Hand back the static description of this module. */
+	const mdb_modinfo_t *info = &modinfo;
+
+	return (info);
+}
diff --git a/usr/src/cmd/mdb/intel/amd64/genunix/Makefile b/usr/src/cmd/mdb/intel/amd64/genunix/Makefile
index 3e59587d49dd..25af0c5a026e 100644
--- a/usr/src/cmd/mdb/intel/amd64/genunix/Makefile
+++ b/usr/src/cmd/mdb/intel/amd64/genunix/Makefile
@@ -29,6 +29,7 @@ MODULE = genunix.so
 MDBTGT = kvm
 
 COMMONSRCS = \
+	avl.c \
 	bio.c \
 	contract.c \
 	cpupart.c \
diff --git a/usr/src/cmd/mdb/intel/amd64/libavl/Makefile b/usr/src/cmd/mdb/intel/amd64/libavl/Makefile
new file mode 100644
index 000000000000..4f3e9cf60ae9
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/libavl/Makefile
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libavl.so
+MDBTGT = proc
+
+MODSRCS = libavl.c \
+	  avl.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.amd64
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/intel/amd64/libzpool/Makefile b/usr/src/cmd/mdb/intel/amd64/libzpool/Makefile
new file mode 100644
index 000000000000..d2a5a8c2a654
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/libzpool/Makefile
@@ -0,0 +1,52 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libzpool.so
+MDBTGT = proc
+
+MODSRCS = zfs.c list.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.amd64
+include ../../../Makefile.module
+
+MODSRCS_DIR = ../../../common/modules/zfs
+GENUNIX_DIR = ../../../common/modules/genunix
+
+CPPFLAGS += -I../../../../../lib/libzpool/common \
+	-I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+dmod/%.o: $(GENUNIX_DIR)/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+dmod/%.ln: $(GENUNIX_DIR)/%.c
+	$(LINT.c) -c $<
diff --git a/usr/src/cmd/mdb/intel/amd64/zfs/Makefile b/usr/src/cmd/mdb/intel/amd64/zfs/Makefile
new file mode 100644
index 000000000000..972d59937e21
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/zfs/Makefile
@@ -0,0 +1,41 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = zfs.so
+MDBTGT = kvm
+
+MODSRCS = zfs.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.amd64
+include ../../../Makefile.module
+
+CPPFLAGS += -I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
diff --git a/usr/src/cmd/mdb/intel/ia32/genunix/Makefile b/usr/src/cmd/mdb/intel/ia32/genunix/Makefile
index 72c5f6caf6d3..eec888438144 100644
--- a/usr/src/cmd/mdb/intel/ia32/genunix/Makefile
+++ b/usr/src/cmd/mdb/intel/ia32/genunix/Makefile
@@ -29,6 +29,7 @@ MODULE = genunix.so
 MDBTGT = kvm
 
 COMMONSRCS = \
+	avl.c \
 	bio.c \
 	contract.c \
 	cpupart.c \
diff --git a/usr/src/cmd/mdb/intel/ia32/libavl/Makefile b/usr/src/cmd/mdb/intel/ia32/libavl/Makefile
new file mode 100644
index 000000000000..1f4cfbe07501
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/ia32/libavl/Makefile
@@ -0,0 +1,36 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libavl.so
+MDBTGT = proc
+
+MODSRCS = libavl.c \
+	  avl.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.ia32
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/intel/ia32/libzpool/Makefile b/usr/src/cmd/mdb/intel/ia32/libzpool/Makefile
new file mode 100644
index 000000000000..c8e8b4bb3477
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/ia32/libzpool/Makefile
@@ -0,0 +1,51 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libzpool.so
+MDBTGT = proc
+
+MODSRCS = zfs.c list.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.ia32
+include ../../../Makefile.module
+
+MODSRCS_DIR = ../../../common/modules/zfs
+GENUNIX_DIR = ../../../common/modules/genunix
+
+CPPFLAGS += -I../../../../../lib/libzpool/common \
+	-I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+dmod/%.o: $(GENUNIX_DIR)/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+dmod/%.ln: $(GENUNIX_DIR)/%.c
+	$(LINT.c) -c $<
diff --git a/usr/src/cmd/mdb/intel/ia32/zfs/Makefile b/usr/src/cmd/mdb/intel/ia32/zfs/Makefile
new file mode 100644
index 000000000000..a569d4fd9173
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/ia32/zfs/Makefile
@@ -0,0 +1,41 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+MODULE = zfs.so
+MDBTGT = kvm
+
+MODSRCS = zfs.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.ia32
+include ../../../Makefile.module
+
+CPPFLAGS += -I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
diff --git a/usr/src/cmd/mdb/sparc/kmdb/kmdb_kdi_isadep.c b/usr/src/cmd/mdb/sparc/kmdb/kmdb_kdi_isadep.c
index 325929a7f3a3..1d46481cfce2 100644
--- a/usr/src/cmd/mdb/sparc/kmdb/kmdb_kdi_isadep.c
+++ b/usr/src/cmd/mdb/sparc/kmdb/kmdb_kdi_isadep.c
@@ -70,7 +70,7 @@ kdi_xc_one(int cpuid, void (*cb)(void))
 
 /*ARGSUSED1*/
 static int
-kdi_init_cpus_cb(dnode_t node, void *arg, void *result)
+kdi_init_cpus_cb(pnode_t node, void *arg, void *result)
 {
 	/*
 	 * Sun4v dosen't support virtual address cache
diff --git a/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.c b/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.c
index 1b9b4dfcd542..c99878bf31f5 100644
--- a/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.c
+++ b/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.c
@@ -69,7 +69,7 @@ kmdb_prom_get_handle(char *name)
 char *
 kmdb_prom_get_options_prop(kmdb_auxv_t *kav, char *propname)
 {
-	dnode_t node;
+	pnode_t node;
 	ssize_t len;
 	char *val;
 
@@ -97,18 +97,18 @@ kmdb_prom_free_options_prop(char *val)
 }
 
 int
-kmdb_prom_getprop(dnode_t node, char *name, caddr_t value)
+kmdb_prom_getprop(pnode_t node, char *name, caddr_t value)
 {
 	return (prom_getprop(node, name, value));
 }
 
 typedef struct walk_cpu_data {
-	int (*wcd_cb)(dnode_t, void *, void *);
+	int (*wcd_cb)(pnode_t, void *, void *);
 	void *wcd_arg;
 } walk_cpu_data_t;
 
 static int
-walk_cpus_cb(dnode_t node, void *arg, void *result)
+walk_cpus_cb(pnode_t node, void *arg, void *result)
 {
 	walk_cpu_data_t *wcd = arg;
 
@@ -139,7 +139,7 @@ walk_cpus_cb(dnode_t node, void *arg, void *result)
 }
 
 void
-kmdb_prom_walk_cpus(int (*cb)(dnode_t, void *, void *), void *arg, void *result)
+kmdb_prom_walk_cpus(int (*cb)(pnode_t, void *, void *), void *arg, void *result)
 {
 	walk_cpu_data_t wcd;
 
diff --git a/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.h b/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.h
index 7b5f0a8ee29f..f2d160aefe4b 100644
--- a/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.h
+++ b/usr/src/cmd/mdb/sparc/kmdb/kmdb_promif_isadep.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,12 +39,12 @@
 extern "C" {
 #endif
 
-extern void kmdb_prom_walk_cpus(int (*)(dnode_t, void *, void *),
+extern void kmdb_prom_walk_cpus(int (*)(pnode_t, void *, void *),
     void *, void *);
 extern void kmdb_prom_enter_mon(void);
 extern void kmdb_prom_exit_to_mon(void);
 extern void kmdb_prom_interpret(const char *);
-extern int kmdb_prom_getprop(dnode_t, char *, caddr_t);
+extern int kmdb_prom_getprop(pnode_t, char *, caddr_t);
 
 /* private to promif */
 extern int kmdb_prom_translate_virt(uintptr_t, physaddr_t *);
diff --git a/usr/src/cmd/mdb/sparc/v7/libavl/Makefile b/usr/src/cmd/mdb/sparc/v7/libavl/Makefile
new file mode 100644
index 000000000000..a109e5fb66e4
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v7/libavl/Makefile
@@ -0,0 +1,36 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libavl.so
+MDBTGT = proc
+
+MODSRCS = libavl.c \
+	  avl.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.sparcv7
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/sparc/v7/libzpool/Makefile b/usr/src/cmd/mdb/sparc/v7/libzpool/Makefile
new file mode 100644
index 000000000000..501b3ef347a9
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v7/libzpool/Makefile
@@ -0,0 +1,51 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libzpool.so
+MDBTGT = proc
+
+MODSRCS = zfs.c list.c
+
+include ../../../../Makefile.cmd
+include ../../Makefile.sparcv7
+include ../../../Makefile.module
+
+MODSRCS_DIR = ../../../common/modules/zfs
+GENUNIX_DIR = ../../../common/modules/genunix
+
+CPPFLAGS += -I../../../../../lib/libzpool/common \
+	-I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+dmod/%.o: $(GENUNIX_DIR)/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+dmod/%.ln: $(GENUNIX_DIR)/%.c
+	$(LINT.c) -c $<
diff --git a/usr/src/cmd/mdb/sparc/v9/genunix/Makefile b/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
index 7cfa4c1b1f03..460d76057e84 100644
--- a/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
+++ b/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
@@ -29,6 +29,7 @@ MODULE = genunix.so
 MDBTGT = kvm
 
 COMMONSRCS = \
+	avl.c \
 	bio.c \
 	contract.c \
 	cpupart.c \
diff --git a/usr/src/cmd/mdb/sparc/v9/libavl/Makefile b/usr/src/cmd/mdb/sparc/v9/libavl/Makefile
new file mode 100644
index 000000000000..7785db811054
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v9/libavl/Makefile
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libavl.so
+MDBTGT = proc
+
+MODSRCS = libavl.c \
+	  avl.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.sparcv9
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/sparc/v9/libzpool/Makefile b/usr/src/cmd/mdb/sparc/v9/libzpool/Makefile
new file mode 100644
index 000000000000..ddcd3d89bcb4
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v9/libzpool/Makefile
@@ -0,0 +1,52 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = libzpool.so
+MDBTGT = proc
+
+MODSRCS = zfs.c list.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.sparcv9
+include ../../../Makefile.module
+
+MODSRCS_DIR = ../../../common/modules/zfs
+GENUNIX_DIR = ../../../common/modules/genunix
+
+CPPFLAGS += -I../../../../../lib/libzpool/common \
+	-I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+dmod/%.o: $(GENUNIX_DIR)/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+dmod/%.ln: $(GENUNIX_DIR)/%.c
+	$(LINT.c) -c $<
diff --git a/usr/src/cmd/mdb/sparc/v9/zfs/Makefile b/usr/src/cmd/mdb/sparc/v9/zfs/Makefile
new file mode 100644
index 000000000000..fd6c40877466
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v9/zfs/Makefile
@@ -0,0 +1,41 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+MODULE = zfs.so
+MDBTGT = kvm
+
+MODSRCS = zfs.c
+
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.sparcv9
+include ../../../Makefile.module
+
+CPPFLAGS += -I../../../../../uts/common/fs/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
diff --git a/usr/src/cmd/mv/Makefile b/usr/src/cmd/mv/Makefile
index 763498f6d500..0225d52219f7 100644
--- a/usr/src/cmd/mv/Makefile
+++ b/usr/src/cmd/mv/Makefile
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -50,9 +50,9 @@ XGETFLAGS += -a -x mv.xcl
 CPPFLAGS += -D_FILE_OFFSET_BITS=64
 LINTFLAGS += -DXPG4
 
-lint :=	LDLIBS += -lcmdutils -lavl
-$(PROG) := LDLIBS += $(ZLAZYLOAD) -lcmdutils -lavl $(ZNOLAZYLOAD)
-$(XPG4) := LDLIBS += $(ZLAZYLOAD) -lcmdutils -lavl $(ZNOLAZYLOAD)
+lint :=	LDLIBS += -lcmdutils -lavl -lsec
+$(PROG) := LDLIBS += $(ZLAZYLOAD) -lcmdutils -lavl -lsec $(ZNOLAZYLOAD)
+$(XPG4) := LDLIBS += $(ZLAZYLOAD) -lcmdutils -lavl -lsec $(ZNOLAZYLOAD)
 
 .KEEP_STATE:
 
diff --git a/usr/src/cmd/mv/mv.c b/usr/src/cmd/mv/mv.c
index 507ff30d65c6..3cdceeafb4c2 100644
--- a/usr/src/cmd/mv/mv.c
+++ b/usr/src/cmd/mv/mv.c
@@ -64,6 +64,7 @@
 #include <limits.h>
 #include <sys/acl.h>
 #include <libcmdutils.h>
+#include <aclutils.h>
 
 #define	FTYPE(A)	(A.st_mode)
 #define	FMODE(A)	(A.st_mode)
@@ -138,11 +139,9 @@ static int		attrsilent = 0;
 static int		targetexists = 0;
 static char		yeschr[SCHAR_MAX + 2];
 static char		nochr[SCHAR_MAX + 2];
-static int		s1aclcnt;
-static aclent_t		*s1aclp = NULL;
 static int		cmdarg;		/* command line argument */
 static avl_tree_t	*stree = NULL;	/* source file inode search tree */
-
+static acl_t		*s1acl;
 
 int
 main(int argc, char *argv[])
@@ -803,9 +802,9 @@ cpymve(char *source, char *target)
 
 				if (pflg || mve) {
 					(void) chmod(target, FMODE(s1));
-					if (s1aclp != NULL) {
-						if ((acl(target, SETACL,
-						    s1aclcnt, s1aclp)) < 0) {
+					if (s1acl != NULL) {
+						if ((acl_set(target,
+						    s1acl)) < 0) {
 							if (pflg || mve) {
 								(void) fprintf(
 								    stderr,
@@ -1065,6 +1064,7 @@ chkfiles(char *source, char **to)
 	int	(*statf)() = (cpy &&
 		    !(Pflg || (Hflg && !cmdarg))) ? stat : lstat;
 	char    *target = *to;
+	int	error;
 
 	/*
 	 * Make sure source file exists.
@@ -1088,27 +1088,16 @@ chkfiles(char *source, char **to)
 	 * Get ACL info: don't bother with ln or mv'ing symlinks
 	 */
 	if ((!lnk) && !(mve && ISLNK(s1))) {
-		if (s1aclp != NULL) {
-			free(s1aclp);
-			s1aclp = NULL;
+		if (s1acl != NULL) {
+			acl_free(s1acl);
+			s1acl = NULL;
 		}
-		if ((s1aclcnt = acl(source, GETACLCNT, 0, NULL)) < 0) {
+		if ((error = acl_get(source, ACL_NO_TRIVIAL, &s1acl)) != 0) {
 			(void) fprintf(stderr,
-			    "%s: failed to get acl entries\n", source);
+			    "%s: failed to get acl entries: %s\n", source,
+			    acl_strerror(error));
 			return (1);
 		}
-		if (s1aclcnt > MIN_ACL_ENTRIES) {
-			if ((s1aclp = (aclent_t *)malloc(
-				sizeof (aclent_t) * s1aclcnt)) == NULL) {
-				(void) fprintf(stderr, "Insufficient memory\n");
-				return (1);
-			}
-			if ((acl(source, GETACL, s1aclcnt, s1aclp)) < 0) {
-				(void) fprintf(stderr,
-				    "%s: failed to get acl entries\n", source);
-				return (1);
-			}
-		}
 		/* else: just permission bits */
 	}
 
@@ -1563,8 +1552,9 @@ copydir(char *source, char *target)
 	int pret = 0;		/* need separate flag if -p is specified */
 	mode_t	fixmode = (mode_t)0;	/* cleanup mode after copy */
 	struct stat s1save;
-	int s1aclcnt_save;
-	aclent_t *s1aclp_save = NULL;
+	acl_t  *s1acl_save;
+
+	s1acl_save = NULL;
 
 	if (cpy && !rflg) {
 		(void) fprintf(stderr,
@@ -1597,12 +1587,15 @@ copydir(char *source, char *target)
 		 * s1 gets overwritten when doing the recursive copy.
 		 */
 		s1save = s1;
-		if (s1aclp != NULL) {
-			if ((s1aclp_save = (aclent_t *)malloc(sizeof (aclent_t)
-			    * s1aclcnt)) != NULL) {
-				(void) memcpy(s1aclp_save, s1aclp,
-				    sizeof (aclent_t) * s1aclcnt);
-				s1aclcnt_save = s1aclcnt;
+		if (s1acl != NULL) {
+			s1acl_save = acl_dup(s1acl);
+			if (s1acl_save == NULL) {
+				(void) fprintf(stderr, gettext("%s: "
+				    "Insufficient memory to save acl"
+				    " entry\n"), cmd);
+				if (pflg)
+					return (1);
+
 			}
 #ifdef XPG4
 			else {
@@ -1627,9 +1620,8 @@ copydir(char *source, char *target)
 	 * ACL for directory
 	 */
 	if (pflg || mve) {
-		if (s1aclp_save != NULL) {
-			if ((acl(target, SETACL, s1aclcnt_save, s1aclp_save))
-			    < 0) {
+		if (s1acl_save != NULL) {
+			if (acl_set(target, s1acl_save) < 0) {
 #ifdef XPG4
 				if (pflg || mve) {
 #else
@@ -1639,13 +1631,15 @@ copydir(char *source, char *target)
 					    "%s: failed to set acl entries "
 					    "on %s\n"), cmd, target);
 					if (pflg) {
-						free(s1aclp_save);
+						acl_free(s1acl_save);
+						s1acl_save = NULL;
 						ret++;
 					}
 				}
 				/* else: silent and continue */
 			}
-			free(s1aclp_save);
+			acl_free(s1acl_save);
+			s1acl_save = NULL;
 		}
 		if ((pret = chg_mode(target, UID(s1save), GID(s1save),
 		    FMODE(s1save))) == 0)
@@ -1705,7 +1699,6 @@ use_stdin(void)
 static int
 copyattributes(char *source, char *target)
 {
-	int ret;
 	int sourcedirfd, targetdirfd;
 	int srcfd, targfd;
 	int tmpfd;
@@ -1716,12 +1709,11 @@ copyattributes(char *source, char *target)
 	char *srcbuf, *targbuf;
 	size_t src_size, targ_size;
 	int error = 0;
+	int aclerror;
 	mode_t mode;
 	int clearflg = 0;
-	int	aclcnt;
-	int	attrdiraclcnt;
-	aclent_t *aclp = NULL;
-	aclent_t *attrdiraclp = NULL;
+	acl_t *xacl = NULL;
+	acl_t *attrdiracl = NULL;
 	struct stat attrdir, s3, s4;
 	struct timeval times[2];
 	mode_t	targmode;
@@ -1918,58 +1910,30 @@ copyattributes(char *source, char *target)
 		 * Now set owner and group of attribute directory, implies
 		 * changing the ACL of the hidden attribute directory first.
 		 */
-		if ((attrdiraclcnt = facl(sourcedirfd,
-		    GETACLCNT, 0, NULL)) < 0) {
+		if ((aclerror = facl_get(sourcedirfd,
+		    ACL_NO_TRIVIAL, &attrdiracl)) != 0) {
 			if (!attrsilent) {
 				(void) fprintf(stderr, gettext(
 				    "%s: failed to get acl entries of"
 				    " attribute directory for"
-				    " %s\n"), cmd, source);
+				    " %s : %s\n"), cmd,
+				    source, acl_strerror(aclerror));
 				++error;
 			}
 		}
-		if (attrdiraclcnt > MIN_ACL_ENTRIES) {
-			if ((attrdiraclp = (aclent_t *)malloc(
-				sizeof (aclent_t) * attrdiraclcnt)) == NULL) {
+
+		if (attrdiracl) {
+			if (facl_set(targetdirfd, attrdiracl) != 0) {
 				if (!attrsilent) {
 					(void) fprintf(stderr, gettext(
-						"insufficient memory"
-						" for acl\n"));
+					"%s: failed to set acl entries"
+					" on attribute directory "
+					"for %s\n"), cmd, target);
 					++error;
 				}
-			} else {
-				if ((ret = facl(sourcedirfd, GETACL,
-					attrdiraclcnt, attrdiraclp)) == -1) {
-					if (!attrsilent) {
-						(void) fprintf(stderr,
-						    gettext(
-						    "%s: failed to get acl"
-						    " entries of attribute"
-						    " directory for"
-						    " %s\n"), cmd, target);
-						free(attrdiraclp);
-						attrdiraclp = NULL;
-						attrdiraclcnt = 0;
-						++error;
-					}
-
-				}
-				if (ret != -1 && (facl(targetdirfd, SETACL,
-				    attrdiraclcnt,
-				    attrdiraclp) != 0)) {
-					if (!attrsilent) {
-						(void) fprintf(stderr, gettext(
-						"%s: failed to set acl entries"
-						" on attribute directory "
-						"for %s\n"), cmd, target);
-						++error;
-					}
-					free(attrdiraclp);
-					attrdiraclp = NULL;
-					attrdiraclcnt = 0;
-				}
+				acl_free(attrdiracl);
+				attrdiracl = NULL;
 			}
-
 		}
 	}
 
@@ -2040,52 +2004,17 @@ copyattributes(char *source, char *target)
 		}
 
 		if (pflg || mve) {
-			if ((aclcnt = facl(srcattrfd,
-			    GETACLCNT, 0, NULL)) < 0) {
+			if ((aclerror = facl_get(srcattrfd,
+			    ACL_NO_TRIVIAL, &xacl)) != 0) {
 				if (!attrsilent) {
 					(void) fprintf(stderr, gettext(
 					    "%s: failed to get acl entries of"
 					    " attribute %s for"
-					    " %s: "), cmd, dp->d_name, source);
-					perror("");
+					    " %s: %s\n"), cmd, dp->d_name,
+					    source, acl_strerror(aclerror));
 					++error;
 				}
 			}
-			if (aclcnt > MIN_ACL_ENTRIES) {
-				if ((aclp = (aclent_t *)malloc(
-					sizeof (aclent_t) * aclcnt)) ==
-						NULL) {
-					if (!attrsilent) {
-						(void) fprintf(stderr, gettext(
-							"insufficient memory"
-							" for acl: "));
-						perror("");
-						++error;
-					}
-				} else {
-
-					if ((facl(srcattrfd, GETACL,
-						aclcnt, aclp)) < 0) {
-						if (!attrsilent) {
-							(void) fprintf(stderr,
-							    gettext(
-							    "%s: failed to get"
-							    " acl entries of"
-							    " attribute %s for"
-							    /*CSTYLED*/
-							    " %s: "), cmd,
-							    dp->d_name, target);
-							free(aclp);
-							aclp = NULL;
-							perror("");
-							++error;
-						}
-
-					}
-
-				}
-			}
-
 		}
 
 		(void) unlinkat(targetdirfd, dp->d_name, 0);
@@ -2105,8 +2034,8 @@ copyattributes(char *source, char *target)
 		/*
 		 * preserve ACL
 		 */
-		if ((pflg || mve) && aclp != NULL) {
-			if ((facl(targattrfd, SETACL, aclcnt, aclp)) < 0) {
+		if ((pflg || mve) && xacl != NULL) {
+			if ((facl_set(targattrfd, xacl)) < 0) {
 				if (!attrsilent) {
 					(void) fprintf(stderr, gettext(
 					    "%s: failed to set acl entries on"
@@ -2114,9 +2043,8 @@ copyattributes(char *source, char *target)
 					    "%s\n"), cmd, dp->d_name, target);
 					++error;
 				}
-				free(aclp);
-				aclp = NULL;
-				aclcnt = 0;
+				acl_free(xacl);
+				xacl = NULL;
 			}
 		}
 
@@ -2231,11 +2159,10 @@ copyattributes(char *source, char *target)
 			}
 		}
 next:
-		if (aclp != NULL) {
-			free(aclp);
-			aclp = NULL;
+		if (xacl != NULL) {
+			acl_free(xacl);
+			xacl = NULL;
 		}
-		aclcnt = 0;
 		if (srcbuf != NULL)
 			free(srcbuf);
 		if (targbuf != NULL)
@@ -2248,10 +2175,14 @@ copyattributes(char *source, char *target)
 		srcbuf = targbuf = NULL;
 	}
 out:
-	if (aclp != NULL)
-		free(aclp);
-	if (attrdiraclp != NULL)
-		free(attrdiraclp);
+	if (xacl != NULL) {
+		acl_free(xacl);
+		xacl = NULL;
+	}
+	if (attrdiracl != NULL) {
+		acl_free(attrdiracl);
+		attrdiracl = NULL;
+	}
 	if (srcbuf)
 		free(srcbuf);
 	if (targbuf)
diff --git a/usr/src/cmd/pack/Makefile b/usr/src/cmd/pack/Makefile
index 938e9e9f16ad..49dd679511a5 100644
--- a/usr/src/cmd/pack/Makefile
+++ b/usr/src/cmd/pack/Makefile
@@ -22,7 +22,7 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -32,6 +32,7 @@ include ../Makefile.cmd
 
 CFLAGS += $(CCVERBOSE)
 XGETFLAGS += -a -x $(PROG).xcl
+LDLIBS += -lsec
 
 .KEEP_STATE:
 
diff --git a/usr/src/cmd/pack/pack.c b/usr/src/cmd/pack/pack.c
index fd67a70a3cfc..c8aad347b3f2 100644
--- a/usr/src/cmd/pack/pack.c
+++ b/usr/src/cmd/pack/pack.c
@@ -51,6 +51,8 @@
 #include <string.h>
 #include <dirent.h>
 #include <unistd.h>
+#include <sys/acl.h>
+#include <aclutils.h>
 
 #undef lint
 
@@ -350,7 +352,9 @@ main(int argc, char *argv[])
 	register char *cp;
 	int k, sep, errflg = 0;
 	int c;
+	int error;
 	int fcount = 0; /* count failures */
+	acl_t *aclp = NULL;
 
 	(void) setlocale(LC_ALL, "");
 #if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
@@ -448,6 +452,7 @@ main(int argc, char *argv[])
 				"pack: %s: already exists\n"), filename);
 			goto closein;
 		}
+
 		if ((outfile = creat(filename, status.st_mode)) < 0) {
 			fprintf(stderr, gettext(
 				"pack: %s: cannot create: "), filename);
@@ -455,6 +460,13 @@ main(int argc, char *argv[])
 			goto closein;
 		}
 
+		error = facl_get(infile, ACL_NO_TRIVIAL, &aclp);
+
+		if (error != 0) {
+			fprintf(stderr, gettext(
+			    "pack: %s: cannot retrieve ACL: %s\n"), argv[k],
+			    acl_strerror(error));
+		}
 		if (packfile(argv[k]) &&
 		    ((pathconf(argv[k], _PC_XATTR_EXISTS) != 1) ||
 				(mv_xattrs(infile, outfile,
@@ -509,6 +521,12 @@ main(int argc, char *argv[])
 				perror("");
 			}
 			chown(filename, status.st_uid, status.st_gid);
+			if (aclp && (facl_set(outfile, aclp) < 0)) {
+				fprintf(stderr, gettext(
+				    "pack: %s: failed to set acl entries\n"),
+				    filename);
+				perror("");
+			}
 			if (!errflg)
 				fcount--;  /* success after all */
 		} else {
@@ -517,6 +535,10 @@ main(int argc, char *argv[])
 			}
 			unlink(filename);
 		}
+
+		if (aclp != NULL)
+			acl_free(aclp);
+		aclp = NULL;	/* no stale ACL for next file */
 closein:	close(outfile);
 		close(infile);
 	}
diff --git a/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/picllibdevinfo.c b/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/picllibdevinfo.c
index 265993d2c6ad..091e72383e08 100644
--- a/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/picllibdevinfo.c
+++ b/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/picllibdevinfo.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -73,10 +73,10 @@ typedef struct {
 	int		n_serial;
 	int		n_parallel;
 	int		n_network;
-} list_t;
+} plist_t;
 
 static void
-free_list(list_t *listptr)
+free_list(plist_t *listptr)
 {
 	port_info_t	*tmp;
 	port_info_t	*nextptr;
@@ -110,7 +110,7 @@ compare(const void *a, const void *b)
  * assigns GeoAddr property for ports based on bus-addr
  */
 static picl_errno_t
-assign_geo_addr(list_t *list, frutree_port_type_t type)
+assign_geo_addr(plist_t *list, frutree_port_type_t type)
 {
 
 	int i = 0;
@@ -181,7 +181,7 @@ assign_geo_addr(list_t *list, frutree_port_type_t type)
 }
 
 static picl_errno_t
-create_port_config_info(list_t *list, frutree_device_args_t *devp)
+create_port_config_info(plist_t *list, frutree_device_args_t *devp)
 {
 	port_info_t *port_info = NULL;
 	frutree_cache_t	*cachep = NULL;
@@ -316,7 +316,7 @@ probe_tree(di_node_t node, void *arg)
 	char *devfs_path = NULL;
 	char *bus_addr = NULL;
 	char *drv_name = NULL;
-	list_t *listptr = NULL;
+	plist_t *listptr = NULL;
 	port_info_t *port_info = NULL;
 	frutree_port_type_t port_type = UNKNOWN_PORT;
 	di_minor_t minor = DI_MINOR_NIL;
@@ -324,7 +324,7 @@ probe_tree(di_node_t node, void *arg)
 	if (arg == NULL) {
 		return (DI_WALK_TERMINATE);
 	}
-	listptr = (list_t *)arg;
+	listptr = (plist_t *)arg;
 
 	while ((minor = di_minor_next(node, minor)) != DI_MINOR_NIL) {
 		nodetype = di_minor_nodetype(minor);
@@ -412,7 +412,7 @@ probe_libdevinfo(frutree_frunode_t *frup, frutree_device_args_t ** device,
 {
 	di_node_t	rnode;
 	picl_errno_t	rc;
-	list_t	list;
+	plist_t	list;
 
 	if (frup == NULL) {
 		return (PICL_FAILURE);
diff --git a/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/piclscsi.c b/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/piclscsi.c
index 9ba70fff8c90..25280a5644a9 100644
--- a/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/piclscsi.c
+++ b/usr/src/cmd/picl/plugins/sun4u/snowbird/frutree/piclscsi.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -101,23 +101,23 @@ typedef struct node {
 typedef struct linked_list {
 	node_t *first;
 	int num_nodes;
-} list_t;
+} plist_t;
 
 typedef struct scsi_info {
 	frutree_frunode_t *frup;
 	cfga_list_data_t *cfgalist;
-	list_t *list;
+	plist_t *list;
 	int num_list;
 	boolean_t compare_cfgadm;
 	int geo_addr;
 } scsi_info_t;
 
-static list_t *scsi_list = NULL;
+static plist_t *scsi_list = NULL;
 static cfga_list_data_t *cfglist = NULL;
 static int nlist = 0;
 
 static void
-free_list(list_t *list)
+free_list(plist_t *list)
 {
 	node_t	*tmp = NULL, *tmp1 = NULL;
 
@@ -136,7 +136,7 @@ free_list(list_t *list)
  * This routine gets the list of scsi controllers present
  */
 static cfga_err_t
-populate_controllers_list(list_t *cntrl_list, cfga_list_data_t *list, int num)
+populate_controllers_list(plist_t *cntrl_list, cfga_list_data_t *list, int num)
 {
 	int i;
 	node_t *nodeptr = NULL;
@@ -202,7 +202,7 @@ scsi_info_init()
 		}
 	}
 
-	scsi_list = (list_t *)malloc(sizeof (list_t));
+	scsi_list = (plist_t *)malloc(sizeof (plist_t));
 	if (scsi_list == NULL) {
 		free(cfglist);
 		return (PICL_NOSPACE);
@@ -231,7 +231,7 @@ scsi_info_fini()
  * caller should allocate memory for ap_id
  */
 static picl_errno_t
-find_scsi_controller(char *devfs_path, list_t *list, char *ap_id)
+find_scsi_controller(char *devfs_path, plist_t *list, char *ap_id)
 {
 	node_t	*tmp = NULL;
 	char *lasts = NULL;
@@ -274,7 +274,7 @@ get_scsislot_name(char *devfs_path, char *bus_addr, char *name)
 	picl_errno_t	rc;
 	int target_id = 0;
 	int numlist;
-	list_t			list;
+	plist_t			list;
 	cfga_err_t		ap_list_err;
 	cfga_list_data_t 	*cfgalist = NULL;
 	char controller[MAXPATHLEN];
@@ -410,7 +410,7 @@ get_bus_addr(char *scsi_loc, char **bus_addr)
  */
 static picl_errno_t
 dyn_probe_for_scsi_frus(frutree_frunode_t *frup, cfga_list_data_t *cfgalist,
-	list_t *list, int numlist)
+	plist_t *list, int numlist)
 {
 	picl_errno_t rc;
 	int i, geo_addr = 0;
@@ -797,7 +797,7 @@ probe_disks(di_node_t node, void *arg)
 
 static picl_errno_t
 probe_scsi_in_libdevinfo(frutree_frunode_t *frup, cfga_list_data_t *cfgalist,
-	list_t *list, int num_list, boolean_t compare_cfgadm)
+	plist_t *list, int num_list, boolean_t compare_cfgadm)
 {
 	di_node_t	rnode;
 	scsi_info_t	*scsi_data = NULL;
@@ -840,7 +840,7 @@ probe_for_scsi_frus(frutree_frunode_t *frup)
 {
 	int numlist;
 	picl_errno_t rc;
-	list_t list;
+	plist_t list;
 	cfga_err_t ap_list_err;
 	cfga_list_data_t *cfgalist = NULL;
 
diff --git a/usr/src/cmd/pt_chmod/Makefile b/usr/src/cmd/pt_chmod/Makefile
index c63f74d2fb78..39666fc6285a 100644
--- a/usr/src/cmd/pt_chmod/Makefile
+++ b/usr/src/cmd/pt_chmod/Makefile
@@ -22,7 +22,7 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 1989-2003 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -32,7 +32,7 @@ include ../Makefile.cmd
 
 FILEMODE= 04511
 
-LDLIBS += -ldevinfo
+LDLIBS += -ldevinfo -lsec
 
 .KEEP_STATE:
 
diff --git a/usr/src/cmd/pt_chmod/pt_chmod.c b/usr/src/cmd/pt_chmod/pt_chmod.c
index 4d09efd5a040..55aeb33d598f 100644
--- a/usr/src/cmd/pt_chmod/pt_chmod.c
+++ b/usr/src/cmd/pt_chmod/pt_chmod.c
@@ -92,23 +92,8 @@ main(int argc, char **argv)
 	} while (fdetach(tty) == 0);
 
 	/* Remove ACLs */
-	if (acl(tty, GETACLCNT, 0, NULL) > MIN_ACL_ENTRIES) {
-		aclent_t acls[3];
 
-		acls[0].a_type = USER_OBJ;
-		acls[0].a_id = 0;
-		acls[0].a_perm = 6;
-
-		acls[1].a_type = GROUP_OBJ;
-		acls[1].a_id = gid;
-		acls[1].a_perm = 2;
-
-		acls[2].a_type = OTHER_OBJ;
-		acls[2].a_id = 0;
-		acls[2].a_perm = 0;
-
-		(void) acl(tty, SETACL, 3, acls);
-	}
+	(void) acl_strip(tty, 0, gid, 0620);
 
 	if (chown(tty, getuid(), gid))
 		return (1);
diff --git a/usr/src/cmd/setfacl/setfacl.c b/usr/src/cmd/setfacl/setfacl.c
index 0f97da84c3a5..c5e14dfd4a1a 100644
--- a/usr/src/cmd/setfacl/setfacl.c
+++ b/usr/src/cmd/setfacl/setfacl.c
@@ -30,6 +30,11 @@
 static char sccsid[] = "@(#)setfacl.c	1.10	05/06/16 SMI";
 #endif
 
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
 /*
  * setfacl [-r] -f aclfile file ...
  * setfacl [-r] -d acl_entries file ...
@@ -48,6 +53,7 @@ static char sccsid[] = "@(#)setfacl.c	1.10	05/06/16 SMI";
 #include <sys/acl.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <errno.h>
 
 
 #define	ADD	1
@@ -202,6 +208,14 @@ get_acl_info(char *filep, aclent_t **aclpp)
 	int	aclcnt;
 
 	if ((aclcnt = acl(filep, GETACLCNT, 0, NULL)) < 0) {
+		if (errno == ENOSYS) {
+			(void) fprintf(stderr,
+			    gettext("file system doesn't support aclent_t "
+			    "style ACL's.\n"
+			    "See acl(5) for more information on"
+			    " ACL styles supported by Solaris.\n"));
+			return (-1);
+		}
 		(void) fprintf(stderr,
 		    gettext("%s: failed to get acl count\n"), filep);
 		perror("get acl count error");
diff --git a/usr/src/cmd/sum/sum.c b/usr/src/cmd/sum/sum.c
index f303d627b7c4..c6df811930d0 100644
--- a/usr/src/cmd/sum/sum.c
+++ b/usr/src/cmd/sum/sum.c
@@ -40,6 +40,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <sys/types.h>
+#include <errno.h>
+#include <string.h>
 
 static void usage(void);
 
@@ -115,9 +117,9 @@ main(int argc, char **argv)
 		}
 		if (ferror(f)) {
 			errflg++;
-			(void) fprintf(stderr,
-			gettext("sum: read error on %s\n"),
-			    (argc > 0) ? argv[i] : "-");
+			(void) fprintf(stderr, gettext("sum: read error "
+			    "on '%s': %s\n"), (argc > 0) ? argv[i] : "-",
+			    strerror(errno));
 		}
 		if (alg == 1)
 			(void) printf("%.5u %6lld", sum,
diff --git a/usr/src/cmd/svc/milestone/devices-local b/usr/src/cmd/svc/milestone/devices-local
index e64659447565..d58518c56c86 100644
--- a/usr/src/cmd/svc/milestone/devices-local
+++ b/usr/src/cmd/svc/milestone/devices-local
@@ -21,7 +21,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
@@ -30,8 +30,6 @@
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
 
-# GLXXX - The SysV copyright should be unnecessary now?
-
 # Initiate the device reconfiguration process in case we need some
 # device links established so that we can successfully perform our
 # remaining standard mounts.
@@ -40,6 +38,8 @@ if [ `/sbin/zonename` != "global" ]; then
 	exit 0
 fi
 
+. /lib/svc/share/smf_include.sh
+
 svcprop -q -p system/reconfigure system/svc/restarter:default
 if [ $? -eq 0 ]; then
 	echo 'Configuring devices.' > /dev/msglog 2>&1
@@ -74,4 +74,9 @@ if [ $? -eq 0 ]; then
 	fi
 fi
 
+# Create any zvol devices
+if [ -x /usr/sbin/zfs ]; then
+	/usr/sbin/zfs volinit || exit $SMF_EXIT_ERR_FATAL
+fi
+
 exit 0
diff --git a/usr/src/cmd/svc/milestone/fs-local b/usr/src/cmd/svc/milestone/fs-local
index 8199f0f1beea..34c771e39dae 100644
--- a/usr/src/cmd/svc/milestone/fs-local
+++ b/usr/src/cmd/svc/milestone/fs-local
@@ -28,6 +28,8 @@
 
 . /lib/svc/share/smf_include.sh
 
+result=$SMF_EXIT_OK
+
 # Mount all local filesystems.
 
 cd /; /sbin/mountall -l >/dev/msglog
@@ -36,7 +38,7 @@ if [ $rc -ne 0 ]; then
 	msg="WARNING: /sbin/mountall -l failed: exit status $rc"
 	echo $msg
 	echo "$SMF_FMRI:" $msg >/dev/msglog
-	exit $SMF_EXIT_ERR_FATAL
+	result=$SMF_EXIT_ERR_FATAL
 fi
 
 # get rid of transient reboot entry in GRUB menu
@@ -82,4 +84,17 @@ if [ -n "$vlist" ]; then
 	fi
 fi
 
-exit $SMF_EXIT_OK
+# Mount all ZFS filesystems.
+
+if [ -x /usr/sbin/zfs ]; then
+	/usr/sbin/zfs mount -a >/dev/msglog 2>&1
+	rc=$?
+	if [ $rc -ne 0 ]; then
+		msg="WARNING: /usr/sbin/zfs mount -a failed: exit status $rc"
+		echo $msg
+		echo "$SMF_FMRI:" $msg >/dev/msglog
+		result=$SMF_EXIT_ERR_FATAL
+	fi
+fi
+
+exit $result
diff --git a/usr/src/cmd/tar/tar.c b/usr/src/cmd/tar/tar.c
index 9a005ce4b7e8..66e60e225f0a 100644
--- a/usr/src/cmd/tar/tar.c
+++ b/usr/src/cmd/tar/tar.c
@@ -69,6 +69,7 @@
 #include <limits.h>
 #include <iconv.h>
 #include <assert.h>
+#include <aclutils.h>
 #if defined(__SunOS_5_6) || defined(__SunOS_5_7)
 extern int defcntl();
 #endif
@@ -381,7 +382,7 @@ struct	file_list	{
 static	struct	file_list	*exclude_tbl[TABLE_SIZE],
 				*include_tbl[TABLE_SIZE];
 
-static int	append_secattr(char **, int *, int, aclent_t *, char);
+static int	append_secattr(char **, int *, acl_t *);
 static void	write_ancillary(union hblock *, char *, int, char);
 
 static void add_file_to_table(struct file_list *table[], char *str);
@@ -493,6 +494,7 @@ static char *get_component(char *path);
 static int retry_attrdir_open(char *name);
 static char *skipslashes(char *string, char *start);
 static void chop_endslashes(char *path);
+
 static	struct stat stbuf;
 
 static	int	checkflag = 0;
@@ -2392,11 +2394,11 @@ doxtract(char *argv[])
 	int error;
 	int symflag;
 	int want;
-	aclent_t	*aclp = NULL;	/* acl buffer pointer */
-	int		aclcnt = 0;	/* acl entries count */
+	acl_t	*aclp = NULL;	/* acl info */
 	timestruc_t	time_zero;	/* used for call to doDirTimes */
 	int		dircreate;
 	int convflag;
+	int cnt;
 
 	time_zero.tv_sec = 0;
 	time_zero.tv_nsec = 0;
@@ -2895,16 +2897,14 @@ doxtract(char *argv[])
 #if defined(O_XATTR)
 			if (xattrp != (struct xattr_buf *)NULL) {
 				if (Hiddendir)
-					ret = facl(dirfd, SETACL,
-						aclcnt, aclp);
+					ret = facl_set(dirfd, aclp);
 				else
-					ret = facl(ofile, SETACL,
-						aclcnt, aclp);
+					ret = facl_set(ofile, aclp);
 			} else {
-				ret = acl(namep, SETACL, aclcnt, aclp);
+				ret = acl_set(namep, aclp);
 			}
 #else
-			ret = acl(namep, SETACL, aclcnt, aclp);
+			ret = acl_set(namep, &aclp);
 #endif
 			if (ret < 0) {
 				if (pflag) {
@@ -2914,7 +2914,7 @@ doxtract(char *argv[])
 				}
 				/* else: silent and continue */
 			}
-			free(aclp);
+			acl_free(aclp);
 			aclp = NULL;
 		}
 
@@ -2986,30 +2986,41 @@ doxtract(char *argv[])
 					}
 					bytes -= TBLOCK;
 				}
+				bytes = stbuf.st_size;
 				/* got all attributes in secp */
 				tp = secp;
 				do {
 					attr = (struct sec_attr *)tp;
 					switch (attr->attr_type) {
 					case UFSD_ACL:
+					case ACE_ACL:
 						(void) sscanf(attr->attr_len,
-						    "%7o", (uint_t *)&aclcnt);
+						    "%7o",
+						    (uint_t *)
+						    &cnt);
 						/* header is 8 */
 						attrsize = 8 + (int)strlen(
 						    &attr->attr_info[0]) + 1;
-						aclp = aclfromtext(
-						    &attr->attr_info[0], &cnt);
-						if (aclp == NULL) {
+
+						error =
+						    acl_fromtext(
+						    &attr->attr_info[0], &aclp);
+
+						if (error != 0) {
 							(void) fprintf(stderr,
 							    gettext(
 							    "aclfromtext "
-							    "failed\n"));
+							    "failed: %s\n"),
+							    acl_strerror(
+							    error));
+							bytes -= attrsize;
 							break;
 						}
-						if (aclcnt != cnt) {
+						if (acl_cnt(aclp) != cnt) {
 							(void) fprintf(stderr,
 							    gettext(
 							    "aclcnt error\n"));
+							bytes -= attrsize;
 							break;
 						}
 						bytes -= attrsize;
@@ -5520,9 +5531,7 @@ int
 append_secattr(
 	char	 **secinfo,	/* existing security info */
 	int	 *secinfo_len,	/* length of existing security info */
-	int	 size,		/* new attribute size: unit depends on type */
-	aclent_t *attrp,	/* new attribute data pointer */
-	char	 attr_type)	/* new attribute type */
+	acl_t	*aclp)
 {
 	char	*new_secinfo;
 	char	*attrtext;
@@ -5530,12 +5539,13 @@ append_secattr(
 	int	oldsize;
 
 	/* no need to add */
-	if (attrp == NULL)
+	if (aclp == (void *)NULL)
 		return (0);
 
-	switch (attr_type) {
-	case UFSD_ACL:
-		attrtext = acltotext((aclent_t *)attrp, size);
+	switch (acl_type(aclp)) {
+	case ACLENT_T:
+	case ACE_T:
+		attrtext = acl_totext(aclp);
 		if (attrtext == NULL) {
 			(void) fprintf(stderr, "acltotext failed\n");
 			return (-1);
@@ -5547,9 +5557,10 @@ append_secattr(
 			(void) fprintf(stderr, "can't allocate memory\n");
 			return (-1);
 		}
-		attr->attr_type = UFSD_ACL;
+		attr->attr_type = (acl_type(aclp) == ACLENT_T) ?
+		    UFSD_ACL : ACE_ACL;
 		(void) sprintf(attr->attr_len,
-		    "%06o", size); /* acl entry count */
+		    "%06o", acl_cnt(aclp)); /* acl entry count */
 		(void) strcpy((char *)&attr->attr_info[0], attrtext);
 		free(attrtext);
 		break;
@@ -6705,11 +6716,11 @@ static int
 put_extra_attributes(char *longname, char *shortname, char *prefix,
 		int filetype, char typeflag)
 {
-	int		aclcnt;
-	static aclent_t	*aclp;
+	static acl_t *aclp = NULL;
+	int error;
 
-	if (aclp != (aclent_t *)NULL) {
-		free(aclp);
+	if (aclp != NULL) {
+		acl_free(aclp);
 		aclp = NULL;
 	}
 #if defined(O_XATTR)
@@ -6730,34 +6741,20 @@ put_extra_attributes(char *longname, char *shortname, char *prefix,
 		if (((stbuf.st_mode & S_IFMT) != S_IFLNK)) {
 			/*
 			 * Get ACL info: dont bother allocating space if
-			 * there are only standard permissions, i.e. ACL
-			 * count <= 4
+			 * there is only a trivial ACL.
 			 */
-			if ((aclcnt = acl(shortname, GETACLCNT, 0, NULL)) < 0) {
+			if ((error = acl_get(shortname, ACL_NO_TRIVIAL,
+			    &aclp)) != 0) {
 				(void) fprintf(stderr, gettext(
-				    "%s: failed to get acl count\n"), longname);
+				    "%s: failed to retrieve acl : %s\n"),
+				    longname, acl_strerror(error));
 				return (1);
 			}
-			if (aclcnt > MIN_ACL_ENTRIES) {
-				if ((aclp = (aclent_t *)malloc(
-				    sizeof (aclent_t) * aclcnt)) == NULL) {
-					(void) fprintf(stderr, gettext(
-					    "Insufficient memory\n"));
-					return (1);
-				}
-				if (acl(shortname, GETACL, aclcnt, aclp) < 0) {
-					(void) fprintf(stderr, gettext(
-					    "%s: failed to get acl entries\n"),
-					    longname);
-					return (1);
-				}
-			}
 		}
 
 		/* append security attributes if any */
-		if (aclp != (aclent_t *)NULL) {
-			(void) append_secattr(&secinfo, &len, aclcnt,
-			    aclp, UFSD_ACL);
+		if (aclp != NULL) {
+			(void) append_secattr(&secinfo, &len, aclp);
 			(void) write_ancillary(&dblock, secinfo, len, ACL_HDR);
 		}
 	}
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index c0a20c6a64cc..9e8567f2dc1a 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -88,6 +88,7 @@
 #include <sys/ptms.h>
 #include <sys/aggr.h>
 #include <sys/dld.h>
+#include <sys/fs/zfs.h>
 
 #include "ramdata.h"
 #include "proto.h"
@@ -855,10 +856,76 @@ const struct ioc {
 	/* dld data-link ioctls */
 	{ (uint_t)DLDIOCATTR,		"DLDIOCATTR",		"dld_ioc_attr"},
 	{ (uint_t)DLDIOCVLAN,		"DLDIOCVLAN",		"dld_ioc_vlan"},
+
+	/* ZFS ioctls */
+	{ (uint_t)ZFS_IOC_POOL_CREATE,		"ZFS_IOC_POOL_CREATE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_DESTROY,		"ZFS_IOC_POOL_DESTROY",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_IMPORT,		"ZFS_IOC_POOL_IMPORT",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_EXPORT,		"ZFS_IOC_POOL_EXPORT",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_CONFIGS,		"ZFS_IOC_POOL_CONFIGS",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_GUID,		"ZFS_IOC_POOL_GUID",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_STATS,		"ZFS_IOC_POOL_STATS",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_TRYIMPORT,	"ZFS_IOC_POOL_TRYIMPORT",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_SCRUB,		"ZFS_IOC_POOL_SCRUB",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_POOL_FREEZE,		"ZFS_IOC_POOL_FREEZE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_ADD,		"ZFS_IOC_VDEV_ADD",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_REMOVE,		"ZFS_IOC_VDEV_REMOVE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_ONLINE,		"ZFS_IOC_VDEV_ONLINE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_OFFLINE,		"ZFS_IOC_VDEV_OFFLINE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_ATTACH,		"ZFS_IOC_VDEV_ATTACH",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_VDEV_DETACH,		"ZFS_IOC_VDEV_DETACH",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_OBJSET_STATS,		"ZFS_IOC_OBJSET_STATS",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_DATASET_LIST_NEXT,	"ZFS_IOC_DATASET_LIST_NEXT",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SNAPSHOT_LIST_NEXT,	"ZFS_IOC_SNAPSHOT_LIST_NEXT",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SET_PROP,		"ZFS_IOC_SET_PROP",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SET_QUOTA,		"ZFS_IOC_SET_QUOTA",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SET_RESERVATION,	"ZFS_IOC_SET_RESERVATION",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SET_VOLSIZE,		"ZFS_IOC_SET_VOLSIZE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SET_VOLBLOCKSIZE,	"ZFS_IOC_SET_VOLBLOCKSIZE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_CREATE_MINOR,		"ZFS_IOC_CREATE_MINOR",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_REMOVE_MINOR,		"ZFS_IOC_REMOVE_MINOR",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_CREATE,		"ZFS_IOC_CREATE",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_DESTROY,		"ZFS_IOC_DESTROY",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_ROLLBACK,		"ZFS_IOC_ROLLBACK",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_RENAME,		"ZFS_IOC_RENAME",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_RECVBACKUP,		"ZFS_IOC_RECVBACKUP",
+		"zfs_cmd_t" },
+	{ (uint_t)ZFS_IOC_SENDBACKUP,		"ZFS_IOC_SENDBACKUP",
+		"zfs_cmd_t" },
+
 	{ (uint_t)0, NULL, NULL	}
 };
 
-
 void
 ioctl_ioccom(char *buf, size_t size, uint_t code, int nbytes, int x, int y)
 {
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index 767652168b73..c58e8c23a364 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -1117,6 +1117,9 @@ prt_acl(private_t *pri, int raw, long val)	/* print acl() code */
 		case GETACL:		s = "GETACL";		break;
 		case SETACL:		s = "SETACL";		break;
 		case GETACLCNT:		s = "GETACLCNT";	break;
+		case ACE_GETACL:	s = "ACE_GETACL";	break;
+		case ACE_SETACL:	s = "ACE_SETACL";	break;
+		case ACE_GETACLCNT:	s = "ACE_GETACLCNT";	break;
 		}
 	}
 
diff --git a/usr/src/cmd/ttymon/Makefile b/usr/src/cmd/ttymon/Makefile
index 251af55ff3bb..2db4d184d0dd 100644
--- a/usr/src/cmd/ttymon/Makefile
+++ b/usr/src/cmd/ttymon/Makefile
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -64,7 +64,7 @@ include ../Makefile.cmd
 CPPFLAGS += -DSYS_NAME
 $(XPG4):= CPPFLAGS += -DXPG4
 sttydefs := LDLIBS += -lnsl
-ttymon := LDLIBS += -lnsl -ldevinfo
+ttymon := LDLIBS += -lnsl -lsec -ldevinfo
 
 # Only stty can be built with -DEUC.  ttymon will dump core unless further
 # changes are made to it.
diff --git a/usr/src/cmd/ttymon/tmexpress.c b/usr/src/cmd/ttymon/tmexpress.c
index caef33b343fa..f9bc5363e26d 100644
--- a/usr/src/cmd/ttymon/tmexpress.c
+++ b/usr/src/cmd/ttymon/tmexpress.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -431,23 +431,6 @@ revokedevaccess(char *dev, uid_t uid, gid_t gid, mode_t mode)
 	} while (fdetach(dev) == 0);
 
 	/* Remove ACLs */
-	if (acl(dev, GETACLCNT, 0, NULL) > MIN_ACL_ENTRIES) {
-		aclent_t acls[3];
 
-		acls[0].a_type = USER_OBJ;
-		acls[0].a_id = uid;
-		acls[0].a_perm = 0;
-
-		acls[1].a_type = GROUP_OBJ;
-		acls[1].a_id = gid;
-		acls[1].a_perm = 0;
-
-		acls[2].a_type = OTHER_OBJ;
-		acls[2].a_id = 0;
-		acls[2].a_perm = 0;
-
-		(void) acl(dev, SETACL, 3, acls);
-	}
-
-	(void) chmod(dev, mode);
+	(void) acl_strip(dev, uid, gid, mode);
 }
diff --git a/usr/src/cmd/unpack/Makefile b/usr/src/cmd/unpack/Makefile
index 87d0d01e1137..076a2013e8a6 100644
--- a/usr/src/cmd/unpack/Makefile
+++ b/usr/src/cmd/unpack/Makefile
@@ -22,7 +22,7 @@
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -30,6 +30,7 @@ PROG= unpack
 
 include ../Makefile.cmd
 CFLAGS += $(CCVERBOSE)
+LDLIBS += -lsec
 
 XGETFLAGS += -a -x unpack.xcl
 .KEEP_STATE:
diff --git a/usr/src/cmd/unpack/unpack.c b/usr/src/cmd/unpack/unpack.c
index 0d2fe15cb72e..766d12906612 100644
--- a/usr/src/cmd/unpack/unpack.c
+++ b/usr/src/cmd/unpack/unpack.c
@@ -24,7 +24,7 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -49,6 +49,8 @@
 #include <limits.h>
 #include <sys/param.h>
 #include <dirent.h>
+#include <sys/acl.h>
+#include <aclutils.h>
 
 static struct utimbuf u_times;
 
@@ -240,11 +242,13 @@ main(int argc, char *argv[])
 {
 	extern int optind;
 	int i, k;
+	int error;
 	int sep, errflg = 0, pcat = 0;
 	register char *p1, *cp;
 	int fcount = 0;		/* failure count */
 	int max_name;
 	void onsig(int);
+	acl_t *aclp = NULL;	/* ACL of the current input file */
 
 
 	if (signal(SIGHUP, SIG_IGN) != SIG_IGN)
@@ -334,6 +338,14 @@ main(int argc, char *argv[])
 		if (pcat)
 			outfile = 1;	/* standard output */
 		else {
+
+			error = facl_get(infile, ACL_NO_TRIVIAL, &aclp);
+			if (error != 0) {
+				(void) printf(gettext(
+				    "%s: %s: cannot retrieve ACL : %s\n"),
+				argv0, filename, acl_strerror(error));
+			}
+
 			max_name = pathconf(filename, _PC_NAME_MAX);
 			if (max_name == -1) {
 				/* no limit on length of filename */
@@ -395,6 +407,12 @@ main(int argc, char *argv[])
 				}
 				(void) chown(argvk,
 						status.st_uid, status.st_gid);
+				if (aclp && (facl_set(outfile, aclp) < 0)) {
+					(void) printf(gettext("%s: cannot "
+					    "set ACL on %s: "), argv0, argvk);
+					perror("");
+				}
+
 				rmflg = 0;
 				(void) printf(gettext("%s: %s: unpacked\n"),
 					argv0, argvk);
@@ -415,6 +433,12 @@ main(int argc, char *argv[])
 done:		(void) close(infile);
 		if (!pcat)
 			(void) close(outfile);
+
+		/* Free the ACL and forget it before the next file. */
+		if (aclp != NULL) {
+			acl_free(aclp);
+			aclp = NULL;
+		}
 	}
 	return (fcount);
 }
diff --git a/usr/src/cmd/zdb/Makefile b/usr/src/cmd/zdb/Makefile
new file mode 100644
index 000000000000..0ab3c2b8f09a
--- /dev/null
+++ b/usr/src/cmd/zdb/Makefile
@@ -0,0 +1,55 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG:sh=	basename `pwd`
+
+include ../Makefile.cmd
+
+$(INTEL_BLD)SUBDIRS	= $(MACH)
+$(BUILD64)SUBDIRS	+= $(MACH64)
+
+all	:=	TARGET = all
+install	:=	TARGET = install
+clean	:=	TARGET = clean
+clobber	:=	TARGET = clobber
+lint	:=	TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint:	$(SUBDIRS)
+
+install:	$(SUBDIRS)
+	-$(RM) $(ROOTUSRSBINPROG)
+	-$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS):	FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/zdb/Makefile.com b/usr/src/cmd/zdb/Makefile.com
new file mode 100644
index 000000000000..e3156cdeb096
--- /dev/null
+++ b/usr/src/cmd/zdb/Makefile.com
@@ -0,0 +1,62 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG:sh=	cd ..; basename `pwd`
+SRCS= ../$(PROG).c ../zdb_il.c
+
+include ../../Makefile.cmd
+
+INCS += -I../../../lib/libzpool/common 
+INCS +=	-I../../../uts/common/fs/zfs
+
+LDLIBS += -lzpool -lumem -lavl -lnvpair
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+CFLAGS += $(CCVERBOSE)
+CFLAGS64 += $(CCVERBOSE)
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT $(INCS)
+
+# lint complains about unused _umem_* functions
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2 
+LINTFLAGS64 += -xerroff=E_NAME_DEF_NOT_USED2  
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+$(PROG): $(SRCS)
+	$(LINK.c) -o $(PROG) $(SRCS) $(LDLIBS)
+	$(POST_PROCESS)
+
+clean:
+
+lint:	lint_SRCS
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/zdb/amd64/Makefile b/usr/src/cmd/zdb/amd64/Makefile
new file mode 100644
index 000000000000..c2f8b37b5d96
--- /dev/null
+++ b/usr/src/cmd/zdb/amd64/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/zdb/i386/Makefile b/usr/src/cmd/zdb/i386/Makefile
new file mode 100644
index 000000000000..5c93bf6ac6b6
--- /dev/null
+++ b/usr/src/cmd/zdb/i386/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTUSRSBINPROG32)
diff --git a/usr/src/cmd/zdb/inc.flg b/usr/src/cmd/zdb/inc.flg
new file mode 100644
index 000000000000..bb65300ccae9
--- /dev/null
+++ b/usr/src/cmd/zdb/inc.flg
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+find_files "s.*" usr/src/uts/common/fs/zfs/sys
+echo_file usr/src/uts/common/sys/fs/zfs.h
diff --git a/usr/src/cmd/zdb/sparc/Makefile b/usr/src/cmd/zdb/sparc/Makefile
new file mode 100644
index 000000000000..5c93bf6ac6b6
--- /dev/null
+++ b/usr/src/cmd/zdb/sparc/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTUSRSBINPROG32)
diff --git a/usr/src/cmd/zdb/sparcv9/Makefile b/usr/src/cmd/zdb/sparcv9/Makefile
new file mode 100644
index 000000000000..c2f8b37b5d96
--- /dev/null
+++ b/usr/src/cmd/zdb/sparcv9/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
new file mode 100644
index 000000000000..9163d6a0b0df
--- /dev/null
+++ b/usr/src/cmd/zdb/zdb.c
@@ -0,0 +1,1869 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+
+const char cmdname[] = "zdb";
+uint8_t dump_opt[256];	/* indexed by option character, e.g. ['d'] */
+
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+extern void dump_intent_log(zilog_t *);
+uint64_t *zopt_object = NULL;
+int zopt_objects = 0;
+int zdb_advance = ADVANCE_PRE;
+zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 };
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
+	    "dataset [object...]\n"
+	    "       %s -C [pool]\n"
+	    "       %s -l dev\n",
+	    cmdname, cmdname, cmdname);
+	/* One descriptive line per option, then two general notes. */
+	(void) fprintf(stderr, "	-u uberblock\n");
+	(void) fprintf(stderr, "	-d datasets\n");
+	(void) fprintf(stderr, "        -C cached pool configuration\n");
+	(void) fprintf(stderr, "	-i intent logs\n");
+	(void) fprintf(stderr, "	-b block statistics\n");
+	(void) fprintf(stderr, "	-c checksum all data blocks\n");
+	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, "	-v verbose (applies to all others)\n");
+	(void) fprintf(stderr, "        -l dump label contents\n");
+	(void) fprintf(stderr, "	-L live pool (allows some errors)\n");
+	(void) fprintf(stderr, "	-O [!]<pre|post|prune|data|holes> "
+	    "visitation order\n");
+	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
+	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
+	    "simulate bad block\n");
+	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+	    "to make only that option verbose\n");
+	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+	exit(1);	/* usage() never returns */
+}
+
+static void
+fatal(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+	/* Message is "<cmdname>: <formatted text>\n"; then exit status 1. */
+	exit(1);
+}
+
+static void
+dump_nvlist(nvlist_t *list, int indent)
+{
+	nvpair_t *elem = NULL;	/* NULL asks for the first pair */
+
+	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_STRING:
+			{
+				char *value;
+
+				VERIFY(nvpair_value_string(elem, &value) == 0);
+				(void) printf("%*s%s='%s'\n", indent, "",
+				    nvpair_name(elem), value);
+			}
+			break;
+
+		case DATA_TYPE_UINT64:
+			{
+				uint64_t value;
+
+				VERIFY(nvpair_value_uint64(elem, &value) == 0);
+				(void) printf("%*s%s=%llu\n", indent, "",
+				    nvpair_name(elem), (u_longlong_t)value);
+			}
+			break;
+
+		case DATA_TYPE_NVLIST:
+			{
+				nvlist_t *value;
+
+				VERIFY(nvpair_value_nvlist(elem, &value) == 0);
+				(void) printf("%*s%s\n", indent, "",
+				    nvpair_name(elem));
+				dump_nvlist(value, indent + 4);	/* recurse */
+			}
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY:
+			{
+				nvlist_t **value;
+				uint_t c, count;
+
+				VERIFY(nvpair_value_nvlist_array(elem, &value,
+				    &count) == 0);
+
+				for (c = 0; c < count; c++) {
+					(void) printf("%*s%s[%u]\n", indent, "",
+					    nvpair_name(elem), c);
+					dump_nvlist(value[c], indent + 8);
+				}
+			}
+			break;
+
+		default:
+			/* any other pair type is reported, not decoded */
+			(void) printf("bad config type %d for %s\n",
+			    nvpair_type(elem), nvpair_name(elem));
+		}
+	}
+}
+
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	nvlist_t *nv;
+	size_t nvsize = *(uint64_t *)data;
+	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+	/* *data gives the packed size; the payload is read from the object. */
+	dmu_read(os, object, 0, nvsize, packed);
+
+	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+	umem_free(packed, nvsize);	/* freed once unpacked into nv */
+
+	dump_nvlist(nv, 8);
+
+	nvlist_free(nv);
+}
+
+const char dump_zap_stars[] = "****************************************";
+const int dump_zap_width = sizeof (dump_zap_stars) - 1;	/* star count */
+
+static void
+dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
+{
+	int i;
+	int minidx = ZAP_HISTOGRAM_SIZE - 1;
+	int maxidx = 0;
+	uint64_t max = 0;
+
+	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
+		if (histo[i] > max)
+			max = histo[i];
+		if (histo[i] > 0 && i > maxidx)
+			maxidx = i;
+		if (histo[i] > 0 && i < minidx)
+			minidx = i;
+	}
+	/* If the largest bucket is small, scale as one star per entry. */
+	if (max < dump_zap_width)
+		max = dump_zap_width;
+
+	for (i = minidx; i <= maxidx; i++)	/* only the non-empty span */
+		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
+		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
+}
+
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+	int error;
+	zap_stats_t zs;
+
+	error = zap_get_stats(os, object, &zs);
+	if (error)
+		return;		/* no stats available: print nothing */
+	/* No pointer table: reported as a microzap, with a one-line summary. */
+	if (zs.zs_ptrtbl_len == 0) {
+		ASSERT(zs.zs_num_blocks == 1);
+		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+		    (u_longlong_t)zs.zs_blocksize,
+		    (u_longlong_t)zs.zs_num_entries);
+		return;
+	}
+
+	(void) printf("\tFat ZAP stats:\n");
+	(void) printf("\t\tPointer table: %llu elements\n",
+	    (u_longlong_t)zs.zs_ptrtbl_len);
+	(void) printf("\t\tZAP entries: %llu\n",
+	    (u_longlong_t)zs.zs_num_entries);
+	(void) printf("\t\tLeaf blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_leafs);
+	(void) printf("\t\tTotal blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_blocks);
+	(void) printf("\t\tOversize blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_blocks_large);
+
+	(void) printf("\t\tLeafs with 2^n pointers:\n");
+	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
+
+	(void) printf("\t\tLeafs with n chained:\n");
+	dump_zap_histogram(zs.zs_leafs_with_n_chained);
+
+	(void) printf("\t\tBlocks with n*5 entries:\n");
+	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
+
+	(void) printf("\t\tBlocks n/10 full:\n");
+	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
+
+	(void) printf("\t\tEntries with n chunks:\n");
+	dump_zap_histogram(zs.zs_entries_using_n_chunks);
+
+	(void) printf("\t\tBuckets with n entries:\n");
+	dump_zap_histogram(zs.zs_buckets_with_n_entries);
+}
+
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{	/* empty: prints nothing for the object */
+}
+
+/*ARGSUSED*/
+void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{	/* empty; NOTE(review): not static, unlike its siblings - confirm */
+}
+
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{	/* empty: prints nothing for the object */
+}
+
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	void *prop;
+	int i;
+
+	dump_zap_stats(os, object);	/* summary first, then each entry */
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;	/* name only, no value to fetch */
+		}
+		prop = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+		(void) zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length, attr.za_num_integers, prop);
+		if (attr.za_integer_length == 1) {
+			(void) printf("%s", (char *)prop);
+		} else {
+			for (i = 0; i < attr.za_num_integers; i++) {
+				switch (attr.za_integer_length) {
+				case 2:
+					(void) printf("%u ",
+					    ((uint16_t *)prop)[i]);
+					break;
+				case 4:
+					(void) printf("%u ",
+					    ((uint32_t *)prop)[i]);
+					break;
+				case 8:
+					(void) printf("%lld ",
+					    (long long)((int64_t *)prop)[i]);
+					break;
+				}
+			}
+		}
+		(void) printf("\n");
+		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+	}
+}
+
+static void
+dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+{
+	uint64_t alloc, offset, entry;
+	int mapshift = sm->sm_shift;
+	uint64_t mapstart = sm->sm_start;
+	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID" };
+
+	if (smo->smo_object == 0)
+		return;		/* object 0: nothing to read */
+
+	/*
+	 * Decode and print each entry; range entries also adjust 'alloc'.
+	 */
+	alloc = 0;
+	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
+		dmu_read(os, smo->smo_object, offset, sizeof (entry), &entry);
+		if (SM_DEBUG_DECODE(entry)) {
+			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
+			    SM_DEBUG_TXG_DECODE(entry),
+			    SM_DEBUG_SYNCPASS_DECODE(entry));
+		} else {
+			(void) printf("\t\t[%4llu]    %c  range:"
+			    " %08llx-%08llx  size: %06llx\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
+			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart,
+			    (SM_OFFSET_DECODE(entry) << mapshift) + mapstart +
+			    (SM_RUN_DECODE(entry) << mapshift),
+			    (SM_RUN_DECODE(entry) << mapshift));
+			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
+				alloc += SM_RUN_DECODE(entry) << mapshift;
+			else
+				alloc -= SM_RUN_DECODE(entry) << mapshift;
+		}
+	}
+	if (alloc != smo->smo_alloc) {	/* replayed total vs. recorded total */
+		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
+		    "with space map summary (%llu)\n",
+		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
+	}
+}
+
+/*
+ * Print one metaslab: its starting offset, its space map object number and
+ * its free space (sm_size minus smo_alloc).  When the -d level is above 5,
+ * the owning vdev id is printed too and the space map entries are dumped
+ * via dump_spacemap().
+ */
+static void
+dump_metaslab(metaslab_t *msp)
+{
+	/*
+	 * Sized like the other nicenum() buffers in this file; the value is
+	 * printed with %5s, and five characters plus the terminating NUL
+	 * need six bytes.
+	 */
+	char freebuf[6];
+	space_map_obj_t *smo = msp->ms_smo;
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+
+	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+
+	/* Terse form: one table row per metaslab. */
+	if (dump_opt['d'] <= 5) {
+		(void) printf("\t%10llx   %10llu   %5s\n",
+		    (u_longlong_t)msp->ms_map.sm_start,
+		    (u_longlong_t)smo->smo_object,
+		    freebuf);
+		return;
+	}
+
+	(void) printf(
+	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
+	    (u_longlong_t)smo->smo_object, freebuf);
+
+	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+
+	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+}
+
+/*
+ * For each child of the pool's root vdev, print a heading with the vdev id
+ * and description, an optional column header (when the -d level is 5 or
+ * less), and then every metaslab of that vdev via dump_metaslab().
+ */
+static void
+dump_metaslabs(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	int child;
+
+	(void) printf("\nMetaslabs:\n");
+
+	child = 0;
+	while (child < root->vdev_children) {
+		vdev_t *top = root->vdev_child[child];
+		int slab;
+
+		/* The description is fetched with the config lock held. */
+		spa_config_enter(spa, RW_READER);
+		(void) printf("\n    vdev %llu = %s\n\n",
+		    (u_longlong_t)top->vdev_id, vdev_description(top));
+		spa_config_exit(spa);
+
+		if (!(dump_opt['d'] > 5)) {
+			(void) printf("\t%10s   %10s   %5s\n",
+			    "offset", "spacemap", "free");
+			(void) printf("\t%10s   %10s   %5s\n",
+			    "------", "--------", "----");
+		}
+
+		for (slab = 0; slab < top->vdev_ms_count; slab++) {
+			dump_metaslab(top->vdev_ms[slab]);
+		}
+		(void) printf("\n");
+
+		child++;
+	}
+}
+
+/*
+ * Recursively print the dirty time log of 'vd' and all of its children.
+ * 'indent' is the number of spaces to indent this level; the top-level
+ * call (indent == 0) also prints the section heading.  Each segment is
+ * asserted to be present in the DTL of this vdev and of every ancestor.
+ * For leaf vdevs, when the -d level is above 5, the DTL space map object
+ * is dumped as well.
+ */
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+	avl_tree_t *tree = &vd->vdev_dtl_map.sm_root;
+	spa_t *spa = vd->vdev_spa;
+	space_seg_t *seg;
+	vdev_t *anc;
+	int kid;
+
+	if (indent == 0) {
+		(void) printf("\nDirty time logs:\n\n");
+	}
+
+	spa_config_enter(spa, RW_READER);
+	(void) printf("\t%*s%s\n", indent, "", vdev_description(vd));
+	spa_config_exit(spa);
+
+	seg = avl_first(tree);
+	while (seg != NULL) {
+		/*
+		 * A segment in this DTL must also be covered by the DTL
+		 * of every vdev on the path up to the root.
+		 */
+		for (anc = vd; anc != NULL; anc = anc->vdev_parent)
+			ASSERT(vdev_dtl_contains(&anc->vdev_dtl_map,
+			    seg->ss_start, seg->ss_end - seg->ss_start));
+
+		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
+		    indent, "",
+		    (u_longlong_t)seg->ss_start,
+		    (u_longlong_t)seg->ss_end - 1,
+		    (u_longlong_t)seg->ss_end - seg->ss_start);
+
+		seg = AVL_NEXT(tree, seg);
+	}
+
+	(void) printf("\n");
+
+	if (vd->vdev_children == 0 && dump_opt['d'] > 5) {
+		dump_spacemap(spa->spa_meta_objset, &vd->vdev_dtl,
+		    &vd->vdev_dtl_map);
+		(void) printf("\n");
+	}
+
+	kid = 0;
+	while (kid < vd->vdev_children) {
+		dump_dtl(vd->vdev_child[kid], indent + 4);
+		kid++;
+	}
+}
+
+/*
+ * Object viewer for DMU dnode objects.  The body is empty, so nothing
+ * type-specific is printed; all arguments are ignored.
+ */
+/*ARGSUSED*/
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Convert a block id at indirection 'level' of the object described by
+ * 'dnp' into a byte offset.  A negative level returns 'blkid' unchanged
+ * without touching 'dnp'.  Otherwise the id is scaled up by the number of
+ * block pointers per indirect block for each level, then multiplied by
+ * the data block size (dn_datablkszsec sectors of 1 << SPA_MINBLOCKSHIFT
+ * bytes).
+ */
+static uint64_t
+blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid)
+{
+	int span_shift;
+	uint64_t data_blkid;
+
+	if (level < 0) {
+		return (blkid);
+	}
+
+	span_shift = level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+	data_blkid = blkid << span_shift;
+
+	return ((data_blkid * dnp->dn_datablkszsec) << SPA_MINBLOCKSHIFT);
+}
+
+/*
+ * Traversal callback used by dump_indirect().  For each visited block it
+ * builds one output line in 'buffer' and prints it:
+ *   - on a read error (bc->bc_errno set), an "Error N reading <...>" prefix;
+ *   - otherwise the block's byte offset followed by a level marker ("L<n>")
+ *     placed in a column that depends on the level.
+ * The line is completed with "<hole>" when blk_birth is 0, or with the
+ * first DVA's vdev/offset, the logical/physical/allocated sizes, the fill
+ * count and the birth txg.
+ *
+ * Along the way it asserts that the block pointer's type and level match
+ * the bookmark, and that blk_fill equals the sum of the children's fill
+ * (indirect blocks) or the number of allocated dnodes (level-0 dnode
+ * blocks).
+ *
+ * Returns ERESTART if bc->bc_errno is set, otherwise 0.
+ */
+/* ARGSUSED */
+static int
+zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	blkptr_t *bp = &bc->bc_blkptr;
+	dva_t *dva = &bp->blk_dva[0];
+	void *data = bc->bc_data;
+	dnode_phys_t *dnp = bc->bc_dnode;
+	char buffer[300];
+	int l;
+
+	/* Read failed: start the line with the error and skip validation. */
+	if (bc->bc_errno) {
+		(void) sprintf(buffer,
+		    "Error %d reading <%llu, %llu, %d, %llu>: ",
+		    bc->bc_errno,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid);
+		goto out;
+	}
+
+	if (zb->zb_level == -1) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
+	} else {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
+
+	/* Indirect block: its fill must equal the sum of its children's. */
+	if (zb->zb_level > 0) {
+		uint64_t fill = 0;
+		blkptr_t *bpx, *bpend;
+
+		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
+		    bpx < bpend; bpx++) {
+			if (bpx->blk_birth != 0) {
+				ASSERT(bpx->blk_fill > 0);
+				fill += bpx->blk_fill;
+			} else {
+				ASSERT(bpx->blk_fill == 0);
+			}
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+
+	/* Level-0 dnode block: fill must equal the count of used dnodes. */
+	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
+		uint64_t fill = 0;
+		dnode_phys_t *dnx, *dnend;
+
+		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
+		    dnx < dnend; dnx++) {
+			if (dnx->dn_type != DMU_OT_NONE)
+				fill++;
+		}
+		ASSERT3U(fill, ==, bp->blk_fill);
+	}
+
+	(void) sprintf(buffer, "%16llx ",
+	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+
+	ASSERT(zb->zb_level >= 0);
+
+	/*
+	 * Emit one column per level from the top level down to -1; only the
+	 * column matching this block's level gets the "L<n>" marker, the
+	 * others get a single space.
+	 */
+	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+		if (l == zb->zb_level) {
+			(void) sprintf(buffer + strlen(buffer), "L%x",
+			    zb->zb_level);
+		} else {
+			(void) sprintf(buffer + strlen(buffer), " ");
+		}
+	}
+
+out:
+	/* Both the error path and the normal path finish the line here. */
+	if (bp->blk_birth == 0) {
+		(void) sprintf(buffer + strlen(buffer), "<hole>");
+		(void) printf("%s\n", buffer);
+	} else {
+		/* XXBP - Need to print number of active BPs here */
+		(void) sprintf(buffer + strlen(buffer),
+		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
+		    (u_longlong_t)DVA_GET_VDEV(dva),
+		    (u_longlong_t)DVA_GET_OFFSET(dva),
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)BP_GET_PSIZE(bp),
+		    (u_longlong_t)DVA_GET_ASIZE(dva),
+		    (u_longlong_t)bp->blk_fill,
+		    (u_longlong_t)bp->blk_birth);
+
+		(void) printf("%s\n", buffer);
+	}
+
+	return (bc->bc_errno ? ERESTART : 0);
+}
+
+/*
+ * Print the block tree of one object by running a traversal over just
+ * that dnode with zdb_indirect_cb() as the per-block callback.  Object 0
+ * additionally has ADVANCE_DATA set in the traversal flags.  The 'data'
+ * and 'size' arguments are unused.
+ */
+/*ARGSUSED*/
+static void
+dump_indirect(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	uint64_t dsid = dmu_objset_id(os);
+	int how = zdb_advance;
+	traverse_handle_t *handle;
+	int rc;
+
+	(void) printf("Indirect blocks:\n");
+
+	if (object == 0) {
+		how = how | ADVANCE_DATA;
+	}
+
+	handle = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL,
+	    how, ZIO_FLAG_CANFAIL);
+	handle->th_noread = zdb_noread;
+
+	traverse_add_dnode(handle, 0, -1ULL, dsid, object);
+
+	/* Keep stepping the traversal for as long as it asks to be resumed. */
+	do {
+		rc = traverse_more(handle);
+	} while (rc == EAGAIN);
+
+	(void) printf("\n");
+
+	traverse_fini(handle);
+}
+
+/*
+ * Object viewer for a DSL directory: 'data' points at a dsl_dir_phys_t of
+ * exactly 'size' bytes.  Prints the creation time, the related object
+ * numbers and the space accounting fields (in nicenum() form).  Does
+ * nothing when 'data' is NULL.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dir_phys_t *ddp = data;
+	time_t created;
+	char numbuf[6];
+
+	if (ddp == NULL) {
+		return;
+	}
+
+	ASSERT(size == sizeof (*ddp));
+
+	created = ddp->dd_creation_time;
+	(void) printf("\t\tcreation_time = %s", ctime(&created));
+
+	(void) printf("\t\thead_dataset_obj = %llu\n",
+	    (u_longlong_t)ddp->dd_head_dataset_obj);
+	(void) printf("\t\tparent_dir_obj = %llu\n",
+	    (u_longlong_t)ddp->dd_parent_obj);
+	(void) printf("\t\tclone_parent_obj = %llu\n",
+	    (u_longlong_t)ddp->dd_clone_parent_obj);
+	(void) printf("\t\tchild_dir_zapobj = %llu\n",
+	    (u_longlong_t)ddp->dd_child_dir_zapobj);
+
+	/* One scratch buffer is reused for each human-readable size. */
+	nicenum(ddp->dd_used_bytes, numbuf);
+	(void) printf("\t\tused_bytes = %s\n", numbuf);
+	nicenum(ddp->dd_compressed_bytes, numbuf);
+	(void) printf("\t\tcompressed_bytes = %s\n", numbuf);
+	nicenum(ddp->dd_uncompressed_bytes, numbuf);
+	(void) printf("\t\tuncompressed_bytes = %s\n", numbuf);
+	nicenum(ddp->dd_quota, numbuf);
+	(void) printf("\t\tquota = %s\n", numbuf);
+	nicenum(ddp->dd_reserved, numbuf);
+	(void) printf("\t\treserved = %s\n", numbuf);
+
+	(void) printf("\t\tprops_zapobj = %llu\n",
+	    (u_longlong_t)ddp->dd_props_zapobj);
+}
+
+/*
+ * Object viewer for a DSL dataset: 'data' points at a dsl_dataset_phys_t
+ * of exactly 'size' bytes.  Prints the object numbers linking the dataset
+ * to its directory and snapshots, its creation time and txg, its space
+ * accounting (in nicenum() form), its guids and its root block pointer.
+ * Does nothing when 'data' is NULL.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dataset_phys_t *ds = data;
+	time_t crtime;
+	char used[6], compressed[6], uncompressed[6], unique[6], blkbuf[300];
+
+	if (ds == NULL)
+		return;
+
+	ASSERT(size == sizeof (*ds));
+	crtime = ds->ds_creation_time;
+	nicenum(ds->ds_used_bytes, used);
+	nicenum(ds->ds_compressed_bytes, compressed);
+	nicenum(ds->ds_uncompressed_bytes, uncompressed);
+	nicenum(ds->ds_unique_bytes, unique);
+	sprintf_blkptr(blkbuf, &ds->ds_bp);
+
+	/* The value printed is ds_dir_obj, so label it as the dir object. */
+	(void) printf("\t\tdir_obj = %llu\n",
+	    (u_longlong_t)ds->ds_dir_obj);
+	(void) printf("\t\tprev_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_obj);
+	(void) printf("\t\tprev_snap_txg = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_txg);
+	(void) printf("\t\tnext_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_snap_obj);
+	(void) printf("\t\tsnapnames_zapobj = %llu\n",
+	    (u_longlong_t)ds->ds_snapnames_zapobj);
+	(void) printf("\t\tnum_children = %llu\n",
+	    (u_longlong_t)ds->ds_num_children);
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\tcreation_txg = %llu\n",
+	    (u_longlong_t)ds->ds_creation_txg);
+	(void) printf("\t\tdeadlist_obj = %llu\n",
+	    (u_longlong_t)ds->ds_deadlist_obj);
+	(void) printf("\t\tused_bytes = %s\n", used);
+	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
+	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+	(void) printf("\t\tunique = %s\n", unique);
+	(void) printf("\t\tfsid_guid = %llu\n",
+	    (u_longlong_t)ds->ds_fsid_guid);
+	(void) printf("\t\tguid = %llu\n",
+	    (u_longlong_t)ds->ds_guid);
+	(void) printf("\t\trestoring = %llu\n",
+	    (u_longlong_t)ds->ds_restoring);
+	(void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/*
+ * Print the block pointer list stored in object 'object' of 'mos', under
+ * the heading 'name'.  Nothing is printed below -d level 3 or when the
+ * list is empty.  At levels 3-4 only the entry count and byte total are
+ * shown; from level 5 every entry is listed using its first DVA.
+ */
+static void
+dump_bplist(objset_t *mos, uint64_t object, char *name)
+{
+	bplist_t list = { 0 };
+	blkptr_t entry;
+	blkptr_t *cur = &entry;
+	uint64_t cursor = 0;
+	char total[6];
+
+	if (dump_opt['d'] < 3) {
+		return;
+	}
+
+	bplist_open(&list, mos, object);
+
+	if (!bplist_empty(&list)) {
+		nicenum(list.bpl_phys->bpl_bytes, total);
+
+		(void) printf("\n    %s: %llu entries, %s\n",
+		    name, (u_longlong_t)list.bpl_phys->bpl_entries, total);
+
+		if (dump_opt['d'] >= 5) {
+			(void) printf("\n");
+
+			while (bplist_iterate(&list, &cursor, cur) == 0) {
+				ASSERT(cur->blk_birth != 0);
+				/* XXBP - Do we want all DVAs, or just one? */
+				(void) printf("\tItem %3llu: vdev=%llu "
+				    "off=%llx "
+				    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
+				    (u_longlong_t)cursor - 1,
+				    (u_longlong_t)DVA_GET_VDEV(
+				    &cur->blk_dva[0]),
+				    (u_longlong_t)DVA_GET_OFFSET(
+				    &cur->blk_dva[0]),
+				    (u_longlong_t)BP_GET_LSIZE(cur),
+				    (u_longlong_t)BP_GET_PSIZE(cur),
+				    (u_longlong_t)DVA_GET_ASIZE(
+				    &cur->blk_dva[0]),
+				    (u_longlong_t)cur->blk_fill,
+				    (u_longlong_t)cur->blk_birth);
+			}
+		}
+	}
+
+	/* Single exit point for every path that opened the list. */
+	bplist_close(&list);
+}
+
+/*
+ * Build the path of znode 'object' in objset 'os' by walking zp_parent
+ * links up to the root (the znode that is its own parent), looking up each
+ * component name in the parent with zap_value_search().
+ *
+ * The path is assembled right-to-left at the end of 'pathbuf' (which holds
+ * 'size' bytes; size must be at least 1), and a pointer to its first
+ * character inside 'pathbuf' is returned.  If the walk cannot be completed
+ * (bonus hold fails, the bonus buffer is not a znode, the name lookup
+ * fails, or the buffer has no room for the next component), the partial
+ * path is prefixed with "???<object#N>" naming the object where the walk
+ * stopped.
+ */
+static char *
+znode_path(objset_t *os, uint64_t object, char *pathbuf, size_t size)
+{
+	dmu_buf_t *db;
+	dmu_object_info_t doi;
+	znode_phys_t *zp;
+	uint64_t parent = 0;
+	size_t complen;
+	char component[MAXNAMELEN + 1];
+	char *path;
+
+	path = pathbuf + size;
+	*--path = '\0';
+
+	for (;;) {
+		db = dmu_bonus_hold(os, object);
+		if (db == NULL)
+			break;
+
+		dmu_buf_read(db);
+		dmu_object_info_from_db(db, &doi);
+
+		/*
+		 * Check the bonus type before interpreting the buffer as a
+		 * znode, so zp_parent is never read from something else.
+		 */
+		if (doi.doi_bonus_type != DMU_OT_ZNODE) {
+			dmu_buf_rele(db);
+			break;
+		}
+
+		zp = db->db_data;
+		parent = zp->zp_parent;
+		dmu_buf_rele(db);
+
+		/* Reached the root: make sure the path starts with '/'. */
+		if (parent == object) {
+			if (path[0] != '/' && path > pathbuf)
+				*--path = '/';
+			return (path);
+		}
+
+		if (zap_value_search(os, parent, object, component) != 0)
+			break;
+
+		/*
+		 * Stop rather than write before the start of pathbuf when
+		 * the component plus its leading '/' does not fit.
+		 */
+		complen = strlen(component);
+		if (complen + 1 > (size_t)(path - pathbuf))
+			break;
+
+		path -= complen;
+		bcopy(component, path, complen);
+		*--path = '/';
+
+		object = parent;
+	}
+
+	(void) sprintf(component, "???<object#%llu>", (u_longlong_t)object);
+
+	/* Truncate the failure prefix if the buffer is nearly full. */
+	complen = strlen(component);
+	if (complen > (size_t)(path - pathbuf))
+		complen = (size_t)(path - pathbuf);
+	path -= complen;
+	bcopy(component, path, complen);
+
+	return (path);
+}
+
+/*
+ * Object viewer for a ZFS znode: 'data' points at a znode_phys_t of at
+ * least sizeof (znode_phys_t) bytes.  Below -d level 3 only the path is
+ * printed; otherwise the path is followed by the four timestamps (seconds
+ * word only) and the remaining znode attributes.
+ */
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	znode_phys_t *zp = data;
+	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	char *where;
+	time_t when;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	where = znode_path(os, object, path, sizeof (path));
+
+	if (dump_opt['d'] < 3) {
+		(void) printf("\t%s\n", where);
+		return;
+	}
+
+	(void) printf("\tpath	%s\n", where);
+
+	/* Index 0 of each timestamp array is what gets formatted. */
+	when = (time_t)zp->zp_atime[0];
+	(void) printf("\tatime	%s", ctime(&when));
+	when = (time_t)zp->zp_mtime[0];
+	(void) printf("\tmtime	%s", ctime(&when));
+	when = (time_t)zp->zp_ctime[0];
+	(void) printf("\tctime	%s", ctime(&when));
+	when = (time_t)zp->zp_crtime[0];
+	(void) printf("\tcrtime	%s", ctime(&when));
+
+	(void) printf("\tgen	%llu\n", (u_longlong_t)zp->zp_gen);
+	(void) printf("\tmode	%llo\n", (u_longlong_t)zp->zp_mode);
+	(void) printf("\tsize	%llu\n", (u_longlong_t)zp->zp_size);
+	(void) printf("\tparent	%llu\n", (u_longlong_t)zp->zp_parent);
+	(void) printf("\tlinks	%llu\n", (u_longlong_t)zp->zp_links);
+	(void) printf("\txattr	%llu\n", (u_longlong_t)zp->zp_xattr);
+	(void) printf("\trdev	0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+}
+
+/*
+ * Object viewer for ZFS ACL objects.  The body is empty, so nothing
+ * type-specific is printed; all arguments are ignored.
+ */
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Object viewer for DMU objset objects.  The body is empty, so nothing
+ * type-specific is printed; all arguments are ignored.
+ */
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Table of per-type dump routines, indexed by DMU object type.
+ * dump_object() indexes it twice: once with the bonus buffer type (passing
+ * the bonus data and its size) and once with the object type (passing NULL
+ * and 0).  The entries must stay in the same order as the DMU object type
+ * numbering; the trailing comment on each line names the type that slot is
+ * meant for.
+ */
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
+	dump_none,		/* unallocated			*/
+	dump_zap,		/* object directory		*/
+	dump_uint64,		/* object array			*/
+	dump_none,		/* packed nvlist		*/
+	dump_packed_nvlist,	/* packed nvlist size		*/
+	dump_none,		/* bplist			*/
+	dump_none,		/* bplist header		*/
+	dump_none,		/* SPA space map header		*/
+	dump_none,		/* SPA space map		*/
+	dump_none,		/* ZIL intent log		*/
+	dump_dnode,		/* DMU dnode			*/
+	dump_dmu_objset,	/* DMU objset			*/
+	dump_dsl_dir,	/* DSL directory			*/
+	dump_zap,		/* DSL directory child map	*/
+	dump_zap,		/* DSL dataset snap map		*/
+	dump_zap,		/* DSL props			*/
+	dump_dsl_dataset,	/* DSL dataset			*/
+	dump_znode,		/* ZFS znode			*/
+	dump_acl,		/* ZFS ACL			*/
+	dump_uint8,		/* ZFS plain file		*/
+	dump_zap,		/* ZFS directory		*/
+	dump_zap,		/* ZFS master node		*/
+	dump_zap,		/* ZFS delete queue		*/
+	dump_uint8,		/* zvol object			*/
+	dump_zap,		/* zvol prop			*/
+	dump_uint8,		/* other uint8[]		*/
+	dump_uint64,		/* other uint64[]		*/
+	dump_zap,		/* other ZAP			*/
+};
+
+/*
+ * Print one object of objset 'os'.
+ *
+ * A one-line summary (object number, indirection levels, block sizes,
+ * logical and physical size, type, and any non-inherited checksum or
+ * compression setting) is always printed.  Higher 'verbosity' adds:
+ *   > 3   a second line describing the bonus buffer, if there is one;
+ *   >= 4  the type-specific viewers for the bonus buffer and the object;
+ *   >= 5  the indirect block tree and the list of allocated segments.
+ *
+ * '*print_header' is checked on entry: if set, the column header is
+ * printed and the flag cleared.  It is set again after the type-specific
+ * viewers run so that the next object re-prints the header.
+ *
+ * Object 0 is the objset's meta dnode and has no bonus buffer to hold;
+ * for any other object a failed bonus hold is fatal.
+ */
+static void
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
+{
+	dmu_buf_t *db = NULL;
+	dmu_object_info_t doi;
+	dnode_t *dn;
+	void *bonus = NULL;
+	size_t bsize = 0;
+	char iblk[6], dblk[6], lsize[6], psize[6], bonus_size[6], segsize[6];
+	char aux[50];
+	size_t auxlen;
+	int error;
+
+	if (*print_header) {
+		(void) printf("\n    Object  lvl   iblk   dblk  lsize"
+		    "  psize  type\n");
+		*print_header = 0;
+	}
+
+	if (object == 0) {
+		dn = os->os->os_meta_dnode;
+	} else {
+		db = dmu_bonus_hold(os, object);
+		if (db == NULL)
+			fatal("dmu_bonus_hold(%llu) failed",
+			    (u_longlong_t)object);
+		dmu_buf_read(db);
+		bonus = db->db_data;
+		bsize = db->db_size;
+		dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	}
+	dmu_object_info_from_dnode(dn, &doi);
+
+	nicenum(doi.doi_metadata_block_size, iblk);
+	nicenum(doi.doi_data_block_size, dblk);
+	nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
+	    lsize);
+	nicenum(doi.doi_physical_blks << 9, psize);
+	nicenum(doi.doi_bonus_size, bonus_size);
+
+	aux[0] = '\0';
+
+	/*
+	 * Each append is bounded by the space actually left in aux, not by
+	 * the size of the whole array.
+	 */
+	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
+		auxlen = strlen(aux);
+		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
+		    " (K=%s)", zio_checksum_table[doi.doi_checksum].ci_name);
+	}
+
+	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
+		auxlen = strlen(aux);
+		(void) snprintf(aux + auxlen, sizeof (aux) - auxlen,
+		    " (Z=%s)", zio_compress_table[doi.doi_compress].ci_name);
+	}
+
+	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %s%s\n",
+	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
+	    psize, dmu_ot[doi.doi_type].ot_name, aux);
+
+	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
+		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %s\n",
+		    "", "", "", "", bonus_size, "bonus",
+		    dmu_ot[doi.doi_bonus_type].ot_name);
+	}
+
+	if (verbosity >= 4) {
+		/* Bonus viewer gets the bonus data; object viewer gets none. */
+		object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
+		object_viewer[doi.doi_type](os, object, NULL, 0);
+		*print_header = 1;
+	}
+
+	if (verbosity >= 5)
+		dump_indirect(os, object, NULL, 0);
+
+	if (verbosity >= 5) {
+		/*
+		 * Report the list of segments that comprise the object.
+		 */
+		uint64_t start = 0;
+		uint64_t end;
+		uint64_t blkfill = 1;
+		int minlvl = 1;
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			minlvl = 0;
+			blkfill = DNODES_PER_BLOCK;
+		}
+
+		/*
+		 * Alternate the two dnode_next_offset() searches (second
+		 * argument B_FALSE, then B_TRUE) to find the start and the
+		 * end of each segment.  An error from the first search ends
+		 * the loop silently; an error from the second still prints
+		 * the final segment before stopping.
+		 */
+		for (;;) {
+			error = dnode_next_offset(dn, B_FALSE, &start, minlvl,
+			    blkfill);
+			if (error)
+				break;
+			end = start;
+			error = dnode_next_offset(dn, B_TRUE, &end, minlvl,
+			    blkfill);
+			nicenum(end - start, segsize);
+			(void) printf("\t\tsegment [%016llx, %016llx)"
+			    " size %5s\n", (u_longlong_t)start,
+			    (u_longlong_t)end, segsize);
+			if (error)
+				break;
+			start = end;
+		}
+	}
+
+	if (db != NULL)
+		dmu_buf_rele(db);
+}
+
+/*
+ * Printable names for objset types, indexed by the dds_type value that
+ * dump_dir() reads from the objset stats.
+ */
+static char *objset_types[DMU_OST_NUMTYPES] = {
+	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+/*
+ * Dump one objset.
+ *
+ * Prints a one-line summary (name, type, id, creation and last txg, space
+ * referenced, object count and, at -d level 4 or above, the root block
+ * pointer), then the intent log and, if the objset has a dataset, its
+ * deadlist.  Below -d level 2 that is all.  Otherwise either the objects
+ * named on the command line (zopt_object[]) or every object in the objset
+ * is dumped with dump_object().  When walking all objects, the number
+ * visited is asserted to equal the objset's object count, and any
+ * dmu_object_next() result other than ESRCH at the end is fatal.
+ */
+/*ARGSUSED*/
+static void
+dump_dir(objset_t *os)
+{
+	dmu_objset_stats_t dds;
+	uint64_t object, object_count;
+	char numbuf[8];
+	char blkbuf[300];
+	char osname[MAXNAMELEN];
+	char *type = "UNKNOWN";
+	int verbosity = dump_opt['d'];
+	int print_header = 1;
+	int i, error;
+
+	dmu_objset_stats(os, &dds);
+
+	if (dds.dds_type < DMU_OST_NUMTYPES)
+		type = objset_types[dds.dds_type];
+
+	/*
+	 * For the meta objset these stats fields are filled in here from
+	 * the root block pointer and the MOS directory rather than taken
+	 * from dmu_objset_stats().
+	 */
+	if (dds.dds_type == DMU_OST_META) {
+		dds.dds_creation_txg = TXG_INITIAL;
+		dds.dds_last_txg = os->os->os_rootbp.blk_birth;
+		dds.dds_objects_used = os->os->os_rootbp.blk_fill;
+		dds.dds_space_refd =
+		    os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
+	}
+
+	ASSERT3U(dds.dds_objects_used, ==, os->os->os_rootbp.blk_fill);
+
+	nicenum(dds.dds_space_refd, numbuf);
+
+	/* The root block pointer is only shown at -d level 4 or above. */
+	if (verbosity >= 4) {
+		(void) strcpy(blkbuf, ", rootbp ");
+		sprintf_blkptr(blkbuf + strlen(blkbuf), &os->os->os_rootbp);
+	} else {
+		blkbuf[0] = '\0';
+	}
+
+	dmu_objset_name(os, osname);
+
+	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, last_txg %llu, "
+	    "%s, %llu objects%s\n",
+	    osname, type, (u_longlong_t)dmu_objset_id(os),
+	    (u_longlong_t)dds.dds_creation_txg,
+	    (u_longlong_t)dds.dds_last_txg,
+	    numbuf,
+	    (u_longlong_t)dds.dds_objects_used,
+	    blkbuf);
+
+	dump_intent_log(dmu_objset_zil(os));
+
+	if (dmu_objset_ds(os) != NULL)
+		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
+		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
+
+	if (verbosity < 2)
+		return;
+
+	/* Specific objects were requested: dump only those. */
+	if (zopt_objects != 0) {
+		for (i = 0; i < zopt_objects; i++)
+			dump_object(os, zopt_object[i], verbosity,
+			    &print_header);
+		(void) printf("\n");
+		return;
+	}
+
+	/* Object 0 is dumped explicitly and counted before the walk. */
+	dump_object(os, 0, verbosity, &print_header);
+	object_count = 1;
+
+	object = 0;
+	while ((error = dmu_object_next(os, &object, B_FALSE)) == 0) {
+		dump_object(os, object, verbosity, &print_header);
+		object_count++;
+	}
+
+	ASSERT3U(object_count, ==, dds.dds_objects_used);
+
+	(void) printf("\n");
+
+	/* ESRCH is the only result that does not trigger fatal() here. */
+	if (error != ESRCH)
+		fatal("dmu_object_next() = %d", error);
+}
+
+/*
+ * Print the fields of uberblock 'ub': magic, version, txg, guid sum and
+ * timestamp (raw value plus its asctime() rendering in local time).  The
+ * root block pointer is added when the -u level is 3 or higher.
+ */
+static void
+dump_uberblock(uberblock_t *ub)
+{
+	time_t stamp = ub->ub_timestamp;
+	struct tm *broken_down;
+
+	(void) printf("Uberblock\n\n");
+	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
+	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
+	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
+	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
+
+	broken_down = localtime(&stamp);
+	(void) printf("\ttimestamp = %llu UTC = %s",
+	    (u_longlong_t)ub->ub_timestamp, asctime(broken_down));
+
+	if (!(dump_opt['u'] < 3)) {
+		char bpstr[300];
+
+		sprintf_blkptr(bpstr, &ub->ub_rootbp);
+		(void) printf("\trootbp = %s\n", bpstr);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print pool configurations while holding spa_namespace_lock.  With
+ * 'pool' NULL, every pool's name and config nvlist are printed; otherwise
+ * only the config of the pool whose name matches 'pool'.
+ */
+static void
+dump_config(const char *pool)
+{
+	spa_t *cur;
+
+	mutex_enter(&spa_namespace_lock);
+
+	for (cur = spa_next(NULL); cur != NULL; cur = spa_next(cur)) {
+		if (pool == NULL) {
+			(void) printf("%s\n", spa_name(cur));
+			dump_nvlist(cur->spa_config, 4);
+		} else if (strcmp(pool, spa_name(cur)) == 0) {
+			dump_nvlist(cur->spa_config, 4);
+		}
+	}
+
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Read and print each of the VDEV_LABELS labels of device (or file) 'dev'.
+ *
+ * The device size is rounded down to a multiple of sizeof (vdev_label_t)
+ * and passed to vdev_label_offset() to locate each label.  For every label
+ * the packed nvlist in vl_vdev_phys is unpacked and dumped; a label that
+ * cannot be read or unpacked is reported and skipped.  Failure to open or
+ * stat the device prints a message and exits with status 1.
+ */
+static void
+dump_label(const char *dev)
+{
+	int fd;
+	vdev_label_t label;
+	char *buf = label.vl_vdev_phys.vp_nvlist;
+	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
+	struct stat64 statbuf;
+	uint64_t psize;
+	int l;
+
+	if ((fd = open(dev, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+		exit(1);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		(void) printf("failed to stat '%s': %s\n", dev,
+		    strerror(errno));
+		exit(1);
+	}
+
+	psize = statbuf.st_size;
+	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+
+		nvlist_t *config = NULL;
+
+		(void) printf("--------------------------------------------\n");
+		(void) printf("LABEL %d\n", l);
+		(void) printf("--------------------------------------------\n");
+
+		if (pread(fd, &label, sizeof (label),
+		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
+			(void) printf("failed to read label %d\n", l);
+			continue;
+		}
+
+		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
+			(void) printf("failed to unpack label %d\n", l);
+			continue;
+		}
+		dump_nvlist(config, 4);
+		nvlist_free(config);
+	}
+
+	/* Release the descriptor now that every label has been processed. */
+	(void) close(fd);
+}
+
+/*
+ * Open the dataset named 'dsname' read-only, dump it with dump_dir() and
+ * close it again.  A dataset that cannot be opened is reported and
+ * skipped.  'arg' is unused.
+ */
+/*ARGSUSED*/
+static void
+dump_one_dir(char *dsname, void *arg)
+{
+	objset_t *objset;
+
+	if (dmu_objset_open(dsname, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &objset) != 0) {
+		(void) printf("Could not open %s\n", dsname);
+		return;
+	}
+
+	dump_dir(objset);
+	dmu_objset_close(objset);
+}
+
+/*
+ * Load the on-disk space map of every metaslab of every top-level vdev
+ * into that metaslab's ms_allocmap[0], as SM_ALLOC, under the metaslab
+ * lock.  Any load failure is fatal; the message identifies both the vdev
+ * index and the metaslab index that failed.
+ */
+static void
+zdb_space_map_load(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd;
+	int c, m, error;
+
+	for (c = 0; c < rvd->vdev_children; c++) {
+		vd = rvd->vdev_child[c];
+		for (m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+			space_map_t *sm = &msp->ms_allocmap[0];
+			mutex_enter(&msp->ms_lock);
+			error = space_map_load(sm, msp->ms_smo, SM_ALLOC,
+			    spa->spa_meta_objset, msp->ms_usable_end,
+			    sm->sm_size - msp->ms_usable_space);
+			mutex_exit(&msp->ms_lock);
+			if (error)
+				fatal("%s bad space map: vdev %d, "
+				    "metaslab %d, error %d",
+				    spa->spa_name, c, m, error);
+		}
+	}
+}
+
+/*
+ * Account for the space referenced by the first DVA of 'bp' by moving it
+ * from the owning metaslab's ms_allocmap[0] to its ms_freemap[0].
+ *
+ * For a gang block, the gang header is read from disk, each non-empty
+ * block pointer inside it is claimed recursively, and then the space of
+ * the gang header itself is claimed.
+ *
+ * Returns 0 on success, or:
+ *   ENXIO   the vdev cannot be found or the offset is beyond its metaslabs;
+ *   EAGAIN  the range is already in the free map (claimed twice);
+ *   ESTALE  the range is not in the alloc map;
+ *   or the error from reading the gang header / claiming a gang member.
+ */
+static int
+zdb_space_map_claim(spa_t *spa, blkptr_t *bp)
+{
+	dva_t *dva = &bp->blk_dva[0];
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+	metaslab_t *msp;
+	space_map_t *allocmap, *freemap;
+	int error;
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+		return (ENXIO);
+
+	/* Reject offsets that would index past the vdev's metaslab array. */
+	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+		return (ENXIO);
+
+	if (DVA_GET_GANG(dva)) {
+		zio_gbh_phys_t gbh;
+		blkptr_t blk = *bp;
+		int g;
+
+		/*
+		 * Rewrite a private copy of the block pointer so that it
+		 * describes the gang header itself: an uncompressed,
+		 * non-gang block of SPA_GANGBLOCKSIZE bytes with the gang
+		 * header checksum.  'size' becomes the header's allocated
+		 * size, which is what gets claimed below.
+		 */
+		/* LINTED - compile time assert */
+		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+		DVA_SET_GANG(&blk.blk_dva[0], 0);
+		DVA_SET_ASIZE(&blk.blk_dva[0], size);
+		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
+		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
+		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
+		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
+		error = zio_wait(zio_read(NULL, spa, &blk,
+		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD));
+		if (error)
+			return (error);
+		if (BP_SHOULD_BYTESWAP(&blk))
+			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
+		/* Claim each gang member; the first empty slot ends the list. */
+		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+			if (gbh.zg_blkptr[g].blk_birth == 0)
+				break;
+			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g]);
+			if (error)
+				return (error);
+		}
+	}
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	allocmap = &msp->ms_allocmap[0];
+	freemap = &msp->ms_freemap[0];
+
+	mutex_enter(&msp->ms_lock);
+	if (space_map_contains(freemap, offset, size)) {
+		mutex_exit(&msp->ms_lock);
+		return (EAGAIN);	/* allocated more than once */
+	}
+
+	if (!space_map_contains(allocmap, offset, size)) {
+		mutex_exit(&msp->ms_lock);
+		return (ESTALE);	/* not allocated at all */
+	}
+
+	/* Move the range from "allocated, unclaimed" to "claimed". */
+	space_map_remove(allocmap, offset, size);
+	space_map_add(freemap, offset, size);
+
+	mutex_exit(&msp->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Callback handed to space_map_vacate() for a metaslab's ms_allocmap[0]:
+ * report the segment [start, start + size) as leaked.  The owning
+ * metaslab is recovered from 'sm' by subtracting the offset of
+ * ms_allocmap[0] within metaslab_t.
+ */
+static void
+zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *base = (char *)sm - offsetof(metaslab_t, ms_allocmap[0]);
+	/* LINTED */
+	metaslab_t *owner = (metaslab_t *)base;
+	vdev_t *vd = owner->ms_group->mg_vd;
+
+	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+	    (u_longlong_t)vd->vdev_id,
+	    (u_longlong_t)start,
+	    (u_longlong_t)size);
+}
+
+/*
+ * Empty the ms_allocmap[0] and ms_freemap[0] of every metaslab in the
+ * pool.  Whatever is still in an alloc map is passed to zdb_leak() to be
+ * reported; the free map is emptied with no callback.
+ */
+static void
+zdb_space_map_vacate(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	int child;
+
+	for (child = 0; child < root->vdev_children; child++) {
+		vdev_t *top = root->vdev_child[child];
+		int slab = 0;
+
+		while (slab < top->vdev_ms_count) {
+			metaslab_t *ms = top->vdev_ms[slab];
+			space_map_t *allocated = &ms->ms_allocmap[0];
+			space_map_t *claimed = &ms->ms_freemap[0];
+
+			mutex_enter(&ms->ms_lock);
+			space_map_vacate(allocated, zdb_leak, allocated);
+			space_map_vacate(claimed, NULL, NULL);
+			mutex_exit(&ms->ms_lock);
+
+			slab++;
+		}
+	}
+}
+
+/*
+ * Reopen the pool's vdevs and reload the uberblock from them.  If the
+ * loaded uberblock has a non-zero txg it replaces spa->spa_ubsync;
+ * otherwise spa_ubsync is left alone.
+ */
+static void
+zdb_refresh_ubsync(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	uberblock_t fresh = { 0 };
+	zio_t *rootzio;
+
+	/*
+	 * Reopen all devices to purge zdb's vdev caches.
+	 */
+	vdev_reopen(root, NULL);
+
+	/*
+	 * Reload the uberblock.
+	 */
+	rootzio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	vdev_uberblock_load(rootzio, root, &fresh);
+	(void) zio_wait(rootzio);
+
+	/* A txg of zero means the zero-initialized copy was not updated. */
+	if (fresh.ub_txg == 0) {
+		return;
+	}
+
+	spa->spa_ubsync = fresh;
+}
+
+/*
+ * Verify that the sum of the sizes of all blocks in the pool adds up
+ * to the SPA's sa_alloc total.
+ */
+/*
+ * Running totals for one (level, type) bucket of blocks; updated once per
+ * counted block pointer by zdb_count_block().
+ */
+typedef struct zdb_blkstats {
+	uint64_t	zb_asize;	/* sum of BP_GET_ASIZE() */
+	uint64_t	zb_lsize;	/* sum of BP_GET_LSIZE() */
+	uint64_t	zb_psize;	/* sum of BP_GET_PSIZE() */
+	uint64_t	zb_count;	/* number of block pointers counted */
+} zdb_blkstats_t;
+
+/*
+ * Extra indices into zdb_cb_t's zcb_type[][] table:
+ *   DMU_OT_DEFERRED  type slot used for blocks on the deferred-free bplist
+ *                    (shares the DMU_OT_NONE slot);
+ *   DMU_OT_TOTAL     type slot that accumulates all types;
+ *   ZB_TOTAL         level slot that accumulates all levels.
+ */
+#define	DMU_OT_DEFERRED	DMU_OT_NONE
+#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
+
+#define	ZB_TOTAL	ZB_MAXLEVEL
+
+/*
+ * State shared by the block-traversal callbacks (zdb_blkptr_cb() and
+ * zdb_log_block_cb()) during a block statistics run.
+ */
+typedef struct zdb_cb {
+	/* per-[level][type] totals, with extra ZB_TOTAL/DMU_OT_TOTAL slots */
+	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
+	/* count of skipped read errors, indexed by errno value */
+	uint64_t	zcb_errors[256];
+	/* traversal cache entry of the objset whose ZIL is being parsed */
+	traverse_blk_cache_t *zcb_cache;
+	/* read failures seen since the last successful read */
+	int		zcb_readfails;
+	/* set once any error has been recorded in zcb_errors[] */
+	int		zcb_haderrors;
+} zdb_cb_t;
+
+/* Declared ahead of its definition: zdb_log_block_cb() calls it first. */
+static blkptr_cb_t zdb_blkptr_cb;
+
+/*
+ * Add block pointer 'bp' to the statistics in 'zcb' and, unless -L was
+ * given, claim its space in the loaded space maps.
+ *
+ * The block is counted in four buckets: (its level, all types),
+ * (its level, 'type'), (all levels, all types) and (all levels, 'type').
+ * A failed claim is reported through fatal(), with a specific message for
+ * EAGAIN (double allocation) and ESTALE (reference to a freed block).
+ */
+static void
+zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type)
+{
+	int levels[2];
+	int types[2];
+	int li, ti, rc;
+
+	levels[0] = BP_GET_LEVEL(bp);
+	levels[1] = ZB_TOTAL;
+	types[0] = DMU_OT_TOTAL;
+	types[1] = type;
+
+	for (li = 0; li < 2; li++) {
+		for (ti = 0; ti < 2; ti++) {
+			zdb_blkstats_t *bucket =
+			    &zcb->zcb_type[levels[li]][types[ti]];
+
+			bucket->zb_asize += BP_GET_ASIZE(bp);
+			bucket->zb_lsize += BP_GET_LSIZE(bp);
+			bucket->zb_psize += BP_GET_PSIZE(bp);
+			bucket->zb_count++;
+		}
+	}
+
+	if (dump_opt['L']) {
+		return;
+	}
+
+	rc = zdb_space_map_claim(spa, bp);
+	if (rc == 0) {
+		return;
+	}
+
+	switch (rc) {
+	case EAGAIN:
+		(void) fatal("double-allocation, bp=%p", bp);
+		break;
+	case ESTALE:
+		(void) fatal("reference to freed block, bp=%p", bp);
+		break;
+	default:
+		break;
+	}
+
+	(void) fatal("fatal error %d in bp %p", rc, bp);
+}
+
+/*
+ * zil_parse() callback for intent log blocks.  Blocks born at or after
+ * 'first_txg' are ignored.  For an older block, a copy of the cached
+ * traversal entry (zcb->zcb_cache) is made, its bookmark's objset and
+ * blkid are taken from words 2 and 3 of the block's checksum, its block
+ * pointer is replaced by 'bp', and the copy is fed to zdb_blkptr_cb().
+ */
+static void
+zdb_log_block_cb(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t first_txg)
+{
+	zdb_cb_t *state = arg;
+	traverse_blk_cache_t entry;
+
+	if (!(bp->blk_birth < first_txg)) {
+		return;
+	}
+
+	entry = *state->zcb_cache;
+	entry.bc_bookmark.zb_objset = bp->blk_cksum.zc_word[2];
+	entry.bc_bookmark.zb_blkid = bp->blk_cksum.zc_word[3];
+	entry.bc_blkptr = *bp;
+
+	(void) zdb_blkptr_cb(&entry, zilog->zl_spa, arg);
+}
+
+/*
+ * Traversal callback for block statistics; 'arg' is the zdb_cb_t.
+ *
+ * On a read error (bc->bc_errno set) the error is printed and either:
+ *   - with -L and fewer than 10 failures since the last good read, the
+ *     uberblock is refreshed and EAGAIN is returned ("retrying"); or
+ *   - the error is tallied in zcb_errors[] and ERESTART is returned
+ *     ("skipping").
+ *
+ * On success the failure counter is reset, the block is counted (and its
+ * space claimed) via zdb_count_block(), and at -b level 4 or above the
+ * block is printed.  For an objset block, the objset's intent log is also
+ * parsed so its blocks are counted through zdb_log_block_cb().  Returns 0.
+ */
+static int
+zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	zdb_cb_t *zcb = arg;
+	blkptr_t *bp = &bc->bc_blkptr;
+	dmu_object_type_t type = BP_GET_TYPE(bp);
+	char blkbuf[300];
+	int error = 0;
+
+	if (bc->bc_errno) {
+		/*
+		 * zcb_readfails is incremented on every failure, whether or
+		 * not -L is set, because the ++ is evaluated before the &&.
+		 */
+		if (zcb->zcb_readfails++ < 10 && dump_opt['L']) {
+			zdb_refresh_ubsync(spa);
+			error = EAGAIN;
+		} else {
+			zcb->zcb_haderrors = 1;
+			zcb->zcb_errors[bc->bc_errno]++;
+			error = ERESTART;
+		}
+
+		/* Include the block pointer text only at -b level 2 or above. */
+		if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno))
+			sprintf_blkptr(blkbuf, bp);
+		else
+			blkbuf[0] = '\0';
+
+		(void) printf("zdb_blkptr_cb: Got error %d reading "
+		    "<%llu, %llu, %d, %llx> %s -- %s\n",
+		    bc->bc_errno,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid,
+		    blkbuf,
+		    error == EAGAIN ? "retrying" : "skipping");
+
+		return (error);
+	}
+
+	/* A successful read resets the failure counter. */
+	zcb->zcb_readfails = 0;
+
+	ASSERT(bp->blk_birth != 0);
+
+	zdb_count_block(spa, zcb, bp, type);
+
+	if (dump_opt['b'] >= 4) {
+		sprintf_blkptr(blkbuf, bp);
+		(void) printf("objset %llu object %llu offset 0x%llx %s\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (u_longlong_t)blkid2offset(bc->bc_dnode,
+			zb->zb_level, zb->zb_blkid),
+		    blkbuf);
+	}
+
+	if (type == DMU_OT_OBJSET) {
+		/*
+		 * Build a minimal stack zilog_t (only the header and spa
+		 * fields are set) so zil_parse() can walk this objset's
+		 * intent log.
+		 */
+		objset_phys_t *osphys = bc->bc_data;
+		zilog_t zilog = { 0 };
+		zilog.zl_header = &osphys->os_zil_header;
+		zilog.zl_spa = spa;
+
+		/* zdb_log_block_cb() copies this entry for each log block. */
+		zcb->zcb_cache = bc;
+
+		zil_parse(&zilog, zdb_log_block_cb, NULL, zcb,
+		    spa_first_txg(spa));
+	}
+
+	return (0);
+}
+
+static int
+dump_block_stats(spa_t *spa)
+{
+	traverse_handle_t *th;
+	zdb_cb_t zcb = { 0 };
+	zdb_blkstats_t *zb, *tzb;
+	uint64_t alloc, space;
+	int leaks = 0;
+	int advance = zdb_advance;
+	int flags;
+	int e;
+
+	if (dump_opt['c'])
+		advance |= ADVANCE_DATA;
+
+	advance |= ADVANCE_PRUNE;
+
+	(void) printf("\nTraversing all blocks to %sverify"
+	    " nothing leaked ...\n",
+	    dump_opt['c'] ? "verify checksums and " : "");
+
+	/*
+	 * Load all space maps.  As we traverse the pool, if we find a block
+	 * that's not in its space map, that indicates a double-allocation,
+	 * reference to a freed block, or an unclaimed block.  Otherwise we
+	 * remove the block from the space map.  If the space maps are not
+	 * empty when we're done, that indicates leaked blocks.
+	 */
+	if (!dump_opt['L'])
+		zdb_space_map_load(spa);
+
+	/*
+	 * If there's a deferred-free bplist, process that first.
+	 */
+	if (spa->spa_sync_bplist_obj != 0) {
+		bplist_t *bpl = &spa->spa_sync_bplist;
+		blkptr_t blk;
+		uint64_t itor = 0;
+
+		bplist_open(bpl, spa->spa_meta_objset,
+		    spa->spa_sync_bplist_obj);
+
+		while (bplist_iterate(bpl, &itor, &blk) == 0) {
+			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
+			if (dump_opt['b'] >= 4) {
+				char blkbuf[300];
+				sprintf_blkptr(blkbuf, &blk);
+				(void) printf("[%s] %s\n",
+				    "deferred free", blkbuf);
+			}
+		}
+
+		bplist_close(bpl);
+	}
+
+	/*
+	 * Now traverse the pool.  If we're read all data to verify checksums,
+	 * do a scrubbing read so that we validate all copies.
+	 */
+	flags = ZIO_FLAG_CANFAIL;
+	if (advance & ADVANCE_DATA)
+		flags |= ZIO_FLAG_SCRUB;
+	th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags);
+	th->th_noread = zdb_noread;
+
+	traverse_add_pool(th, 0, -1ULL);
+
+	while (traverse_more(th) == EAGAIN)
+		continue;
+
+	traverse_fini(th);
+
+	if (zcb.zcb_haderrors) {
+		(void) printf("\nError counts:\n\n");
+		(void) printf("\t%5s  %s\n", "errno", "count");
+		for (e = 0; e < 256; e++) {
+			if (zcb.zcb_errors[e] != 0) {
+				(void) printf("\t%5d  %llu\n",
+				    e, (u_longlong_t)zcb.zcb_errors[e]);
+			}
+		}
+	}
+
+	/*
+	 * Report any leaked segments.
+	 */
+	if (!dump_opt['L'])
+		zdb_space_map_vacate(spa);
+
+	if (dump_opt['L'])
+		(void) printf("\n\n *** Live pool traversal; "
+		    "block counts are only approximate ***\n\n");
+
+	alloc = spa_get_alloc(spa);
+	space = spa_get_space(spa);
+
+	tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
+
+	if (tzb->zb_asize == alloc) {
+		(void) printf("\n\tNo leaks (block sum matches space"
+		    " maps exactly)\n");
+	} else {
+		(void) printf("block traversal size %llu != alloc %llu "
+		    "(leaked %lld)\n",
+		    (u_longlong_t)tzb->zb_asize,
+		    (u_longlong_t)alloc,
+		    (u_longlong_t)(alloc - tzb->zb_asize));
+		leaks = 1;
+	}
+
+	if (tzb->zb_count == 0)
+		return (2);
+
+	(void) printf("\n");
+	(void) printf("\tbp count:      %10llu\n",
+	    (u_longlong_t)tzb->zb_count);
+	(void) printf("\tbp logical:    %10llu\t avg: %6llu\n",
+	    (u_longlong_t)tzb->zb_lsize,
+	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+	(void) printf("\tbp physical:   %10llu\t avg:"
+	    " %6llu\tcompression: %6.2f\n",
+	    (u_longlong_t)tzb->zb_psize,
+	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_psize);
+	(void) printf("\tbp allocated:  %10llu\t avg:"
+	    " %6llu\tcompression: %6.2f\n",
+	    (u_longlong_t)tzb->zb_asize,
+	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_asize);
+	(void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
+	    (u_longlong_t)alloc, 100.0 * alloc / space);
+
+	if (dump_opt['b'] >= 2) {
+		int l, t, level;
+		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+		    "\t  avg\t comp\t%%Total\tType\n");
+
+		for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
+			char csize[6], lsize[6], psize[6], asize[6], avg[6];
+			char *typename;
+
+			typename = t == DMU_OT_DEFERRED ? "deferred free" :
+			    t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+
+			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+				(void) printf("%6s\t%5s\t%5s\t%5s"
+				    "\t%5s\t%5s\t%6s\t%s\n",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    typename);
+				continue;
+			}
+
+			for (l = ZB_TOTAL - 1; l >= -1; l--) {
+				level = (l == -1 ? ZB_TOTAL : l);
+				zb = &zcb.zcb_type[level][t];
+
+				if (zb->zb_asize == 0)
+					continue;
+
+				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+					continue;
+
+				if (level == 0 && zb->zb_asize ==
+				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+					continue;
+
+				nicenum(zb->zb_count, csize);
+				nicenum(zb->zb_lsize, lsize);
+				nicenum(zb->zb_psize, psize);
+				nicenum(zb->zb_asize, asize);
+				nicenum(zb->zb_asize / zb->zb_count, avg);
+
+				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+				    "\t%5.2f\t%6.2f\t",
+				    csize, lsize, psize, asize, avg,
+				    (double)zb->zb_lsize / zb->zb_psize,
+				    100.0 * zb->zb_asize / tzb->zb_asize);
+
+				if (level == ZB_TOTAL)
+					(void) printf("%s\n", typename);
+				else
+					(void) printf("    L%d %s\n",
+					    level, typename);
+			}
+		}
+	}
+
+	(void) printf("\n");
+
+	if (leaks)
+		return (2);
+
+	if (zcb.zcb_haderrors)
+		return (3);
+
+	return (0);
+}
+
+/*
+ * Dump everything selected on the command line for an open pool.
+ *
+ * The sections are emitted in a fixed order: uberblock ('u'), object sets
+ * ('d' or 'i'), block statistics ('b' or 'c'), then pool statistics ('s').
+ * A nonzero status from dump_block_stats() is held back until the remaining
+ * sections have been printed and is then used as the process exit status.
+ */
+static void
+dump_zpool(spa_t *spa)
+{
+	dsl_pool_t *dsl = spa_get_dsl(spa);
+	int status = 0;
+
+	if (dump_opt['u'] != 0)
+		dump_uberblock(&spa->spa_uberblock);
+
+	if (dump_opt['d'] != 0 || dump_opt['i'] != 0) {
+		/* The meta objset comes first. */
+		dump_dir(dsl->dp_meta_objset);
+
+		/* Pool-wide bookkeeping only at 'd' verbosity 3 and up. */
+		if (dump_opt['d'] > 2) {
+			dump_bplist(dsl->dp_meta_objset,
+			    spa->spa_sync_bplist_obj, "Deferred frees");
+			dump_dtl(spa->spa_root_vdev, 0);
+			dump_metaslabs(spa);
+		}
+
+		/* dump_one_dir() runs on whatever the search visits. */
+		dmu_objset_find(spa->spa_name, dump_one_dir, NULL,
+		    DS_FIND_SNAPSHOTS);
+	}
+
+	if (dump_opt['b'] != 0 || dump_opt['c'] != 0)
+		status = dump_block_stats(spa);
+
+	if (dump_opt['s'] != 0)
+		show_pool_stats(spa);
+
+	if (status == 0)
+		return;
+
+	exit(status);
+}
+
+/*
+ * zdb entry point.
+ *
+ * Parses the options, then treats the first operand as the thing to dump:
+ * a name containing '/' is opened as an object set, any other name as a
+ * pool.  Remaining operands are object numbers collected into
+ * zopt_object[].
+ *
+ * Options handled here:
+ *	u d i b c s C l	select a dump section; each use raises its level
+ *	L		raises dump_opt['L'] without deselecting "dump all"
+ *	v		added to the level of every selected section
+ *	O [!]name	set (or, with '!', clear) a traversal flag; name is
+ *			one of post, pre, prune, data, holes
+ *	B o:o:l:b	block to simulate as unreadable: objset, object and
+ *			level in any strtoull() base-0 form, block id in hex,
+ *			with a single delimiter character between fields
+ *	U		use "/tmp" as spa_config_dir
+ *
+ * If no section is selected, every section except 'L' and 'l' is enabled.
+ *
+ * NOTE(review): like the rest of this function, the error paths assume that
+ * usage() and fatal() do not return -- confirm against their definitions.
+ */
+int
+main(int argc, char **argv)
+{
+	int i, c;
+	struct rlimit rl = { 1024, 1024 };
+	spa_t *spa;
+	objset_t *os = NULL;
+	char *endstr;
+	int dump_all = 1;
+	int verbose = 0;
+	int error;
+	int flag, set;
+
+	(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+	dprintf_setup(&argc, argv);
+
+	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
+		switch (c) {
+		case 'u':
+		case 'd':
+		case 'i':
+		case 'b':
+		case 'c':
+		case 's':
+		case 'C':
+		case 'l':
+			dump_opt[c]++;
+			dump_all = 0;
+			break;
+		case 'L':
+			dump_opt[c]++;
+			break;
+		case 'O':
+			endstr = optarg;
+			if (endstr[0] == '!') {
+				endstr++;
+				set = 0;
+			} else {
+				set = 1;
+			}
+			/* "post" is expressed as the absence of ADVANCE_PRE. */
+			if (strcmp(endstr, "post") == 0) {
+				flag = ADVANCE_PRE;
+				set = !set;
+			} else if (strcmp(endstr, "pre") == 0) {
+				flag = ADVANCE_PRE;
+			} else if (strcmp(endstr, "prune") == 0) {
+				flag = ADVANCE_PRUNE;
+			} else if (strcmp(endstr, "data") == 0) {
+				flag = ADVANCE_DATA;
+			} else if (strcmp(endstr, "holes") == 0) {
+				flag = ADVANCE_HOLES;
+			} else {
+				usage();
+			}
+			if (set)
+				zdb_advance |= flag;
+			else
+				zdb_advance &= ~flag;
+			break;
+		case 'B':
+			/*
+			 * Each field after the first is parsed starting one
+			 * character past where the previous one stopped.  If
+			 * the previous field stopped at the terminating NUL
+			 * there is no delimiter to skip, and stepping over
+			 * it would read beyond the argument, so reject the
+			 * option instead.
+			 */
+			endstr = optarg;
+			zdb_noread.zb_objset = strtoull(endstr, &endstr, 0);
+			if (*endstr == '\0')
+				usage();
+			zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0);
+			if (*endstr == '\0')
+				usage();
+			zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0);
+			if (*endstr == '\0')
+				usage();
+			zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16);
+			(void) printf("simulating bad block "
+			    "<%llu, %llu, %d, %llx>\n",
+			    (u_longlong_t)zdb_noread.zb_objset,
+			    (u_longlong_t)zdb_noread.zb_object,
+			    zdb_noread.zb_level,
+			    (u_longlong_t)zdb_noread.zb_blkid);
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'U':
+			spa_config_dir = "/tmp";
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	kernel_init(FREAD);
+
+	/*
+	 * With no explicit selection, turn on everything except 'L' and
+	 * 'l'; then fold the -v count into every enabled section's level.
+	 */
+	for (c = 0; c < 256; c++) {
+		if (dump_all && c != 'L' && c != 'l')
+			dump_opt[c] = 1;
+		if (dump_opt[c])
+			dump_opt[c] += verbose;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		/* -C alone dumps the configuration without a pool name. */
+		if (dump_opt['C']) {
+			dump_config(NULL);
+			return (0);
+		}
+		usage();
+	}
+
+	/* -l treats the operand as something to read labels from. */
+	if (dump_opt['l']) {
+		dump_label(argv[0]);
+		return (0);
+	}
+
+	if (dump_opt['C'])
+		dump_config(argv[0]);
+
+	if (strchr(argv[0], '/') != NULL) {
+		error = dmu_objset_open(argv[0], DMU_OST_ANY,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	} else {
+		error = spa_open(argv[0], &spa, FTAG);
+	}
+
+	if (error)
+		fatal("can't open %s: error %d", argv[0], error);
+
+	argv++;
+	if (--argc > 0) {
+		zopt_objects = argc;
+		zopt_object = calloc(zopt_objects, sizeof (uint64_t));
+		if (zopt_object == NULL)
+			fatal("can't allocate object list: %s",
+			    strerror(errno));
+		for (i = 0; i < zopt_objects; i++) {
+			/*
+			 * Reject operands that are not entirely a number
+			 * (nothing converted, or trailing characters) as
+			 * well as anything strtoull() itself flags, such as
+			 * out-of-range values.
+			 */
+			errno = 0;
+			zopt_object[i] = strtoull(argv[i], &endstr, 0);
+			if (errno == 0 &&
+			    (endstr == argv[i] || *endstr != '\0'))
+				errno = EINVAL;
+			if (errno != 0)
+				fatal("bad object number %s: %s",
+				    argv[i], strerror(errno));
+		}
+	}
+
+	if (os != NULL) {
+		dump_dir(os);
+		dmu_objset_close(os);
+	} else {
+		dump_zpool(spa);
+		spa_close(spa, FTAG);
+	}
+
+	kernel_fini();
+
+	return (0);
+}
diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c
new file mode 100644
index 000000000000..924d4a1dece6
--- /dev/null
+++ b/usr/src/cmd/zdb/zdb_il.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Print intent log header and statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+
+extern uint8_t dump_opt[256];
+
+/*
+ * Print a block pointer on its own line, preceded by 'prefix'.
+ *
+ * sprintf_blkptr() takes no length argument, so the buffer has to be big
+ * enough for anything it can produce.  It is sized at 300 bytes to agree
+ * with the buffer zdb.c hands to the same routine.
+ */
+static void
+print_log_bp(blkptr_t *bp, const char *prefix)
+{
+	char blkbuf[300];
+
+	sprintf_blkptr(blkbuf, bp);
+	(void) printf("%s%s\n", prefix, blkbuf);
+}
+
+/*
+ * Print the details of a create-style record (the record table uses this
+ * handler for TX_CREATE, TX_MKDIR, TX_MKXATTR and TX_SYMLINK).  The new
+ * name is stored immediately after the fixed-size record; for a symlink
+ * the link target follows the name's terminating NUL.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
+{
+	char *fname = (char *)(lr + 1);
+	time_t created = lr->lr_crtime[0];
+
+	if (txtype != TX_SYMLINK) {
+		(void) printf("\t\t\t%s\n", fname);
+	} else {
+		char *target = fname + strlen(fname) + 1;
+
+		(void) printf("\t\t\t%s -> %s\n", fname, target);
+	}
+
+	(void) printf("\t\t\t%s", ctime(&created));
+
+	(void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+	    (u_longlong_t)lr->lr_doid,
+	    (u_longlong_t)lr->lr_foid,
+	    (longlong_t)lr->lr_mode);
+
+	(void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+	    (u_longlong_t)lr->lr_uid,
+	    (u_longlong_t)lr->lr_gid,
+	    (u_longlong_t)lr->lr_gen,
+	    (u_longlong_t)lr->lr_rdev);
+}
+
+/*
+ * Print the details of a remove-style record (used for TX_REMOVE and
+ * TX_RMDIR in the record table): the parent directory object and the
+ * name, which is stored right after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
+{
+	char *victim = (char *)(lr + 1);
+	u_longlong_t dir_obj = (u_longlong_t)lr->lr_doid;
+
+	(void) printf("\t\t\tdoid %llu, name %s\n", dir_obj, victim);
+}
+
+/*
+ * Print the details of a TX_LINK record: the directory object, the object
+ * being linked, and the new name stored right after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
+{
+	char *new_name = (char *)(lr + 1);
+	u_longlong_t dir_obj = (u_longlong_t)lr->lr_doid;
+	u_longlong_t target_obj = (u_longlong_t)lr->lr_link_obj;
+
+	(void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+	    dir_obj, target_obj, new_name);
+}
+
+/*
+ * Print the details of a TX_RENAME record.  Two NUL-terminated names are
+ * stored back to back after the fixed-size record: source, then target.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
+{
+	char *from = (char *)(lr + 1);
+	char *to = from + strlen(from) + 1;
+	u_longlong_t from_dir = (u_longlong_t)lr->lr_sdoid;
+	u_longlong_t to_dir = (u_longlong_t)lr->lr_tdoid;
+
+	(void) printf("\t\t\tsdoid %llu, tdoid %llu\n", from_dir, to_dir);
+	(void) printf("\t\t\tsrc %s tgt %s\n", from, to);
+}
+
+/*
+ * Print the details of a TX_WRITE record and, at verbosity 5 and above,
+ * a dump of the data that was written.
+ *
+ * When the record is exactly sizeof (lr_write_t) long it carries a block
+ * pointer rather than inline data: the block is read into a local buffer
+ * (or treated as zeros when its birth txg is 0) and the dump starts at
+ * lr_blkoff within it.  Otherwise the data follows the record header.
+ * At most 20 bytes are shown below verbosity 6.  A failed block read ends
+ * the function without printing any data.
+ *
+ * Bytes are handled as unsigned char: handing a negative char to isprint()
+ * is undefined, and printing one with %X would sign-extend it.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
+{
+	char *data, *dlimit;
+	blkptr_t *bp = &lr->lr_blkptr;
+	char buf[SPA_MAXBLOCKSIZE];
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	uint64_t dlen;
+	int error;
+
+	(void) printf("\t\t\tfoid %llu, offset 0x%llx,"
+	    " length 0x%llx, blkoff 0x%llx\n",
+	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+
+	if (verbose < 5)
+		return;
+
+	dlen = MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
+
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		(void) printf("\t\t\thas blkptr, %s\n",
+		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
+		    "will claim" : "won't claim");
+		print_log_bp(bp, "\t\t\t");
+		if (bp->blk_birth == 0) {
+			bzero(buf, sizeof (buf));
+		} else {
+			error = zio_wait(zio_read(NULL, zilog->zl_spa,
+			    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL));
+			if (error)
+				return;
+		}
+
+		/*
+		 * lr_blkoff and lr_length come from the log record; never
+		 * let them carry the dump past the end of buf.
+		 */
+		if (lr->lr_blkoff >= sizeof (buf)) {
+			data = buf;
+			dlen = 0;
+		} else {
+			data = buf + lr->lr_blkoff;
+			dlen = MIN(dlen, sizeof (buf) - lr->lr_blkoff);
+		}
+	} else {
+		data = (char *)(lr + 1);
+	}
+
+	dlimit = data + dlen;
+
+	(void) printf("\t\t\t");
+	while (data < dlimit) {
+		unsigned char ch = (unsigned char)*data;
+
+		if (isprint(ch))
+			(void) printf("%c ", ch);
+		else
+			(void) printf("%2X", ch);
+		data++;
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print the details of a TX_TRUNCATE record: the file object plus the
+ * offset and length fields, both in hex.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
+{
+	u_longlong_t file_obj = (u_longlong_t)lr->lr_foid;
+	longlong_t start = (longlong_t)lr->lr_offset;
+	u_longlong_t span = (u_longlong_t)lr->lr_length;
+
+	(void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+	    file_obj, start, span);
+}
+
+/*
+ * Print the details of a TX_SETATTR record: the file object and attribute
+ * mask, followed by one line for each of the mode, uid, gid, size, atime
+ * and mtime attributes whose bit is set in the mask.  Times are shown both
+ * as the two raw numbers and as ctime() text, which supplies the newline.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
+{
+	(void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+	if (lr->lr_mask & AT_MODE)
+		(void) printf("\t\t\tAT_MODE  %llo\n",
+		    (longlong_t)lr->lr_mode);
+
+	if (lr->lr_mask & AT_UID)
+		(void) printf("\t\t\tAT_UID   %llu\n",
+		    (u_longlong_t)lr->lr_uid);
+
+	if (lr->lr_mask & AT_GID)
+		(void) printf("\t\t\tAT_GID   %llu\n",
+		    (u_longlong_t)lr->lr_gid);
+
+	if (lr->lr_mask & AT_SIZE)
+		(void) printf("\t\t\tAT_SIZE  %llu\n",
+		    (u_longlong_t)lr->lr_size);
+
+	if (lr->lr_mask & AT_ATIME) {
+		time_t accessed = (time_t)lr->lr_atime[0];
+
+		(void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+		    (u_longlong_t)lr->lr_atime[0],
+		    (u_longlong_t)lr->lr_atime[1],
+		    ctime(&accessed));
+	}
+
+	if (lr->lr_mask & AT_MTIME) {
+		time_t modified = (time_t)lr->lr_mtime[0];
+
+		(void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+		    (u_longlong_t)lr->lr_mtime[0],
+		    (u_longlong_t)lr->lr_mtime[1],
+		    ctime(&modified));
+	}
+}
+
+/*
+ * Print the details of a TX_ACL record: the file object and the ACL
+ * entry count carried in the record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
+{
+	u_longlong_t file_obj = (u_longlong_t)lr->lr_foid;
+	u_longlong_t entries = (u_longlong_t)lr->lr_aclcnt;
+
+	(void) printf("\t\t\tfoid %llu, aclcnt %llu\n", file_obj, entries);
+}
+
+/*
+ * Per-record-type print routine.  Declared without a parameter list
+ * because each handler takes a different record structure as its third
+ * argument; print_log_record() calls it as (zilog, txtype, lr).
+ */
+typedef void (*zil_prt_rec_func_t)();
+
+/*
+ * One entry per intent log record type.
+ */
+typedef struct zil_rec_info {
+	zil_prt_rec_func_t	zri_print;	/* detail printer, or NULL */
+	char			*zri_name;	/* fixed-width display name */
+	uint64_t		zri_count;	/* records seen in this log */
+} zil_rec_info_t;
+
+/*
+ * Table indexed by the record's lrc_txtype.  Slot 0 has no printer and is
+ * used for the running total across all types; print_log_record() bumps
+ * it alongside the per-type count.  The entries are positional, so their
+ * order must match the numeric values of the TX_* types.
+ */
+static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
+	{	NULL,			"Total      "	},
+	{	zil_prt_rec_create,	"TX_CREATE  "	},
+	{	zil_prt_rec_create,	"TX_MKDIR   "	},
+	{	zil_prt_rec_create,	"TX_MKXATTR "	},
+	{	zil_prt_rec_create,	"TX_SYMLINK "	},
+	{	zil_prt_rec_remove,	"TX_REMOVE  "	},
+	{	zil_prt_rec_remove,	"TX_RMDIR   "	},
+	{	zil_prt_rec_link,	"TX_LINK    "	},
+	{	zil_prt_rec_rename,	"TX_RENAME  "	},
+	{	zil_prt_rec_write,	"TX_WRITE   "	},
+	{	zil_prt_rec_truncate,	"TX_TRUNCATE"	},
+	{	zil_prt_rec_setattr,	"TX_SETATTR "	},
+	{	zil_prt_rec_acl,	"TX_ACL     "	},
+};
+
+/*
+ * Record callback handed to zil_parse(): print a one-line summary of the
+ * log record, call the type-specific detail printer at verbosity 3 and
+ * above, and count the record both under its own type and in the total
+ * kept in slot 0 of zil_rec_info[].
+ */
+/* ARGSUSED */
+static void
+print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t first_txg)
+{
+	int detail = MAX(dump_opt['d'], dump_opt['i']);
+	int txtype = lr->lrc_txtype;
+	zil_rec_info_t *info;
+
+	ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
+	ASSERT(lr->lrc_txg);
+
+	info = &zil_rec_info[txtype];
+
+	(void) printf("\t\t%s len %6llu, txg %llu, seq %llu\n",
+	    info->zri_name,
+	    (u_longlong_t)lr->lrc_reclen,
+	    (u_longlong_t)lr->lrc_txg,
+	    (u_longlong_t)lr->lrc_seq);
+
+	if (txtype != 0 && detail > 2)
+		info->zri_print(zilog, txtype, lr);
+
+	info->zri_count++;
+	zil_rec_info[0].zri_count++;
+}
+
+/*
+ * Block callback handed to zil_parse(): at verbosity 4 and above print
+ * the log block's sequence number (word 3 of its checksum field) and
+ * whether its birth txg is at or after first_txg ("will claim").  At
+ * verbosity 5 and above the full block pointer is appended after ", ".
+ *
+ * sprintf_blkptr() takes no length argument, so the buffer is sized at
+ * 300 bytes to agree with the one zdb.c hands to the same routine.  The
+ * ", " separator is supplied through the format string so that it does
+ * not eat into that space.
+ */
+/* ARGSUSED */
+static void
+print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t first_txg)
+{
+	char blkbuf[300];
+	const char *sep = "";
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+
+	if (verbose <= 3)
+		return;
+
+	blkbuf[0] = '\0';
+	if (verbose >= 5) {
+		sep = ", ";
+		sprintf_blkptr(blkbuf, bp);
+	}
+
+	(void) printf("\tBlock seqno %llu, %s%s%s\n",
+	    (u_longlong_t)bp->blk_cksum.zc_word[3],
+	    bp->blk_birth >= first_txg ? "will claim" : "won't claim",
+	    sep, blkbuf);
+}
+
+/*
+ * Print the per-type record counts gathered in zil_rec_info[].
+ *
+ * Nothing but the optional leading blank line (verbosity above 3) is
+ * printed when no records were seen.  Types with a zero count are listed
+ * only at verbosity 3 and above.  Counts are right-aligned to the number
+ * of decimal digits in the total.
+ *
+ * The digit count is found by repeatedly dividing the total by ten.
+ * Growing an int power of ten until it passes the 64-bit total, as this
+ * used to do, overflows the int for large totals.
+ */
+static void
+print_log_stats(int verbose)
+{
+	uint64_t total = zil_rec_info[0].zri_count;
+	uint64_t rest;
+	int i, w;
+
+	if (verbose > 3)
+		(void) printf("\n");
+
+	if (total == 0)
+		return;
+
+	for (w = 1, rest = total; rest >= 10; rest /= 10)
+		w++;
+
+	for (i = 0; i < TX_MAX_TYPE; i++)
+		if (zil_rec_info[i].zri_count || verbose >= 3)
+			(void) printf("\t\t%s %*llu\n",
+			    zil_rec_info[i].zri_name, w,
+			    (u_longlong_t)zil_rec_info[i].zri_count);
+	(void) printf("\n");
+}
+
+/*
+ * Print the intent log of one dataset: the header line, the first block
+ * pointer at verbosity 4 and above, and then every block and record via
+ * zil_parse(), followed by the per-type statistics.  Nothing is printed
+ * when the header's log block has a birth txg of 0 or when the verbosity
+ * (the larger of the 'd' and 'i' levels) is below 2.
+ */
+/* ARGSUSED */
+void
+dump_intent_log(zilog_t *zilog)
+{
+	zil_header_t *hdr = zilog->zl_header;
+	int detail = MAX(dump_opt['d'], dump_opt['i']);
+	zil_rec_info_t *slot;
+
+	if (hdr->zh_log.blk_birth == 0)
+		return;
+	if (detail < 2)
+		return;
+
+	(void) printf("\n    ZIL header: claim_txg %llu, seq %llu\n",
+	    (u_longlong_t)hdr->zh_claim_txg,
+	    (u_longlong_t)hdr->zh_replay_seq);
+
+	if (detail > 3)
+		print_log_bp(&hdr->zh_log, "\n\tfirst block: ");
+
+	/* Start the counters afresh for this log. */
+	for (slot = zil_rec_info; slot < &zil_rec_info[TX_MAX_TYPE]; slot++)
+		slot->zri_count = 0;
+
+	(void) printf("\n");
+	zil_parse(zilog, print_log_block, print_log_record, NULL,
+	    spa_first_txg(zilog->zl_spa));
+	print_log_stats(detail);
+}
diff --git a/usr/src/cmd/zfs/Makefile b/usr/src/cmd/zfs/Makefile
new file mode 100644
index 000000000000..3a80b1c77e80
--- /dev/null
+++ b/usr/src/cmd/zfs/Makefile
@@ -0,0 +1,93 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+# The zfs command is built from two objects; SRCS is derived from OBJS.
+# Each source gets its own message catalog, later merged into POFILE.
+PROG=		zfs
+OBJS=		zfs_main.o zfs_iter.o
+SRCS=		$(OBJS:%.o=%.c)
+POFILES=	zfs_main.po zfs_iter.po
+POFILE=		zfs.po
+
+include ../Makefile.cmd
+
+# The same binary is also reachable as "mount" and "umount" from the
+# per-filesystem-type directories under $(ROOTETC)/fs and $(ROOTLIB)/fs.
+FSTYPE=         zfs
+LINKPROGS=	mount umount
+ROOTETCFSTYPE=  $(ROOTETC)/fs/$(FSTYPE)
+USRLIBFSTYPE=	$(ROOTLIB)/fs/$(FSTYPE)
+
+LDLIBS += -lzfs -luutil -lumem
+
+# C99 mode flags for the compiler (C99MODE) and for lint (C99LMODE).
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT
+
+# lint complains about unused _umem_* functions
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2 
+LINTFLAGS64 += -xerroff=E_NAME_DEF_NOT_USED2 
+
+# Full paths of the symbolic links created by the install target.
+ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
+USRLIBFSTYPELINKS = $(LINKPROGS:%=$(USRLIBFSTYPE)/%)
+ROOTETCFSTYPELINKS = $(LINKPROGS:%=$(ROOTETCFSTYPE)/%)
+
+.KEEP_STATE:
+
+.PARALLEL:
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+# Install the binary as $(ROOTSBINPROG), then create every symbolic link
+# that points back at it.
+install: all $(ROOTSBINPROG) $(ROOTUSRSBINLINKS) $(USRLIBFSTYPELINKS) \
+	$(ROOTETCFSTYPELINKS)
+
+# The combined message catalog is the per-source catalogs concatenated.
+$(POFILE): $(POFILES)
+	$(RM) $@
+	cat $(POFILES) > $@
+
+clean:
+	$(RM) $(OBJS)
+
+lint:	lint_SRCS
+
+# The link targets below are relative, so each rule climbs as many levels
+# as its link sits below the root before descending into sbin.
+
+# Links from /usr/sbin to /sbin
+$(ROOTUSRSBINLINKS):
+	-$(RM) $@; $(SYMLINK) ../../sbin/$(PROG) $@
+
+# Links from /usr/lib/fs/zfs to /sbin
+$(USRLIBFSTYPELINKS):
+	-$(RM) $@; $(SYMLINK) ../../../../sbin/$(PROG) $@
+
+# Links from /etc/fs/zfs to /sbin
+$(ROOTETCFSTYPELINKS):
+	-$(RM) $@; $(SYMLINK) ../../../sbin/$(PROG) $@
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/zfs/zfs_iter.c b/usr/src/cmd/zfs/zfs_iter.c
new file mode 100644
index 000000000000..8f065c03bdbf
--- /dev/null
+++ b/usr/src/cmd/zfs/zfs_iter.c
@@ -0,0 +1,247 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zfs_util.h"
+
+/*
+ * This is a private interface used to gather up all the datasets specified on
+ * the command line so that we can iterate over them in order.
+ *
+ * First, we iterate over all filesystems, gathering them together into an
+ * AVL tree sorted by name.  For snapshots, we order them according to
+ * creation time.  We report errors for any explicitly specified datasets
+ * that we couldn't open.
+ *
+ * When finished, we have an AVL tree of ZFS handles.  We go through and execute
+ * the provided callback for each one, passing whatever data the user supplied.
+ */
+
+/*
+ * One element of the AVL tree: an open dataset handle plus the linkage
+ * libuutil needs to keep it in the tree.
+ */
+typedef struct zfs_node {
+	zfs_handle_t	*zn_handle;	/* closed when the tree is emptied */
+	uu_avl_node_t	zn_avlnode;	/* tree linkage */
+} zfs_node_t;
+
+/*
+ * State handed to zfs_callback() while the tree is being filled.
+ */
+typedef struct callback_data {
+	uu_avl_t	*cb_avl;	/* tree being populated */
+	int		cb_recurse;	/* nonzero: descend into children */
+	zfs_type_t	cb_types;	/* bitmask of dataset types to keep */
+} callback_data_t;
+
+/*
+ * AVL pool shared by zfs_for_each(), which creates and destroys it on
+ * every call, and zfs_callback(), which initializes nodes from it.
+ */
+uu_avl_pool_t *avl_pool;
+
+/*
+ * Called for each dataset.  If the dataset is of a requested type it is
+ * added to the AVL tree, which then owns the handle; a dataset that is
+ * already present is not added again.  Children are visited as well when
+ * recursion was requested.  Any handle that did not go into the tree is
+ * closed before returning.  Always returns 0.
+ */
+int
+zfs_callback(zfs_handle_t *zhp, void *data)
+{
+	callback_data_t *state = data;
+	int in_tree = 0;
+
+	if ((zfs_get_type(zhp) & state->cb_types) != 0) {
+		zfs_node_t *entry = safe_malloc(sizeof (*entry));
+		uu_avl_index_t where;
+
+		entry->zn_handle = zhp;
+		uu_avl_node_init(entry, &entry->zn_avlnode, avl_pool);
+
+		if (uu_avl_find(state->cb_avl, entry, NULL, &where) != NULL) {
+			/* Already have this one; drop the duplicate. */
+			free(entry);
+		} else {
+			uu_avl_insert(state->cb_avl, entry, where);
+			in_tree = 1;
+		}
+	}
+
+	/*
+	 * Descend only when asked to, and only if this is a filesystem or
+	 * snapshots are among the requested types.
+	 */
+	if (state->cb_recurse) {
+		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM ||
+		    (state->cb_types & ZFS_TYPE_SNAPSHOT) != 0)
+			(void) zfs_iter_children(zhp, zfs_callback, data);
+	}
+
+	if (in_tree == 0)
+		zfs_close(zhp);
+
+	return (0);
+}
+
+/*
+ * AVL comparison routine for zfs_node_t entries.
+ *
+ * Entries are ordered first by the dataset portion of the name (the part
+ * before any '@').  When that part is equal:
+ *   - two non-snapshots are the same dataset and compare equal, so that
+ *     uu_avl_find() can recognize a dataset that is already in the tree;
+ *   - a dataset sorts ahead of its own snapshots;
+ *   - two snapshots are ordered by the hidden CREATETXG property, which
+ *     gives an absolute ordering by creation.
+ *
+ * The names are compared by length rather than by temporarily overwriting
+ * the '@', so the strings owned by the handles are never modified.
+ *
+ * Returns a negative value, zero, or a positive value.
+ */
+/* ARGSUSED */
+static int
+zfs_compare(const void *larg, const void *rarg, void *unused)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	const char *lname = zfs_get_name(l);
+	const char *rname = zfs_get_name(r);
+	const char *lat = strchr(lname, '@');
+	const char *rat = strchr(rname, '@');
+	size_t llen, rlen;
+	uint64_t lcreate, rcreate;
+	int ret;
+
+	llen = (lat != NULL) ? (size_t)(lat - lname) : strlen(lname);
+	rlen = (rat != NULL) ? (size_t)(rat - rname) : strlen(rname);
+
+	/*
+	 * Compare the dataset portions.  If one is a proper prefix of the
+	 * other, the shorter one sorts first, exactly as strcmp() would
+	 * order the two strings cut off at the '@'.
+	 */
+	ret = strncmp(lname, rname, (llen < rlen) ? llen : rlen);
+	if (ret != 0)
+		return (ret);
+	if (llen != rlen)
+		return ((llen < rlen) ? -1 : 1);
+
+	/* Same dataset and neither is a snapshot: identical entries. */
+	if (lat == NULL && rat == NULL)
+		return (0);
+
+	/* A dataset always precedes its own snapshots. */
+	if (lat == NULL)
+		return (-1);
+	if (rat == NULL)
+		return (1);
+
+	/* Two snapshots of the same dataset: order by creation txg. */
+	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+	if (lcreate < rcreate)
+		return (-1);
+	if (lcreate > rcreate)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Collect the datasets named by argv[0 .. argc-1] into a sorted AVL tree
+ * and then invoke 'callback' on each one in tree order, passing 'data'
+ * through unchanged.
+ *
+ *	argc, argv	dataset names; with argc == 0 every dataset is
+ *			visited, recursively, starting from the roots
+ *	recurse		nonzero to include the children of each argument
+ *	types		bitmask of dataset types to hand to the callback
+ *	callback, data	user function and its opaque argument
+ *
+ * Returns nonzero if any named dataset could not be opened or if any
+ * callback returned nonzero; the individual results are OR'ed together so
+ * that a later success cannot hide an earlier failure.  Exits the process
+ * if the AVL structures cannot be allocated.
+ */
+int
+zfs_for_each(int argc, char **argv, int recurse, zfs_type_t types,
+    zfs_iter_f callback, void *data)
+{
+	callback_data_t cb;
+	int ret = 0;
+	zfs_node_t *node;
+	uu_avl_walk_t *walk;
+
+	avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
+	    offsetof(zfs_node_t, zn_avlnode), zfs_compare, UU_DEFAULT);
+
+	if (avl_pool == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	cb.cb_recurse = recurse;
+	cb.cb_types = types;
+	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	if (argc == 0) {
+		/*
+		 * If given no arguments, iterate over all datasets.
+		 */
+		cb.cb_recurse = 1;
+		ret = zfs_iter_root(zfs_callback, &cb);
+	} else {
+		int i;
+		zfs_handle_t *zhp;
+		zfs_type_t argtype;
+
+		/*
+		 * If we're recursive, then we always allow filesystems as
+		 * arguments.  If we also are interested in snapshots, then we
+		 * can take volumes as well.
+		 */
+		argtype = types;
+		if (recurse) {
+			argtype |= ZFS_TYPE_FILESYSTEM;
+			if (types & ZFS_TYPE_SNAPSHOT)
+				argtype |= ZFS_TYPE_VOLUME;
+		}
+
+		/*
+		 * Accumulate rather than assign, so that a failure to open
+		 * one argument is still reported after later ones succeed.
+		 */
+		for (i = 0; i < argc; i++) {
+			if ((zhp = zfs_open(argv[i], argtype)) != NULL)
+				ret |= zfs_callback(zhp, &cb);
+			else
+				ret = 1;
+		}
+	}
+
+	/*
+	 * At this point we've got our AVL tree full of zfs handles, so iterate
+	 * over each one and execute the real user callback.
+	 */
+	for (node = uu_avl_first(cb.cb_avl); node != NULL;
+	    node = uu_avl_next(cb.cb_avl, node))
+		ret |= callback(node->zn_handle, data);
+
+	/*
+	 * Finally, clean up the AVL tree.  A robust walk is used because
+	 * every node is removed from the tree as it is visited.
+	 */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		zfs_close(node->zn_handle);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(cb.cb_avl);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
diff --git a/usr/src/cmd/zfs/zfs_iter.h b/usr/src/cmd/zfs/zfs_iter.h
new file mode 100644
index 000000000000..03428b827b37
--- /dev/null
+++ b/usr/src/cmd/zfs/zfs_iter.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	ZFS_ITER_H
+#define	ZFS_ITER_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Run a callback over a sorted set of datasets.  The arguments are, in
+ * order: the count and vector of dataset names (a count of 0 selects all
+ * datasets), a flag requesting recursion into children, the bitmask of
+ * dataset types of interest, the callback, and an opaque pointer that is
+ * passed to the callback.  Returns nonzero on failure.
+ */
+int zfs_for_each(int, char **, int, zfs_type_t, zfs_iter_f, void *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* ZFS_ITER_H */
diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c
new file mode 100644
index 000000000000..78e3fecf9950
--- /dev/null
+++ b/usr/src/cmd/zfs/zfs_main.c
@@ -0,0 +1,2787 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <zone.h>
+#include <sys/mkdev.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <libzfs.h>
+
+#include "zfs_iter.h"
+
+static FILE *mnttab_file;
+
+static int zfs_do_clone(int argc, char **argv);
+static int zfs_do_create(int argc, char **argv);
+static int zfs_do_destroy(int argc, char **argv);
+static int zfs_do_get(int argc, char **argv);
+static int zfs_do_inherit(int argc, char **argv);
+static int zfs_do_list(int argc, char **argv);
+static int zfs_do_mount(int argc, char **argv);
+static int zfs_do_rename(int argc, char **argv);
+static int zfs_do_rollback(int argc, char **argv);
+static int zfs_do_set(int argc, char **argv);
+static int zfs_do_snapshot(int argc, char **argv);
+static int zfs_do_unmount(int argc, char **argv);
+static int zfs_do_share(int argc, char **argv);
+static int zfs_do_unshare(int argc, char **argv);
+static int zfs_do_backup(int argc, char **argv);
+static int zfs_do_restore(int argc, char **argv);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+/*
+ * libumem hook supplying the value to use for $UMEM_DEBUG.  The parameter
+ * list is spelled (void) so that this is a true prototype, the same way
+ * _umem_logging_init() is declared.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+/*
+ * libumem hook supplying the value to use for $UMEM_LOGGING.
+ */
+const char *
+_umem_logging_init(void)
+{
+	static const char logging_setting[] = "fail,contents";
+
+	return (logging_setting);
+}
+
+/*
+ * A single entry in the master command table below.
+ */
+typedef struct zfs_command {
+	/* Subcommand name; NULL marks a blank line in the usage output */
+	const char	*name;
+	/* Function implementing the subcommand */
+	int		(*func)(int argc, char **argv);
+	/* Usage text printed by usage(); may contain several lines */
+	const char	*usage;
+} zfs_command_t;
+
+/*
+ * Master command table.  Each ZFS command has a name, associated function, and
+ * usage message.  These commands are organized according to how they are
+ * displayed in the usage message.  An empty command (one with a NULL name)
+ * indicates an empty line in the generic usage message.  A command with a NULL
+ * usage message indicates an alias for an existing command, and is not
+ * displayed in the general usage message.
+ *
+ * The usage strings are printed verbatim, so each line must start with a tab
+ * and end with a newline.
+ */
+static zfs_command_t command_table[] = {
+	{ "create",	zfs_do_create,
+	    "\tcreate <filesystem>\n"
+	    "\tcreate [-s] [-b blocksize] -V <size> <volume>\n"		},
+	{ "destroy",	zfs_do_destroy,
+	    "\tdestroy [-rRf] <filesystem|volume|snapshot>\n"		},
+	{ NULL },
+	{ "snapshot",	zfs_do_snapshot,
+	    "\tsnapshot <filesystem@name|volume@name>\n"		},
+	{ "rollback",	zfs_do_rollback,
+	    "\trollback [-rRf] <snapshot>\n"				},
+	{ "clone",	zfs_do_clone,
+	    "\tclone <snapshot> <filesystem|volume>\n"			},
+	{ "rename",	zfs_do_rename,
+	    "\trename <filesystem|volume|snapshot> "
+	    "<filesystem|volume|snapshot>\n"				},
+	{ NULL },
+	{ "list",	zfs_do_list,
+	    "\tlist [-rH] [-o property[,property]...] [-t type[,type]...]\n"
+	    "\t    [filesystem|volume|snapshot] ...\n"			},
+	{ NULL },
+	{ "set",	zfs_do_set,
+	    "\tset <property=value> <filesystem|volume> ...\n"		},
+	{ "get",	zfs_do_get,
+	    "\tget [-rHp] [-o field[,field]...] [-s source[,source]...]\n"
+	    "\t    <all | property[,property]...> "
+	    "<filesystem|volume|snapshot> ...\n"			},
+	{ "inherit",	zfs_do_inherit,
+	    "\tinherit [-r] <property> <filesystem|volume> ...\n"	},
+	{ NULL },
+	{ "mount",	zfs_do_mount,
+	    "\tmount\n"
+	    "\tmount [-o opts] [-O] -a\n"
+	    "\tmount [-o opts] [-O] <filesystem>\n"			},
+	{ NULL },
+	{ "unmount",	zfs_do_unmount,
+	    "\tunmount [-f] -a\n"
+	    "\tunmount [-f] <filesystem|mountpoint>\n"			},
+	{ NULL },
+	{ "share",	zfs_do_share,
+	    "\tshare -a\n"
+	    "\tshare <filesystem>\n"					},
+	{ NULL },
+	{ "unshare",	zfs_do_unshare,
+	    "\tunshare [-f] -a\n"
+	    "\tunshare [-f] <filesystem|mountpoint>\n"			},
+	{ NULL },
+	{ "backup",	zfs_do_backup,
+	    "\tbackup [-i <snapshot>] <snapshot>\n"			},
+	{ "restore",	zfs_do_restore,
+	    "\trestore [-vn] <filesystem|volume|snapshot>\n"
+	    "\trestore [-vn] -d <filesystem>\n"				},
+};
+
+/* Number of entries in command_table, blank-line separators included. */
+#define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
+
+/*
+ * Table entry for the subcommand being run.  While this is NULL, usage()
+ * prints the complete command list instead of one command's usage.
+ * NOTE(review): presumably assigned by main() once the subcommand name has
+ * been matched -- confirm against main().
+ */
+zfs_command_t *current_command;
+
+/*
+ * Allocate 'size' bytes of zero-filled memory.  This never returns NULL: if
+ * the allocation fails, a message is written to stderr and the process exits
+ * with status 1.
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *buf;
+
+	buf = calloc(1, size);
+	if (buf == NULL) {
+		(void) fprintf(stderr, "internal error: out of memory\n");
+		exit(1);
+	}
+
+	return (buf);
+}
+
+/*
+ * Display usage message.  If we're inside a command, display only the usage for
+ * that command.  Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ *
+ * If 'requested' is non-zero the text goes to stdout and the process exits
+ * with status 0; otherwise it goes to stderr and the exit status is 2.  This
+ * function never returns.
+ *
+ * For the commands that deal with properties (and for the generic message)
+ * a table of the supported properties is appended.
+ */
+static void
+usage(int requested)
+{
+	int i;
+	int show_properties = FALSE;
+	FILE *fp = requested ? stdout : stderr;
+	const char *values;
+
+	if (current_command == NULL) {
+
+		(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
+		(void) fprintf(fp,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		for (i = 0; i < NCOMMAND; i++) {
+			if (command_table[i].name == NULL)
+				(void) fprintf(fp, "\n");
+			else if (command_table[i].usage != NULL)
+				/* entries with a NULL usage are aliases */
+				(void) fprintf(fp, "%s",
+				    command_table[i].usage);
+		}
+
+		(void) fprintf(fp, gettext("\nEach dataset is of the form: "
+		    "pool/[dataset/]*dataset[@name]\n"));
+	} else {
+		(void) fprintf(fp, gettext("usage:\n"));
+		/*
+		 * The usage text is data, not a format string; always print
+		 * it through "%s".
+		 */
+		if (current_command->usage != NULL)
+			(void) fprintf(fp, "%s", current_command->usage);
+	}
+
+	if (current_command == NULL ||
+	    strcmp(current_command->name, "set") == 0 ||
+	    strcmp(current_command->name, "get") == 0 ||
+	    strcmp(current_command->name, "inherit") == 0 ||
+	    strcmp(current_command->name, "list") == 0)
+		show_properties = TRUE;
+
+	if (show_properties) {
+
+		(void) fprintf(fp,
+		    gettext("\nThe following properties are supported:\n"));
+
+		(void) fprintf(fp, "\n\t%-13s  %s  %s   %s\n\n",
+		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
+
+		for (i = 0; i < ZFS_NPROP_VISIBLE; i++) {
+			(void) fprintf(fp, "\t%-13s  ", zfs_prop_to_name(i));
+
+			if (zfs_prop_readonly(i))
+				(void) fprintf(fp, "  NO    ");
+			else
+				(void) fprintf(fp, " YES    ");
+
+			if (zfs_prop_inheritable(i))
+				(void) fprintf(fp, "  YES   ");
+			else
+				(void) fprintf(fp, "   NO   ");
+
+			values = zfs_prop_values(i);
+			if (values == NULL)
+				(void) fprintf(fp, "-\n");
+			else
+				(void) fprintf(fp, "%s\n", values);
+		}
+		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
+		    "with standard units such as K, M, G, etc.\n"));
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+/*
+ * zfs clone <snapshot> <filesystem|volume>
+ *
+ * Create a writable dataset whose initial contents are those of the given
+ * snapshot.  The new dataset keeps a dependency on its origin; the origin
+ * cannot be destroyed for as long as the clone exists.
+ *
+ * Once the clone has been created it is mounted and, if the mount succeeds,
+ * shared.  Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_clone(int argc, char **argv)
+{
+	zfs_handle_t *origin;
+	zfs_handle_t *newds;
+	int err;
+
+	/* this command accepts no options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(FALSE);
+	}
+
+	/* exactly two operands are required: the source and the target */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	/* the source has to be a snapshot */
+	origin = zfs_open(argv[1], ZFS_TYPE_SNAPSHOT);
+	if (origin == NULL)
+		return (1);
+
+	/* let libzfs create the clone */
+	err = zfs_clone(origin, argv[2]);
+
+	/*
+	 * Mount and share the new dataset.  If the clone cannot be reopened
+	 * the result of the clone operation itself is what gets reported.
+	 */
+	if (err == 0 && (newds = zfs_open(argv[2], ZFS_TYPE_ANY)) != NULL) {
+		err = zfs_mount(newds, NULL, 0);
+		if (err == 0)
+			err = zfs_share(newds);
+		zfs_close(newds);
+	}
+
+	zfs_close(origin);
+
+	return (err != 0);
+}
+
+/*
+ * zfs create <filesystem>
+ * zfs create [-s] [-b blocksize] -V <size> <volume>
+ *
+ * Create a new dataset.  This command can be used to create filesystems
+ * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
+ * For volumes, the user must specify a size to be used.
+ *
+ * The '-s' flag applies only to volumes, and indicates that we should not try
+ * to set the reservation for this volume.  By default we set a reservation
+ * equal to the size for any volume.
+ *
+ * The '-b' value is handed to zfs_create() unchanged as the block size.
+ *
+ * Returns 0 on success and 1 on failure; usage errors exit via usage().
+ */
+static int
+zfs_do_create(int argc, char **argv)
+{
+	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
+	zfs_handle_t *zhp;
+	char *size = NULL;
+	char *blocksize = NULL;
+	int c;
+	int noreserve = FALSE;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":V:b:s")) != -1) {
+		switch (c) {
+		case 'V':
+			type = ZFS_TYPE_VOLUME;
+			size = optarg;
+			break;
+		case 'b':
+			blocksize = optarg;
+			break;
+		case 's':
+			noreserve = TRUE;
+			break;
+		case ':':
+			/* both -V and -b take an argument; name the right one */
+			if (optopt == 'b')
+				(void) fprintf(stderr, gettext("missing "
+				    "blocksize argument\n"));
+			else
+				(void) fprintf(stderr, gettext("missing size "
+				    "argument\n"));
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	if (noreserve && type != ZFS_TYPE_VOLUME) {
+		(void) fprintf(stderr, gettext("'-s' can only be used when "
+		    "creating a volume\n"));
+		usage(FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing %s argument\n"),
+		    zfs_type_to_name(type));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	/* pass to libzfs */
+	if (zfs_create(argv[0], type, size, blocksize) != 0)
+		return (1);
+
+	if ((zhp = zfs_open(argv[0], ZFS_TYPE_ANY)) == NULL)
+		return (1);
+
+	/*
+	 * Volume handling.  By default, we try to create a reservation of equal
+	 * size for the volume.  If we can't do this, then destroy the dataset
+	 * and report an error.
+	 */
+	if (type == ZFS_TYPE_VOLUME && !noreserve) {
+		if (zfs_prop_set(zhp, ZFS_PROP_RESERVATION, size) != 0) {
+			(void) fprintf(stderr, gettext("use '-s' to create a "
+			    "volume without a matching reservation\n"));
+			(void) zfs_destroy(zhp);
+			/* release the handle on this path as well */
+			zfs_close(zhp);
+			return (1);
+		}
+	}
+
+	/*
+	 * Mount and/or share the new filesystem as appropriate.  We provide a
+	 * verbose error message to let the user know that their filesystem was
+	 * in fact created, even if we failed to mount or share it.
+	 */
+	if (zfs_mount(zhp, NULL, 0) != 0) {
+		(void) fprintf(stderr, gettext("filesystem successfully "
+		    "created, but not mounted\n"));
+		ret = 1;
+	} else if (zfs_share(zhp) != 0) {
+		(void) fprintf(stderr, gettext("filesystem successfully "
+		    "created, but not shared\n"));
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs destroy [-rRf] <fs, snap, vol>
+ *
+ * 	-r	Recursively destroy all children
+ * 	-R	Recursively destroy all dependents, including clones
+ * 	-f	Force unmounting of any dependents
+ *
+ * Destroys the given dataset.  By default, it will unmount any filesystems,
+ * and refuse to destroy a dataset that has any dependents.  A dependent can
+ * either be a child, or a clone of a child.
+ */
+typedef struct destroy_cbdata {
+	/* Non-zero until the "cannot destroy" header has been printed */
+	int		cb_first;
+	/* Set by -f; makes destroy_callback() unmount with MS_FORCE */
+	int		cb_force;
+	/* Set by -r and by -R */
+	int		cb_recurse;
+	/* Set once a dependent that blocks the destroy has been reported */
+	int		cb_error;
+	/*
+	 * Not referenced by destroy_check_dependent(), destroy_callback() or
+	 * zfs_do_destroy().
+	 */
+	int		cb_needforce;
+	/* Set by -R; when set the dependent check is skipped entirely */
+	int		cb_doclones;
+	/* Handle of the dataset named on the command line */
+	zfs_handle_t	*cb_target;
+} destroy_cbdata_t;
+
+/*
+ * Dependent-iteration callback used to decide whether the destroy may go
+ * ahead given the '-r' and '-R' flags.  Each dependent that blocks the
+ * destroy is listed on stderr, preceded (once) by an explanation, and
+ * cb_error is set.  The handle passed in is always closed and the return
+ * value is always 0.
+ */
+static int
+destroy_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cbp = data;
+	const char *tname = zfs_get_name(cbp->cb_target);
+	const char *name = zfs_get_name(zhp);
+	size_t tlen = strlen(tname);
+	int descendant;
+
+	/*
+	 * A name that extends the target's name with '/' or '@' lives
+	 * underneath the target; anything else is a clone located elsewhere
+	 * in the hierarchy.
+	 */
+	descendant = (strncmp(tname, name, tlen) == 0 &&
+	    (name[tlen] == '/' || name[tlen] == '@'));
+
+	if (descendant) {
+		/* with a recursive destroy, descendants are not a problem */
+		if (cbp->cb_recurse) {
+			zfs_close(zhp);
+			return (0);
+		}
+
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has children\n"),
+			    zfs_get_name(cbp->cb_target),
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-r' to destroy "
+			    "the following datasets:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+	} else {
+		/*
+		 * Clones are only reported for a recursive destroy or when
+		 * the target itself is a snapshot.
+		 */
+		if (!cbp->cb_recurse &&
+		    zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) {
+			zfs_close(zhp);
+			return (0);
+		}
+
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has dependent clones\n"),
+			    zfs_get_name(cbp->cb_target),
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-R' to destroy "
+			    "the following datasets:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+	}
+
+	/* list the offending dataset */
+	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Unmount and destroy a single dataset.  The handle is closed on every
+ * path.  Returns 0 on success (or when the dataset is skipped) and -1 as
+ * soon as either the unmount or the destroy fails.
+ */
+static int
+destroy_callback(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cbp = data;
+	int is_pool;
+	int rv = 0;
+
+	/*
+	 * A filesystem whose name contains no '/' is the pool itself.  Pools
+	 * are skipped here (they have already been flagged as an error
+	 * before getting this far).
+	 */
+	is_pool = (strchr(zfs_get_name(zhp), '/') == NULL &&
+	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM);
+
+	if (!is_pool) {
+		/* the destroy is only attempted if the unmount worked */
+		if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
+		    zfs_destroy(zhp) != 0)
+			rv = -1;
+	}
+
+	zfs_close(zhp);
+	return (rv);
+}
+
+
+/*
+ * Implementation of 'zfs destroy'.  After option parsing the target is
+ * opened, pools are rejected unless a recursive destroy was requested, the
+ * dependents are checked against the '-r'/'-R' flags, and finally the
+ * dependents and the target itself are unmounted and destroyed.
+ *
+ * Returns 0 on success and 1 on failure; usage errors exit via usage().
+ */
+static int
+zfs_do_destroy(int argc, char **argv)
+{
+	destroy_cbdata_t cb = { 0 };
+	int c;
+	zfs_handle_t *zhp;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "frR")) != -1) {
+		switch (c) {
+		case 'f':
+			cb.cb_force = 1;
+			break;
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing path argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	/* Open the given dataset */
+	if ((zhp = zfs_open(argv[0], ZFS_TYPE_ANY)) == NULL)
+		return (1);
+
+	cb.cb_target = zhp;
+
+	/*
+	 * Perform an explicit check for pools before going any further.
+	 */
+	if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+		(void) fprintf(stderr, gettext("cannot destroy '%s': "
+		    "operation does not apply to pools\n"),
+		    zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use 'zfs destroy -r "
+		    "%s' to destroy all datasets in the pool\n"),
+		    zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+		    "to destroy the pool itself\n"), zfs_get_name(zhp));
+		zfs_close(zhp);
+		return (1);
+	}
+
+
+	/*
+	 * Check for any dependents and/or clones.
+	 */
+	cb.cb_first = 1;
+	if (!cb.cb_doclones)
+		(void) zfs_iter_dependents(zhp, destroy_check_dependent, &cb);
+
+	if (cb.cb_error) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	/*
+	 * Do the real thing.  The dependents go first.  If that pass fails
+	 * the target handle is still ours and has to be released here.
+	 */
+	if (zfs_iter_dependents(zhp, destroy_callback, &cb) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	/* destroy_callback() closes the handle whether or not it succeeds */
+	if (destroy_callback(zhp, &cb) != 0)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...]
+ *	<all | prop[,prop...]> < fs | snap | vol > ...
+ *
+ *	-r	recurse over any child datasets
+ *	-H	scripted mode.  Headers are stripped, and fields are separated
+ *		by tabs instead of spaces.
+ *	-o	Set of fields to display.  One of "name,property,value,source".
+ *		Default is all four.
+ *	-s	Set of sources to allow.  One of
+ *		"local,default,inherited,temporary,none".  Default is all
+ *		five.
+ *	-p	Display values in parsable (literal) format.
+ *
+ *  Prints properties for the given datasets.  The user can control which
+ *  columns to display as well as which property types to allow.
+ */
+typedef struct get_cbdata {
+	/* Set by -H: no header line, tab-separated columns */
+	int cb_scripted;
+	/* Mask of ZFS_SRC_* values; other sources are not printed */
+	int cb_sources;
+	/* Set by -p; passed to zfs_prop_get() as its final argument */
+	int cb_literal;
+	/* GET_COL_* values in display order; 0 marks an unused slot */
+	int cb_columns[4];
+	/* Properties to print, in the order given on the command line */
+	zfs_prop_t cb_prop[ZFS_NPROP_ALL];
+	/* Entries used in cb_prop; 0 means every visible property */
+	int cb_nprop;
+} get_cbdata_t;
+
+/* Column identifiers stored in cb_columns; 0 is reserved for "no column". */
+#define	GET_COL_NAME		1
+#define	GET_COL_PROPERTY	2
+#define	GET_COL_VALUE		3
+#define	GET_COL_SOURCE		4
+
+/*
+ * Display a single line of output, according to the settings in the callback
+ * structure.
+ *
+ *	zhp		dataset the property belongs to
+ *	cbp		column, source and formatting settings
+ *	prop		property being displayed
+ *	value		property value, already formatted as a string
+ *	sourcetype	where the value comes from (ZFS_SRC_*)
+ *	source		name of the dataset the value is inherited from; only
+ *			used when sourcetype is ZFS_SRC_INHERITED
+ *
+ * Nothing is printed if the source type has been filtered out.  The last
+ * column of a line is printed without padding or a trailing separator.
+ */
+static void
+print_one_property(zfs_handle_t *zhp, get_cbdata_t *cbp, zfs_prop_t prop,
+    const char *value, zfs_source_t sourcetype, const char *source)
+{
+	const int ncols = sizeof (cbp->cb_columns) /
+	    sizeof (cbp->cb_columns[0]);
+	int i;
+	int width;
+	int last;
+	const char *str;
+	char buf[128];
+
+	/*
+	 * Ignore those source types that the user has chosen to ignore.
+	 */
+	if ((sourcetype & cbp->cb_sources) == 0)
+		return;
+
+	for (i = 0; i < ncols; i++) {
+		switch (cbp->cb_columns[i]) {
+		case GET_COL_NAME:
+			width = 15;
+			str = zfs_get_name(zhp);
+			break;
+
+		case GET_COL_PROPERTY:
+			width = 13;
+			str = zfs_prop_to_name(prop);
+			break;
+
+		case GET_COL_VALUE:
+			width = 25;
+			str = value;
+			break;
+
+		case GET_COL_SOURCE:
+			width = 15;
+			switch (sourcetype) {
+			case ZFS_SRC_DEFAULT:
+				str = "default";
+				break;
+
+			case ZFS_SRC_LOCAL:
+				str = "local";
+				break;
+
+			case ZFS_SRC_TEMPORARY:
+				str = "temporary";
+				break;
+
+			case ZFS_SRC_INHERITED:
+				(void) snprintf(buf, sizeof (buf),
+				    "inherited from %s", source);
+				str = buf;
+				break;
+
+			case ZFS_SRC_NONE:
+			default:
+				/* never leave 'str' unset for odd values */
+				str = "-";
+				break;
+			}
+			break;
+
+		default:
+			continue;
+		}
+
+		/*
+		 * The final slot has no successor to inspect; looking at
+		 * cb_columns[i + 1] there would read past the array.
+		 */
+		last = (i == ncols - 1 || cbp->cb_columns[i + 1] == 0);
+
+		if (last)
+			(void) printf("%s", str);
+		else if (cbp->cb_scripted)
+			(void) printf("%s\t", str);
+		else
+			(void) printf("%-*s  ", width, str);
+
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Per-dataset callback for 'zfs get': print the requested properties of one
+ * dataset.  Always returns 0.
+ */
+static int
+get_callback(zfs_handle_t *zhp, void *data)
+{
+	get_cbdata_t *cbp = data;
+	char value[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+	int idx;
+
+	/*
+	 * No explicit list: walk every visible property and print only those
+	 * that can be retrieved for this dataset.
+	 */
+	if (cbp->cb_nprop == 0) {
+		for (idx = 0; idx < ZFS_NPROP_VISIBLE; idx++) {
+			if (zfs_prop_get(zhp, idx, value, sizeof (value),
+			    &srctype, source, sizeof (source),
+			    cbp->cb_literal) != 0)
+				continue;
+
+			print_one_property(zhp, cbp, idx, value, srctype,
+			    source);
+		}
+
+		return (0);
+	}
+
+	/*
+	 * An explicit list is printed in the order given.  A property that
+	 * cannot be retrieved is still shown, with "-" as its value and no
+	 * source.
+	 */
+	for (idx = 0; idx < cbp->cb_nprop; idx++) {
+		zfs_prop_t prop = cbp->cb_prop[idx];
+
+		if (zfs_prop_get(zhp, prop, value, sizeof (value), &srctype,
+		    source, sizeof (source), cbp->cb_literal) != 0) {
+			(void) strlcpy(value, "-", sizeof (value));
+			srctype = ZFS_SRC_NONE;
+		}
+
+		print_one_property(zhp, cbp, prop, value, srctype, source);
+	}
+
+	return (0);
+}
+
+/*
+ * Implementation of 'zfs get'.  The options are parsed into a get_cbdata_t,
+ * the first operand is parsed as either "all" or a comma-separated property
+ * list, a header line is printed unless -H was given, and get_callback() is
+ * then run over the remaining operands.
+ *
+ * The header line follows the same layout rules as the data lines printed by
+ * print_one_property(): every column except the last one displayed is padded
+ * to its width and followed by two spaces.
+ *
+ * Returns the result of zfs_for_each(); usage errors exit via usage().
+ */
+static int
+zfs_do_get(int argc, char **argv)
+{
+	get_cbdata_t cb = { 0 };
+	int recurse = 0;
+	int c;
+	char **subopts = zfs_prop_column_subopts();
+	char **shortsubopts = zfs_prop_column_short_subopts();
+	int prop;
+	char *value, *fields, *save_fields;
+	int i;
+
+	/*
+	 * Set up default columns and sources.
+	 */
+	cb.cb_sources = ZFS_SRC_ALL;
+	cb.cb_columns[0] = GET_COL_NAME;
+	cb.cb_columns[1] = GET_COL_PROPERTY;
+	cb.cb_columns[2] = GET_COL_VALUE;
+	cb.cb_columns[3] = GET_COL_SOURCE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
+		switch (c) {
+		case 'p':
+			cb.cb_literal = TRUE;
+			break;
+		case 'r':
+			recurse = TRUE;
+			break;
+		case 'H':
+			cb.cb_scripted = TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case 'o':
+			/*
+			 * Process the set of columns to display.  We zero out
+			 * the structure to give us a blank slate.
+			 */
+			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+			i = 0;
+			while (*optarg != '\0') {
+				static char *col_subopts[] =
+				    { "name", "property", "value", "source",
+				    NULL };
+
+				if (i == 4) {
+					(void) fprintf(stderr, gettext("too "
+					    "many fields given to -o "
+					    "option\n"));
+					usage(FALSE);
+				}
+
+				switch (getsubopt(&optarg, col_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_columns[i++] = GET_COL_NAME;
+					break;
+				case 1:
+					cb.cb_columns[i++] = GET_COL_PROPERTY;
+					break;
+				case 2:
+					cb.cb_columns[i++] = GET_COL_VALUE;
+					break;
+				case 3:
+					cb.cb_columns[i++] = GET_COL_SOURCE;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid column name "
+					    "'%s'\n"), value);
+					usage(FALSE);
+				}
+			}
+			break;
+
+		case 's':
+			cb.cb_sources = 0;
+			while (*optarg != '\0') {
+				static char *source_subopts[] = {
+					"local", "default", "inherited",
+					"temporary", "none", NULL };
+
+				switch (getsubopt(&optarg, source_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_sources |= ZFS_SRC_LOCAL;
+					break;
+				case 1:
+					cb.cb_sources |= ZFS_SRC_DEFAULT;
+					break;
+				case 2:
+					cb.cb_sources |= ZFS_SRC_INHERITED;
+					break;
+				case 3:
+					cb.cb_sources |= ZFS_SRC_TEMPORARY;
+					break;
+				case 4:
+					cb.cb_sources |= ZFS_SRC_NONE;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid source "
+					    "'%s'\n"), value);
+					usage(FALSE);
+				}
+			}
+			break;
+
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property "
+		    "argument\n"));
+		usage(FALSE);
+	}
+
+	fields = argv[0];
+
+	/*
+	 * Leaving 'cb_nprop' at 0 will cause the callback to iterate over all
+	 * known properties.
+	 */
+	if (strcmp(fields, "all") != 0) {
+		while (*fields != '\0') {
+			/* the property list is an operand, not an option */
+			if (cb.cb_nprop == ZFS_NPROP_ALL) {
+				(void) fprintf(stderr, gettext("too many "
+				    "properties specified\n"));
+				usage(FALSE);
+			}
+
+			/*
+			 * Try the full property names first, then rescan the
+			 * same token against the short names.
+			 */
+			save_fields = fields;
+			if ((prop = getsubopt(&fields, subopts,
+			    &value)) == -1) {
+				fields = save_fields;
+				prop = getsubopt(&fields, shortsubopts, &value);
+			}
+
+			if (prop == -1) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), value);
+				usage(FALSE);
+			}
+
+			/*
+			 * The 'name' property is a one-off special for 'zfs
+			 * list', but is not a valid property for 'zfs get'.
+			 */
+			if (zfs_prop_column_name(prop) == NULL ||
+			    prop == ZFS_PROP_NAME) {
+				(void) fprintf(stderr, gettext("invalid "
+				    "property '%s'\n"), zfs_prop_to_name(prop));
+				usage(FALSE);
+			}
+
+			cb.cb_prop[cb.cb_nprop++] = prop;
+		}
+	}
+
+	argc--;
+	argv++;
+
+	/* check for at least one dataset name */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(FALSE);
+	}
+
+	/*
+	 * Print out any headers.  The widths used here are the ones
+	 * print_one_property() uses for the data lines.
+	 */
+	if (!cb.cb_scripted) {
+		for (i = 0; i < 4; i++) {
+			const char *title;
+			int width;
+
+			switch (cb.cb_columns[i]) {
+			case GET_COL_NAME:
+				title = "NAME";
+				width = 15;
+				break;
+			case GET_COL_PROPERTY:
+				title = "PROPERTY";
+				width = 13;
+				break;
+			case GET_COL_VALUE:
+				title = "VALUE";
+				width = 25;
+				break;
+			case GET_COL_SOURCE:
+				title = "SOURCE";
+				width = 15;
+				break;
+			default:
+				continue;
+			}
+
+			/* only the last displayed column goes unpadded */
+			if (i == 3 || cb.cb_columns[i + 1] == 0)
+				(void) printf("%s", title);
+			else
+				(void) printf("%-*s  ", width, title);
+		}
+		(void) printf("\n");
+	}
+
+	free(subopts);
+	for (i = 0; i < ZFS_NPROP_ALL; i++)
+		if (shortsubopts[i][0])
+			free(shortsubopts[i]);
+	free(shortsubopts);
+
+	/* run for each object */
+	return (zfs_for_each(argc, argv, recurse, ZFS_TYPE_ANY,
+	    get_callback, &cb));
+}
+
+/*
+ * inherit [-r] <property> <fs|vol> ...
+ *
+ * 	-r	Recurse over all children
+ *
+ * For each dataset specified on the command line, inherit the given property
+ * from its parent.  Inheriting a property at the pool level will cause it to
+ * use the default value.  The '-r' flag will recurse over all children, and is
+ * useful for setting a property on a hierarchy-wide basis, regardless of any
+ * local modifications for each dataset.
+ */
+/*
+ * Per-dataset callback for 'zfs inherit'.  'data' carries the zfs_prop_t to
+ * inherit, cast to a pointer.  Returns 1 if the property could not be
+ * inherited and 0 otherwise.
+ */
+static int
+inherit_callback(zfs_handle_t *zhp, void *data)
+{
+	if (zfs_prop_inherit(zhp, (zfs_prop_t)data) != 0)
+		return (1);
+
+	return (0);
+}
+
+/*
+ * Implementation of 'zfs inherit'.  The property name is validated once up
+ * front, then inherit_callback() is run over every filesystem and volume
+ * named on the command line.
+ */
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+	zfs_prop_t prop;
+	char *propname;
+	int recurse = 0;
+	int opt;
+
+	/* '-r' is the only option */
+	while ((opt = getopt(argc, argv, "r")) != -1) {
+		if (opt == 'r') {
+			recurse = TRUE;
+		} else {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* a property and at least one dataset are required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(FALSE);
+	}
+
+	propname = argv[0];
+
+	/*
+	 * Validate the property here, before visiting any dataset, so that a
+	 * bad property yields one error message instead of one per dataset.
+	 */
+	prop = zfs_name_to_prop(propname);
+	if (prop == ZFS_PROP_INVAL) {
+		(void) fprintf(stderr, gettext("invalid property '%s'\n"),
+		    propname);
+		usage(FALSE);
+	}
+
+	if (zfs_prop_readonly(prop)) {
+		(void) fprintf(stderr, gettext("%s property is read-only\n"),
+		    propname);
+		return (1);
+	}
+
+	if (!zfs_prop_inheritable(prop)) {
+		(void) fprintf(stderr, gettext("%s property cannot be "
+		    "inherited\n"), propname);
+		(void) fprintf(stderr, gettext("use 'zfs set %s=none' to "
+		    "clear\n"), propname);
+		return (1);
+	}
+
+	/* everything after the property name is a dataset */
+	return (zfs_for_each(argc - 1, &argv[1], recurse,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    inherit_callback, (void *)prop));
+}
+
+/*
+ * list [-rH] [-o prop[,prop]*] [-t type[,type]*] [fs | vol | snap] ...
+ *
+ * 	-r	Recurse over all children
+ * 	-H	Scripted mode; elide headers and separate columns by tabs
+ * 	-o	Control which fields to display.
+ * 	-t	Control which dataset types to display; any combination of
+ * 		"filesystem", "volume" and "snapshot".
+ *
+ * When given no arguments, lists all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ *
+ * Without '-t', datasets of every type are selected.  Without '-o', the
+ * fields displayed are "name,used,available,referenced,mountpoint".
+ */
+typedef struct list_cbdata {
+	/* TRUE until the first dataset has been printed */
+	int	cb_first;
+	/* Set by -H: no header line, tab-separated columns */
+	int	cb_scripted;
+	/* Properties to display, in column order */
+	int	cb_fields[ZFS_NPROP_ALL];
+	/* Number of entries used in cb_fields */
+	int	cb_fieldcount;
+} list_cbdata_t;
+
+/*
+ * Print the header line for 'zfs list': one column title per field,
+ * separated by two spaces.  Every title but the last is laid out with the
+ * column's own format string so that it lines up with the data below.
+ */
+static void
+print_header(int *fields, size_t count)
+{
+	size_t col;
+
+	for (col = 0; col < count; col++) {
+		const char *title = zfs_prop_column_name(fields[col]);
+
+		if (col > 0)
+			(void) printf("  ");
+
+		if (col + 1 == count) {
+			/* the last column is never padded */
+			(void) printf("%s", title);
+		} else {
+			/* LINTED - format specifier */
+			(void) printf(zfs_prop_column_format(fields[col]),
+			    title);
+		}
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print one line of 'zfs list' output for a dataset: the value of each
+ * requested field, or "-" where the value cannot be retrieved.  In scripted
+ * mode the values are separated by single tabs and are not padded;
+ * otherwise every value but the last is laid out with its column format.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, int *fields, size_t count, int scripted)
+{
+	const char *separator = scripted ? "\t" : "  ";
+	char text[ZFS_MAXPROPLEN];
+	size_t col;
+
+	for (col = 0; col < count; col++) {
+		if (col > 0)
+			(void) printf("%s", separator);
+
+		if (zfs_prop_get(zhp, fields[col], text, sizeof (text),
+		    NULL, NULL, 0, FALSE) != 0)
+			(void) strlcpy(text, "-", sizeof (text));
+
+		if (!scripted && col + 1 != count) {
+			/* LINTED - format specifier */
+			(void) printf(zfs_prop_column_format(fields[col]),
+			    text);
+		} else {
+			(void) printf("%s", text);
+		}
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Per-dataset callback for 'zfs list'.  Prints the header line ahead of the
+ * first dataset (unless in scripted mode), then the dataset itself.  Always
+ * returns 0.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *state = data;
+
+	if (state->cb_first) {
+		/* remember that output has started */
+		state->cb_first = FALSE;
+
+		if (!state->cb_scripted)
+			print_header(state->cb_fields, state->cb_fieldcount);
+	}
+
+	print_dataset(zhp, state->cb_fields, state->cb_fieldcount,
+	    state->cb_scripted);
+
+	return (0);
+}
+
+/*
+ * Implementation of 'zfs list'.  Parses the options, turns the field list
+ * (from -o, or the default set) into property numbers, and then runs
+ * list_callback() over the selected datasets.  If the iteration succeeds
+ * without visiting anything, a note saying so is printed.
+ *
+ * Returns the result of zfs_for_each(); usage errors exit via usage().
+ */
+static int
+zfs_do_list(int argc, char **argv)
+{
+	/* getsubopt() writes into the string it parses, hence an array */
+	static char default_fields[] =
+	    "name,used,available,referenced,mountpoint";
+	char *type_subopts[] = { "filesystem", "volume", "snapshot", NULL };
+	char **subopts = zfs_prop_column_subopts();
+	char **shortsubopts = zfs_prop_column_short_subopts();
+	list_cbdata_t cb = { 0 };
+	char *fields = NULL;
+	char *token_start;
+	char *value;
+	int types = ZFS_TYPE_ANY;
+	int scripted = FALSE;
+	int recurse = 0;
+	int opt;
+	int prop;
+	int ret;
+	int i;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, ":o:rt:H")) != -1) {
+		switch (opt) {
+		case 'H':
+			scripted = TRUE;
+			break;
+		case 'r':
+			recurse = TRUE;
+			break;
+		case 'o':
+			fields = optarg;
+			break;
+		case 't':
+			/* an explicit list replaces the default of all types */
+			types = 0;
+			while (*optarg != '\0') {
+				int which = getsubopt(&optarg, type_subopts,
+				    &value);
+
+				if (which == 0) {
+					types |= ZFS_TYPE_FILESYSTEM;
+				} else if (which == 1) {
+					types |= ZFS_TYPE_VOLUME;
+				} else if (which == 2) {
+					types |= ZFS_TYPE_SNAPSHOT;
+				} else {
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(FALSE);
+				}
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (fields == NULL)
+		fields = default_fields;
+
+	/* translate the comma-separated field list into property numbers */
+	while (*fields != '\0') {
+		if (cb.cb_fieldcount == ZFS_NPROP_ALL) {
+			(void) fprintf(stderr, gettext("too many "
+			    "properties given to -o option\n"));
+			usage(FALSE);
+		}
+
+		/*
+		 * Try the full property names first, then rescan the same
+		 * token against the short names.
+		 */
+		token_start = fields;
+		prop = getsubopt(&fields, subopts, &value);
+		if (prop == -1) {
+			fields = token_start;
+			prop = getsubopt(&fields, shortsubopts, &value);
+		}
+
+		if (prop == -1) {
+			(void) fprintf(stderr, gettext("invalid property "
+			    "'%s'\n"), value);
+			usage(FALSE);
+		}
+
+		/* properties without a column name cannot be listed */
+		if (zfs_prop_column_name(prop) == NULL) {
+			(void) fprintf(stderr, gettext("invalid property "
+			    "'%s'\n"), zfs_prop_to_name(prop));
+			usage(FALSE);
+		}
+
+		cb.cb_fields[cb.cb_fieldcount] = prop;
+		cb.cb_fieldcount++;
+	}
+
+	cb.cb_first = TRUE;
+	cb.cb_scripted = scripted;
+
+	ret = zfs_for_each(argc, argv, recurse, types, list_callback, &cb);
+
+	/* cb_first still being set means the callback never ran */
+	if (ret == 0 && cb.cb_first == TRUE)
+		(void) printf(gettext("no datasets available\n"));
+
+	free(subopts);
+	for (i = 0; i < ZFS_NPROP_ALL; i++)
+		if (shortsubopts[i][0])
+			free(shortsubopts[i]);
+	free(shortsubopts);
+
+	return (ret);
+}
+
+/*
+ * zfs rename <fs | snap | vol> <fs | snap | vol>
+ *
+ * Renames the given dataset to another of the same type.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	zfs_handle_t *src;
+	int failed;
+
+	/* rename accepts no options, so reject anything that looks like one */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(FALSE);
+	}
+
+	/* exactly two operands are expected: the source and the target */
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing source dataset argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr,
+		    gettext("missing target dataset argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	/* the source may be a dataset of any type */
+	src = zfs_open(argv[1], ZFS_TYPE_ANY);
+	if (src == NULL)
+		return (1);
+
+	/* the handle is released whether or not the rename succeeded */
+	failed = (zfs_rename(src, argv[2]) != 0);
+	zfs_close(src);
+
+	return (failed ? 1 : 0);
+}
+
+/*
+ * zfs rollback [-rfR] <snapshot>
+ *
+ * 	-r	Delete any intervening snapshots before doing rollback
+ * 	-R	Delete any snapshots and their clones
+ * 	-f	Force unmount filesystems, even if they are in use.
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset.  If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+typedef struct rollback_cbdata {
+	uint64_t	cb_create;	/* creation txg of target snapshot */
+	int		cb_first;	/* no warning header printed yet */
+	int		cb_force;	/* '-f': pass MS_FORCE to unmount */
+	int		cb_doclones;	/* '-R' was given */
+	char		*cb_target;	/* snapshot named on command line */
+	int		cb_error;	/* set when any callback fails */
+	int		cb_recurse;	/* '-r' or '-R' was given */
+	int		cb_dependent;	/* currently iterating dependents */
+} rollback_cbdata_t;
+
+/*
+ * Report any snapshots more recent than the one specified.  Used when '-r' is
+ * not specified.  We reuse this same callback for the snapshot dependents - if
+ * 'cb_dependent' is set, then this is a dependent and we should report it
+ * without checking the transaction group.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+
+	/* '-R': nothing to report, but the handle must still be closed */
+	if (cbp->cb_doclones) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (!cbp->cb_dependent) {
+		if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 &&
+		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
+		    cbp->cb_create) {
+			if (cbp->cb_first && !cbp->cb_recurse) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "rollback to '%s': more recent snapshots "
+				    "exist\n"), cbp->cb_target);
+				(void) fprintf(stderr, gettext("use '-r' to "
+				    "force deletion of the following "
+				    "snapshots:\n"));
+				cbp->cb_first = 0;
+				cbp->cb_error = 1;
+			}
+			if (cbp->cb_recurse) {
+				cbp->cb_dependent = TRUE;
+				(void) zfs_iter_dependents(zhp, rollback_check,
+				    cbp);
+				cbp->cb_dependent = FALSE;
+			} else {
+				(void) fprintf(stderr, "%s\n",
+				    zfs_get_name(zhp));
+			}
+		}
+	} else {
+		if (cbp->cb_first && cbp->cb_recurse) {
+			(void) fprintf(stderr, gettext("cannot rollback to "
+			    "'%s': clones of previous snapshots exist\n"),
+			    cbp->cb_target);
+			(void) fprintf(stderr, gettext("use '-R' to "
+			    "force deletion of the following clones and "
+			    "dependents:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+	}
+
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Unmount any filesystems or snapshots that will need to be destroyed as part
+ * of the rollback process.
+ */
+static int
+rollback_unmount(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *state = data;
+	int umount_flags = state->cb_force ? MS_FORCE : 0;
+
+	if (state->cb_dependent) {
+		/* a dependent: unmount it without looking at its txg */
+		if (zfs_unmount(zhp, NULL, umount_flags) != 0)
+			state->cb_error = 1;
+	} else if (strcmp(zfs_get_name(zhp), state->cb_target) != 0 &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > state->cb_create) {
+		/* newer txg than the target: dependents first, then itself */
+		state->cb_dependent = TRUE;
+		(void) zfs_iter_dependents(zhp, rollback_unmount, state);
+		state->cb_dependent = FALSE;
+
+		if (zfs_unmount(zhp, NULL, umount_flags) != 0)
+			state->cb_error = 1;
+	}
+
+	/* every path releases the handle passed to the callback */
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Destroy any more recent snapshots.  We invoke this callback on any dependents
+ * of the snapshot first.  If the 'cb_dependent' member is non-zero, then this
+ * is a dependent and we should just destroy it without checking the transaction
+ * group.
+ */
+static int
+rollback_destroy(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *state = data;
+
+	if (state->cb_dependent) {
+		/* a dependent: destroy it without looking at its txg */
+		if (zfs_destroy(zhp) != 0)
+			state->cb_error = 1;
+	} else if (strcmp(zfs_get_name(zhp), state->cb_target) != 0 &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > state->cb_create) {
+		/* newer txg than the target: dependents first, then itself */
+		state->cb_dependent = TRUE;
+		(void) zfs_iter_dependents(zhp, rollback_destroy, state);
+		state->cb_dependent = FALSE;
+
+		if (zfs_destroy(zhp) != 0)
+			state->cb_error = 1;
+	}
+
+	/* every path releases the handle passed to the callback */
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Implements 'zfs rollback'.  Returns 0 on success and 1 on any failure.
+ */
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	int ret;
+	int c;
+	rollback_cbdata_t cb = { 0 };
+	int was_mounted;
+	zfs_handle_t *zhp, *snap;
+	char parentname[ZFS_MAXNAMELEN];
+	char *delim;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rfR")) != -1) {
+		switch (c) {
+		case 'f':
+			cb.cb_force = TRUE;
+			break;
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	cb.cb_target = argv[0];
+
+	/* open the snapshot */
+	if ((snap = zfs_open(cb.cb_target, ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+
+	(void) strlcpy(parentname, cb.cb_target, sizeof (parentname));
+	verify((delim = strrchr(parentname, '@')) != NULL);
+	*delim = '\0';
+	if ((zhp = zfs_open(parentname, ZFS_TYPE_ANY)) == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/* See if this dataset is mounted */
+	was_mounted = zfs_is_mounted(zhp, NULL);
+
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+
+	/*
+	 * Check for more recent snapshots and/or clones based on the presence
+	 * of '-r' and '-R'.
+	 */
+	cb.cb_first = 1;
+	cb.cb_error = 0;
+	(void) zfs_iter_children(zhp, rollback_check, &cb);
+
+	if ((ret = cb.cb_error) != 0)
+		goto out;
+
+	/*
+	 * Unmount any snapshots as well as the dataset itself.  The callback
+	 * reports failures through cb_error rather than its return value, so
+	 * check both before anything is destroyed.
+	 */
+	if ((ret = zfs_iter_children(zhp, rollback_unmount, &cb)) != 0 ||
+	    (ret = cb.cb_error) != 0 ||
+	    (ret = zfs_unmount(zhp, NULL, cb.cb_force ? MS_FORCE : 0)) != 0)
+		goto out;
+
+	(void) zfs_iter_children(zhp, rollback_destroy, &cb);
+
+	if ((ret = cb.cb_error) != 0)
+		goto out;
+
+	/*
+	 * Now that we have verified that the snapshot is the latest, rollback
+	 * to the given snapshot.
+	 */
+	ret = zfs_rollback(zhp);
+
+	/*
+	 * We only want to re-mount the filesystem if it was mounted in the
+	 * first place.
+	 */
+	if (was_mounted)
+		(void) zfs_mount(zhp, NULL, 0);
+
+out:
+	zfs_close(snap);
+	zfs_close(zhp);
+
+	return (ret != 0);
+}
+
+/*
+ * zfs set property=value { fs | snap | vol } ...
+ *
+ * Sets the given property for all datasets specified on the command line.
+ */
+typedef struct set_cbdata {
+	char		*cb_propname;	/* name half of "property=value" */
+	char		*cb_value;	/* value half of "property=value" */
+	zfs_prop_t	cb_prop;	/* cb_propname as a zfs_prop_t */
+} set_cbdata_t;
+
+/*
+ * Apply cb_prop = cb_value to a single dataset.  Returns 0, or 1 on failure.
+ */
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	set_cbdata_t *cbp = data;
+
+	/* don't allow setting of properties for snapshots */
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		(void) fprintf(stderr, gettext("cannot set %s property for "
+		    "'%s': snapshot properties cannot be modified\n"),
+		    cbp->cb_propname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * If we're changing the volsize, and the volsize and reservation are
+	 * the same, then change the reservation as well.
+	 */
+	if (cbp->cb_prop == ZFS_PROP_VOLSIZE &&
+	    zfs_get_type(zhp) == ZFS_TYPE_VOLUME &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE) ==
+	    zfs_prop_get_int(zhp, ZFS_PROP_RESERVATION)) {
+		uint64_t volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
+		uint64_t avail = zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE);
+		uint64_t value;
+
+		verify(zfs_nicestrtonum(cbp->cb_value, &value) == 0);
+
+		/*
+		 * Warn about raising the volume size greater than the amount of
+		 * available space.
+		 */
+		if (value > volsize && (value - volsize) > avail) {
+			(void) fprintf(stderr, gettext("cannot set "
+			    "%s property for '%s': volume size exceeds "
+			    "amount of available space\n"),
+			    cbp->cb_propname, zfs_get_name(zhp));
+			return (1);
+		}
+
+		if (zfs_prop_set(zhp, ZFS_PROP_RESERVATION,
+		    cbp->cb_value) != 0) {
+			(void) fprintf(stderr, gettext("volsize and "
+			    "reservation must remain equal\n"));
+			return (1);
+		}
+	}
+
+	/*
+	 * Do not allow the reservation to be set above the volume size. We do
+	 * this here instead of inside libzfs because libzfs violates this rule
+	 * internally.
+	 */
+	if (cbp->cb_prop == ZFS_PROP_RESERVATION &&
+	    zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
+		uint64_t value;
+		uint64_t volsize;
+
+		volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
+		if (strcmp(cbp->cb_value, "none") == 0)
+			value = 0;
+		else
+			verify(zfs_nicestrtonum(cbp->cb_value, &value) == 0);
+
+		if (value > volsize) {
+			(void) fprintf(stderr, gettext("cannot set %s "
+			    "for '%s': size is greater than current "
+			    "volume size\n"), cbp->cb_propname,
+			    zfs_get_name(zhp));
+			return (1);
+		}
+	}
+
+	if (zfs_prop_set(zhp, cbp->cb_prop, cbp->cb_value) != 0)
+		return (1);
+
+	return (0);
+}
+
+static int
+zfs_do_set(int argc, char **argv)
+{
+	set_cbdata_t args;
+
+	/* 'zfs set' has no options of its own */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(FALSE);
+	}
+
+	/* a property=value pair and at least one dataset are required */
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing property=value argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(FALSE);
+	}
+
+	/* split argv[1] in place at the first '=' */
+	args.cb_propname = argv[1];
+	args.cb_value = strchr(args.cb_propname, '=');
+	if (args.cb_value == NULL) {
+		(void) fprintf(stderr,
+		    gettext("missing value in property=value argument\n"));
+		usage(FALSE);
+	}
+
+	*args.cb_value++ = '\0';
+
+	/* neither side of the '=' may be empty */
+	if (args.cb_propname[0] == '\0') {
+		(void) fprintf(stderr,
+		    gettext("missing property in property=value argument\n"));
+		usage(FALSE);
+	}
+	if (args.cb_value[0] == '\0') {
+		(void) fprintf(stderr,
+		    gettext("missing value in property=value argument\n"));
+		usage(FALSE);
+	}
+
+	/* map the name onto a known property */
+	args.cb_prop = zfs_name_to_prop(args.cb_propname);
+	if (args.cb_prop == ZFS_PROP_INVAL) {
+		(void) fprintf(stderr,
+		    gettext("invalid property '%s'\n"), args.cb_propname);
+		usage(FALSE);
+	}
+
+	/*
+	 * Check the value against the property once, up front, so that a bad
+	 * value produces one error rather than one per dataset.
+	 */
+	if (zfs_prop_validate(args.cb_prop, args.cb_value, NULL) != 0)
+		return (1);
+
+	return (zfs_for_each(argc - 2, argv + 2, FALSE,
+	    ZFS_TYPE_ANY, set_callback, &args));
+}
+
+/*
+ * zfs snapshot <fs@snap>
+ *
+ * Creates a snapshot with the given name.  While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+	/* this subcommand accepts no options at all */
+	if (argc > 1 && *argv[1] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(FALSE);
+	}
+
+	/* exactly one operand, the snapshot name, is expected */
+	if (argc != 2) {
+		(void) fprintf(stderr, argc < 2 ?
+		    gettext("missing snapshot argument\n") :
+		    gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	if (zfs_snapshot(argv[1]) != 0)
+		return (1);
+	return (0);
+}
+
+/*
+ * zfs backup [-i <fs@snap>] <fs@snap>
+ *
+ * Send a backup stream to stdout.  Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_backup(int argc, char **argv)
+{
+	char *fromname = NULL;
+	zfs_handle_t *zhp_from = NULL, *zhp_to;
+	int c, err;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":i:")) != -1) {
+		switch (c) {
+		case 'i':
+			fromname = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	if (isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be written "
+			    "to a terminal.\n"
+			    "You must redirect standard output.\n"));
+		return (1);
+	}
+
+	if (fromname != NULL &&
+	    (zhp_from = zfs_open(fromname, ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+	if ((zhp_to = zfs_open(argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) {
+		err = 1;
+	} else {
+		err = zfs_backup(zhp_to, zhp_from);
+		zfs_close(zhp_to);
+	}
+
+	/* the '-i' handle is released on every path, including failure */
+	if (zhp_from != NULL)
+		zfs_close(zhp_from);
+	return (err != 0);
+}
+
+/*
+ * zfs restore [-dnv] <fs@snap>
+ *
+ * Restore a backup stream from stdin.
+ */
+static int
+zfs_do_restore(int argc, char **argv)
+{
+	int opt;
+	int prefix_mode = FALSE;
+	int no_changes = FALSE;
+	int chatty = FALSE;
+
+	/* parse the -d, -n and -v flags */
+	while ((opt = getopt(argc, argv, ":dnv")) != -1) {
+		switch (opt) {
+		case 'v':
+			chatty = TRUE;
+			break;
+		case 'n':
+			no_changes = TRUE;
+			break;
+		case 'd':
+			prefix_mode = TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* exactly one operand must remain */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	/* the stream is read from stdin, which must not be a terminal */
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be read "
+			    "from a terminal.\n"
+			    "You must redirect standard input.\n"));
+		return (1);
+	}
+
+	return (zfs_restore(argv[0], prefix_mode, chatty, no_changes) != 0);
+}
+
+
+/*
+ * Generic callback for sharing or mounting filesystems.  Because the code is so
+ * similar, we have a common function with an extra parameter to determine which
+ * mode we are using.
+ */
+#define	OP_SHARE	0x1
+#define	OP_MOUNT	0x2
+
+typedef struct share_mount_cbdata {
+	int	cb_type;	/* OP_SHARE or OP_MOUNT */
+	int	cb_explicit;	/* dataset was named on the command line */
+	int	cb_flags;	/* mount flags, e.g. MS_OVERLAY for '-O' */
+	const char *cb_options;	/* '-o' option string, or NULL */
+} share_mount_cbdata_t;
+
+/*
+ * Share or mount one filesystem.  Returns 0 on success or skip, 1 on error.
+ */
+static int
+share_mount_callback(zfs_handle_t *zhp, void *data)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char shareopts[ZFS_MAXPROPLEN];
+	share_mount_cbdata_t *cbp = data;
+	const char *cmdname = cbp->cb_type == OP_SHARE ? "share" : "mount";
+	struct mnttab mnt;
+	uint64_t zoned;
+
+	if (cbp->cb_options == NULL)
+		mnt.mnt_mntopts = "";
+	else
+		mnt.mnt_mntopts = (char *)cbp->cb_options;
+
+	/*
+	 * Check to make sure we can mount/share this dataset.  If we are in the
+	 * global zone and the filesystem is exported to a local zone, or if we
+	 * are in a local zone and the filesystem is not exported, then it is an
+	 * error.
+	 */
+	zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+	if (zoned && getzoneid() == GLOBAL_ZONEID) {
+		if (!cbp->cb_explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': dataset is "
+		    "exported to a local zone\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+
+	} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+		if (!cbp->cb_explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': permission "
+		    "denied\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * Ignore any filesystems which don't apply to us.  This includes those
+	 * with a legacy mountpoint, or those with legacy share options.
+	 */
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+	    sizeof (shareopts), NULL, NULL, 0, FALSE) == 0);
+
+	if (cbp->cb_type == OP_SHARE) {
+		if (strcmp(shareopts, "off") == 0) {
+			if (!cbp->cb_explicit)
+				return (0);
+
+			(void) fprintf(stderr, gettext("cannot share '%s': "
+			    "legacy share\n"), zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use share(1M) to "
+			    "share this filesystem\n"));
+			return (1);
+		}
+	}
+
+	/*
+	 * We cannot share or mount legacy filesystems.  If the shareopts is
+	 * non-legacy but the mountpoint is legacy, we treat it as a legacy
+	 * share.
+	 */
+	if (strcmp(mountpoint, "legacy") == 0) {
+		if (!cbp->cb_explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use %s to "
+		    "%s this filesystem\n"), cbp->cb_type == OP_SHARE ?
+		    "share(1M)" : "mount(1M)", cmdname);
+		return (1);
+	}
+
+	if (strcmp(mountpoint, "none") == 0) {
+		if (!cbp->cb_explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': no "
+		    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * At this point, we have verified that the mountpoint and/or shareopts
+	 * are appropriate for auto management.  Determine if the filesystem is
+	 * currently mounted or shared, and abort if this is an explicit
+	 * request.
+	 */
+	switch (cbp->cb_type) {
+	case OP_SHARE:
+		if (zfs_is_shared(zhp, NULL)) {
+			if (cbp->cb_explicit) {
+				(void) fprintf(stderr, gettext("cannot share "
+				    "'%s': filesystem already shared\n"),
+				    zfs_get_name(zhp));
+				return (1);
+			} else {
+				return (0);
+			}
+		}
+		break;
+
+	case OP_MOUNT:
+		if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+		    zfs_is_mounted(zhp, NULL)) {
+			if (cbp->cb_explicit) {
+				(void) fprintf(stderr, gettext("cannot mount "
+				    "'%s': filesystem already mounted\n"),
+				    zfs_get_name(zhp));
+				return (1);
+			} else {
+				return (0);
+			}
+		}
+		break;
+	}
+
+	/*
+	 * Mount and optionally share the filesystem.
+	 */
+	switch (cbp->cb_type) {
+	case OP_SHARE:
+		{
+			if (!zfs_is_mounted(zhp, NULL) &&
+			    zfs_mount(zhp, NULL, 0) != 0)
+				return (1);
+
+			if (zfs_share(zhp) != 0)
+				return (1);
+		}
+		break;
+
+	case OP_MOUNT:
+		if (zfs_mount(zhp, cbp->cb_options, cbp->cb_flags) != 0)
+			return (1);
+		break;
+	}
+
+	return (0);
+}
+
+static int
+share_or_mount(int type, int argc, char **argv)
+{
+	const char *optstring = (type == OP_MOUNT) ? ":ao:O" : "a";
+	share_mount_cbdata_t state = { 0 };
+	zfs_handle_t *zhp;
+	int everything = 0;
+	int opt, status;
+
+	state.cb_type = type;
+
+	/* only 'mount' understands -o and -O; both commands take -a */
+	while ((opt = getopt(argc, argv, optstring)) != -1) {
+		switch (opt) {
+		case 'a':
+			everything = 1;
+			break;
+		case 'o':
+			state.cb_options = optarg;
+			break;
+		case 'O':
+			state.cb_flags |= MS_OVERLAY;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (everything) {
+		/* '-a' takes no operands */
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(FALSE);
+		}
+		return (zfs_for_each(argc, argv, TRUE,
+		    ZFS_TYPE_FILESYSTEM, share_mount_callback, &state));
+	}
+
+	if (argc == 0) {
+		struct mnttab entry;
+
+		if (type == OP_SHARE) {
+			(void) fprintf(stderr, gettext("missing filesystem "
+			    "argument\n"));
+			usage(FALSE);
+		}
+
+		/*
+		 * A bare 'mount' lists the active ZFS mounts found in
+		 * /etc/mnttab.  Snapshots are left out, since they are
+		 * controlled automatically.
+		 */
+		rewind(mnttab_file);
+		while (getmntent(mnttab_file, &entry) == 0) {
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) == 0 &&
+			    strchr(entry.mnt_special, '@') == NULL) {
+				(void) printf("%-30s  %s\n", entry.mnt_special,
+				    entry.mnt_mountp);
+			}
+		}
+
+		return (0);
+	}
+
+	/* otherwise a single filesystem was named explicitly */
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	zhp = zfs_open(argv[0], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	/* explicit requests report conditions that '-a' silently skips */
+	state.cb_explicit = TRUE;
+	status = share_mount_callback(zhp, &state);
+	zfs_close(zhp);
+
+	return (status);
+}
+
+/*
+ * zfs mount [-o options] [-O] -a
+ * zfs mount [-o options] [-O] filesystem
+ *
+ * Mount all filesystems or the given one; with no arguments, list ZFS mounts.
+ */
+static int
+zfs_do_mount(int argc, char **argv)
+{
+	return (share_or_mount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs share -a
+ * zfs share filesystem
+ *
+ * Share all filesystems or the given one, mounting each first if necessary.
+ */
+static int
+zfs_do_share(int argc, char **argv)
+{
+	return (share_or_mount(OP_SHARE, argc, argv));
+}
+
+typedef struct unshare_unmount_node {
+	zfs_handle_t	*un_zhp;	/* open handle for the filesystem */
+	char		*un_mountp;	/* strdup()ed mountpoint; sort key */
+	uu_avl_node_t	un_avlnode;	/* linkage into the AVL tree */
+} unshare_unmount_node_t;
+
+/* ARGSUSED */
+static int
+unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
+{
+	const char *lpath = ((const unshare_unmount_node_t *)larg)->un_mountp;
+	const char *rpath = ((const unshare_unmount_node_t *)rarg)->un_mountp;
+	/* nodes are ordered by mountpoint path */
+	return (strcmp(lpath, rpath));
+}
+
+/*
+ * Convenience routine used by unshare_unmount() and manual_unmount().  Given
+ * an absolute path, find its entry in /etc/mnttab, verify that it is a ZFS
+ * filesystem, and unshare or unmount it.  Returns 0 on success, 1 on failure.
+ */
+static int
+unshare_unmount_path(int type, char *path, int flags, int is_manual)
+{
+	zfs_handle_t *zhp;
+	int ret;
+	struct stat64 statbuf;
+	struct extmnttab entry;
+	const char *cmdname = (type == OP_SHARE) ? "unshare" : "unmount";
+	char property[ZFS_MAXPROPLEN];
+
+	/*
+	 * Search for the path in /etc/mnttab.  Rather than looking for the
+	 * specific path, which can be fooled by non-standard paths (i.e. ".."
+	 * or "//"), we stat() the path and search for the corresponding
+	 * (major,minor) device pair.
+	 */
+	if (stat64(path, &statbuf) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
+		    cmdname, path, strerror(errno));
+		return (1);
+	}
+
+	/*
+	 * Search for the given (major,minor) pair in the mount table.
+	 */
+	rewind(mnttab_file);
+	while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
+		if (entry.mnt_major == major(statbuf.st_dev) &&
+		    entry.mnt_minor == minor(statbuf.st_dev))
+			break;
+	}
+	if (ret != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not "
+		    "currently mounted\n"), cmdname, path);
+		return (1);
+	}
+
+	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
+		    "filesystem\n"), cmdname, path);
+		return (1);
+	}
+
+	if ((zhp = zfs_open(entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	verify(zfs_prop_get(zhp, type == OP_SHARE ?
+		ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
+		sizeof (property), NULL, NULL, 0, FALSE) == 0);
+
+	if (type == OP_SHARE) {
+		if (strcmp(property, "off") == 0) {
+			(void) fprintf(stderr, gettext("cannot unshare "
+			    "'%s': legacy share\n"), path);
+			(void) fprintf(stderr, gettext("use "
+			    "unshare(1M) to unshare this filesystem\n"));
+			ret = 1;
+		} else if (!zfs_is_shared(zhp, NULL)) {
+			(void) fprintf(stderr, gettext("cannot unshare '%s': "
+			    "not currently shared\n"), path);
+			ret = 1;
+		} else {
+			ret = zfs_unshareall(zhp);
+		}
+	} else {
+		if (strcmp(property, "legacy") == 0 && !is_manual) {
+			(void) fprintf(stderr, gettext("cannot unmount "
+			    "'%s': legacy mountpoint\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use umount(1M) "
+			    "to unmount this filesystem\n"));
+			ret = 1;
+		} else {
+			ret = zfs_unmountall(zhp, flags);
+		}
+	}
+
+	zfs_close(zhp);
+
+	return (ret != 0);
+}
+
+/*
+ * Generic routine for unsharing or unmounting filesystems; returns 0 or 1.
+ */
+static int
+unshare_unmount(int type, int argc, char **argv)
+{
+	int do_all = 0;
+	int flags = 0;
+	int ret = 0;
+	int c;
+	zfs_handle_t *zhp;
+	char property[ZFS_MAXPROPLEN];
+
+	/* check options */
+	while ((c = getopt(argc, argv, type == OP_SHARE ? "a" : "af")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'f':
+			flags = MS_FORCE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* ensure correct number of arguments */
+	if (do_all) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(FALSE);
+		}
+	} else if (argc != 1) {
+		if (argc == 0)
+			(void) fprintf(stderr,
+			    gettext("missing filesystem argument\n"));
+		else
+			(void) fprintf(stderr,
+			    gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	if (do_all) {
+		/*
+		 * We could make use of zfs_for_each() to walk all datasets in
+		 * the system, but this would be very inefficient, especially
+		 * since we would have to linearly search /etc/mnttab for each
+		 * one.  Instead, do one pass through /etc/mnttab looking for
+		 * zfs entries and call zfs_unmount() for each one.
+		 *
+		 * Things get a little tricky if the administrator has created
+		 * mountpoints beneath other ZFS filesystems.  In this case, we
+		 * have to unmount the deepest filesystems first.  To accomplish
+		 * this, we place all the mountpoints in an AVL tree sorted by
+		 * mountpoint path, and walk the result in reverse so that
+		 * deeper mountpoints are handled before their parents.
+		 */
+		struct mnttab entry;
+		uu_avl_pool_t *pool;
+		uu_avl_t *tree;
+		unshare_unmount_node_t *node;
+		uu_avl_index_t idx;
+		uu_avl_walk_t *walk;
+
+		if ((pool = uu_avl_pool_create("unmount_pool",
+		    sizeof (unshare_unmount_node_t),
+		    offsetof(unshare_unmount_node_t, un_avlnode),
+		    unshare_unmount_compare,
+		    UU_DEFAULT)) == NULL) {
+			(void) fprintf(stderr, gettext("internal error: "
+			    "out of memory\n"));
+			exit(1);
+		}
+
+		if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
+			(void) fprintf(stderr, gettext("internal error: "
+			    "out of memory\n"));
+			exit(1);
+		}
+
+		rewind(mnttab_file);
+		while (getmntent(mnttab_file, &entry) == 0) {
+
+			/* ignore non-ZFS entries */
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
+				continue;
+
+			/* ignore snapshots */
+			if (strchr(entry.mnt_special, '@') != NULL)
+				continue;
+
+			if ((zhp = zfs_open(entry.mnt_special,
+			    ZFS_TYPE_FILESYSTEM)) == NULL) {
+				ret = 1;
+				continue;
+			}
+
+			verify(zfs_prop_get(zhp, type == OP_SHARE ?
+			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+			    property, sizeof (property), NULL, NULL,
+			    0, FALSE) == 0);
+
+			/* Ignore legacy mounts and shares */
+			if ((type == OP_SHARE &&
+			    strcmp(property, "off") == 0) ||
+			    (type == OP_MOUNT &&
+			    strcmp(property, "legacy") == 0)) {
+				zfs_close(zhp);
+				continue;
+			}
+
+			node = safe_malloc(sizeof (unshare_unmount_node_t));
+			node->un_zhp = zhp;
+
+			if ((node->un_mountp = strdup(entry.mnt_mountp)) ==
+			    NULL) {
+				(void) fprintf(stderr, gettext("internal error:"
+				    " out of memory\n"));
+				exit(1);
+			}
+
+			uu_avl_node_init(node, &node->un_avlnode, pool);
+
+			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
+				uu_avl_insert(tree, node, idx);
+			} else {
+				zfs_close(node->un_zhp);
+				free(node->un_mountp);
+				free(node);
+			}
+		}
+
+		/*
+		 * Walk the AVL tree in reverse, unmounting each filesystem and
+		 * removing it from the AVL tree in the process.
+		 */
+		if ((walk = uu_avl_walk_start(tree,
+		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
+			(void) fprintf(stderr,
+			    gettext("internal error: out of memory\n"));
+			exit(1);
+		}
+
+		while ((node = uu_avl_walk_next(walk)) != NULL) {
+			uu_avl_remove(tree, node);
+
+			switch (type) {
+			case OP_SHARE:
+				if (zfs_unshare(node->un_zhp,
+				    node->un_mountp) != 0)
+					ret = 1;
+				break;
+
+			case OP_MOUNT:
+				if (zfs_unmount(node->un_zhp,
+				    node->un_mountp, flags) != 0)
+					ret = 1;
+				break;
+			}
+
+			zfs_close(node->un_zhp);
+			free(node->un_mountp);
+			free(node);
+		}
+
+		uu_avl_walk_end(walk);
+		uu_avl_destroy(tree);
+		uu_avl_pool_destroy(pool);
+	} else {
+		/*
+		 * We have an argument, but it may be a full path or a ZFS
+		 * filesystem.  Pass full paths off to unshare_unmount_path()
+		 * (shared by manual_unmount), otherwise open the filesystem
+		 * and unshare or unmount it here.
+		 */
+		if (argv[0][0] == '/')
+			return (unshare_unmount_path(type, argv[0],
+				flags, FALSE));
+
+		if ((zhp = zfs_open(argv[0], ZFS_TYPE_FILESYSTEM)) == NULL)
+			return (1);
+
+		verify(zfs_prop_get(zhp, type == OP_SHARE ?
+		    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, property,
+		    sizeof (property), NULL, NULL, 0, FALSE) == 0);
+
+		switch (type) {
+		case OP_SHARE:
+			if (strcmp(property, "off") == 0) {
+				(void) fprintf(stderr, gettext("cannot unshare "
+				    "'%s': legacy share\n"), zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use unshare(1M)"
+				    " to unshare this filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_shared(zhp, NULL)) {
+				(void) fprintf(stderr, gettext("cannot unshare "
+				    "'%s': not currently shared\n"),
+				    zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unshareall(zhp) != 0) {
+				ret = 1;
+			}
+			break;
+
+		case OP_MOUNT:
+			if (strcmp(property, "legacy") == 0) {
+				(void) fprintf(stderr, gettext("cannot unmount "
+				    "'%s': legacy mountpoint\n"),
+				    zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use umount(1M) "
+				    "to unmount this filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_mounted(zhp, NULL)) {
+				(void) fprintf(stderr, gettext("cannot unmount "
+				    "'%s': not currently mounted\n"),
+				    zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unmountall(zhp, flags) != 0) {
+				ret = 1;
+			}
+		}
+
+		zfs_close(zhp);
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs unmount [-f] -a
+ * zfs unmount [-f] filesystem | mountpoint
+ *
+ * Unmount all filesystems, or a specific ZFS filesystem.
+ */
+static int
+zfs_do_unmount(int argc, char **argv)
+{
+	return (unshare_unmount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs unshare -a
+ * zfs unshare filesystem | mountpoint
+ *
+ * Unshare all filesystems, or a specific ZFS filesystem.
+ */
+static int
+zfs_do_unshare(int argc, char **argv)
+{
+	return (unshare_unmount(OP_SHARE, argc, argv));
+}
+
+/*
+ * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
+ * 'legacy'.  Otherwise, complain that the user should be using 'zfs mount'.
+ */
+static int
+manual_mount(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	char mountpoint[ZFS_MAXPROPLEN];
+	char mntopts[MNT_LINE_MAX] = { '\0' };
+	int ret = 0;
+	int c;
+	int flags = 0;
+	char *dataset, *path;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":o:O")) != -1) {
+		switch (c) {
+		case 'o':
+			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
+			break;
+		case 'O':
+			flags |= MS_OVERLAY;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr, gettext("usage: mount [-o opts] "
+			    "<path>\n"));
+			return (2);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check that we only have two arguments */
+	if (argc != 2) {
+		if (argc == 0)
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "argument\n"));
+		else if (argc == 1)
+			(void) fprintf(stderr,
+			    gettext("missing mountpoint argument\n"));
+		else
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
+		return (2);
+	}
+
+	dataset = argv[0];
+	path = argv[1];
+
+	/* try to open the dataset */
+	if ((zhp = zfs_open(dataset, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, FALSE);
+
+	/* check for legacy mountpoint and complain appropriately */
+	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
+		if (mount(dataset, path, MS_OPTIONSTR | flags, MNTTYPE_ZFS,
+		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
+			(void) fprintf(stderr, gettext("mount failed: %s\n"),
+			    strerror(errno));
+			ret = 1;
+		}
+	} else {
+		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+		    "mounted using 'mount -F zfs'\n"), dataset);
+		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
+		    "instead.\n"), path);
+		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
+		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
+		(void) fprintf(stderr, gettext("See zfs(1M) for more "
+		    "information.\n"));
+		ret = 1;
+	}
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * Entry point used when this binary is run as /etc/fs/zfs/umount.  Unlike a
+ * manual mount, unmounting is allowed for non-legacy filesystems too, as this
+ * is the dominant administrative interface.  The only option is -f, which
+ * sets MS_FORCE.  Returns 2 on a usage error, otherwise whatever
+ * unshare_unmount_path() returns.
+ */
+static int
+manual_unmount(int argc, char **argv)
+{
+	int opt;
+	int flags = 0;
+
+	/* parse the option letters */
+	while ((opt = getopt(argc, argv, "f")) != -1) {
+		if (opt == 'f') {
+			flags = MS_FORCE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr,
+			    gettext("usage: unmount [-f] <path>\n"));
+			return (2);
+		}
+	}
+
+	/* step past the options that were just consumed */
+	argv += optind;
+	argc -= optind;
+
+	/* exactly one operand, the path, must remain */
+	if (argc == 1)
+		return (unshare_unmount_path(OP_MOUNT, argv[0], flags, TRUE));
+
+	if (argc == 0)
+		(void) fprintf(stderr, gettext("missing path argument\n"));
+	else
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+	(void) fprintf(stderr, gettext("usage: unmount [-f] <path>\n"));
+
+	return (2);
+}
+
+/* zpool_iter() callback: create (data != NULL) or remove /dev/zvol links */
+static int
+volcheck(zpool_handle_t *zhp, void *data)
+{
+	/* 'data' carries a boolean; test it without narrowing to int */
+	if (data != NULL)
+		return (zpool_create_zvol_links(zhp));
+
+	return (zpool_remove_zvol_links(zhp));
+}
+
+/*
+ * Iterate over all pools, creating ('isinit' non-zero) or destroying /dev/zvol
+ * links.  Returns 1 if zpool_iter() returns non-zero and 0 otherwise.
+ */
+static int
+do_volcheck(int isinit)
+{
+	return (zpool_iter(volcheck, (void *)isinit) ? 1 : 0);
+}
+
+/*
+ * Entry point: act as the mount/umount helper when invoked under one of those
+ * names, otherwise dispatch argv[1] as a zfs subcommand.
+ */
+int
+main(int argc, char **argv)
+{
+	char *invoked_as;
+	char *subcmd;
+	int status;
+	int idx;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	/* keep getopt() quiet; option errors are reported by our own code */
+	opterr = 0;
+
+	mnttab_file = fopen(MNTTAB, "r");
+	if (mnttab_file == NULL) {
+		(void) fprintf(stderr, gettext("internal error: unable to "
+		    "open %s\n"), MNTTAB);
+		return (1);
+	}
+
+	/*
+	 * Besides being 'zfs', this binary doubles as the /etc/fs mount and
+	 * unmount program; argv[0] tells us which behavior to take.
+	 */
+	invoked_as = basename(argv[0]);
+	if (strcmp(invoked_as, "mount") == 0) {
+		status = manual_mount(argc, argv);
+	} else if (strcmp(invoked_as, "umount") == 0) {
+		status = manual_unmount(argc, argv);
+	} else {
+		/* a subcommand is mandatory */
+		if (argc < 2) {
+			(void) fprintf(stderr, gettext("missing command\n"));
+			usage(FALSE);
+		}
+
+		/* 'umount' is accepted as another spelling of 'unmount' */
+		subcmd = argv[1];
+		if (strcmp(subcmd, "umount") == 0)
+			subcmd = "unmount";
+
+		/* '-?' is special-cased ahead of the command lookup */
+		if (strcmp(subcmd, "-?") == 0)
+			usage(TRUE);
+
+		/*
+		 * 'volinit' and 'volfini' are not listed in the usage message,
+		 * so they are dispatched by hand, ahead of the table lookup.
+		 */
+		if (strcmp(subcmd, "volinit") == 0)
+			return (do_volcheck(TRUE));
+		if (strcmp(subcmd, "volfini") == 0)
+			return (do_volcheck(FALSE));
+
+		/*
+		 * Look the subcommand up in command_table, skipping entries
+		 * with no name, and run the first one that matches.
+		 */
+		for (idx = 0; idx < NCOMMAND; idx++) {
+			if (command_table[idx].name != NULL &&
+			    strcmp(subcmd, command_table[idx].name) == 0) {
+				current_command = &command_table[idx];
+				status = command_table[idx].func(argc - 1,
+				    argv + 1);
+				break;
+			}
+		}
+
+		/* falling off the end of the table means nothing matched */
+		if (idx == NCOMMAND) {
+			(void) fprintf(stderr, gettext("unrecognized "
+			    "command '%s'\n"), subcmd);
+			usage(FALSE);
+		}
+	}
+
+	(void) fclose(mnttab_file);
+
+	/*
+	 * If 'ZFS_ABORT' is set in the environment, dump core on exit so that
+	 * ::findleaks can be run against the result.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (status);
+}
diff --git a/usr/src/cmd/zfs/zfs_util.h b/usr/src/cmd/zfs/zfs_util.h
new file mode 100644
index 000000000000..5b2fcfa9f345
--- /dev/null
+++ b/usr/src/cmd/zfs/zfs_util.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZFS_UTIL_H
+#define	_ZFS_UTIL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+void *safe_malloc(size_t size);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_UTIL_H */
diff --git a/usr/src/cmd/zoneadm/Makefile b/usr/src/cmd/zoneadm/Makefile
index cdf36e185ec7..15d5233ab79d 100644
--- a/usr/src/cmd/zoneadm/Makefile
+++ b/usr/src/cmd/zoneadm/Makefile
@@ -38,7 +38,7 @@ $(ROOTMANIFEST)         := FILEMODE= 444
 $(ROOTMANIFESTDIR)/%: %
 	$(INS.file)
 
-LDLIBS += -lzonecfg -lsocket -lgen -lpool -lbsm
+LDLIBS += -lzonecfg -lsocket -lgen -lpool -lbsm -lzfs
 
 lint := LINTFLAGS += -ux
 
diff --git a/usr/src/cmd/zoneadm/zoneadm.c b/usr/src/cmd/zoneadm/zoneadm.c
index 287aa4ca95ef..53bf2848bcc5 100644
--- a/usr/src/cmd/zoneadm/zoneadm.c
+++ b/usr/src/cmd/zoneadm/zoneadm.c
@@ -65,6 +65,7 @@
 #include <sys/sockio.h>
 #include <sys/mntent.h>
 #include <limits.h>
+#include <libzfs.h>
 
 #include <fcntl.h>
 #include <door.h>
@@ -1899,6 +1900,117 @@ verify_filesystems(zone_dochandle_t handle)
 	return (return_code);
 }
 
+const char *current_dataset;
+
+/*
+ * Error handler installed for the libzfs calls made by the checks below.  A
+ * message starting with 'cannot open ' has everything before its first ':'
+ * replaced by our prefix; anything else, or one with no ':', is kept verbatim.
+ */
+static void
+zfs_error_handler(const char *fmt, va_list ap)
+{
+	char buf[1024];
+	char *colon;	/* tail of a 'cannot open' message, NULL if none */
+
+	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
+	colon = (strncmp(gettext("cannot open "), buf,
+	    strlen(gettext("cannot open "))) == 0) ? strchr(buf, ':') : NULL;
+	if (colon != NULL)
+		(void) fprintf(stderr, gettext("cannot verify zfs "
+		    "dataset %s%s\n"), current_dataset, colon);
+	else
+		(void) fprintf(stderr, gettext("cannot verify zfs dataset "
+		    "%s: %s\n"), current_dataset, buf);
+}
+
+/* ARGSUSED */
+static int
+check_zvol(zfs_handle_t *zhp, void *unused)
+{
+	int status = -1;
+
+	/* a volume anywhere in the tree is an error; otherwise recurse */
+	if (zfs_get_type(zhp) != ZFS_TYPE_VOLUME)
+		status = zfs_iter_children(zhp, check_zvol, NULL);
+	else
+		(void) fprintf(stderr, gettext("cannot verify zfs dataset %s: "
+		    "volumes cannot be specified as a zone dataset resource\n"),
+		    zfs_get_name(zhp));
+
+	/* this callback closes the handle it was handed */
+	zfs_close(zhp);
+
+	return (status);
+}
+
+/*
+ * Check that every dataset configured for the zone can be opened, does not
+ * inherit its mountpoint, and neither is nor contains a zvol.  All datasets
+ * are examined even after one of them has failed a check.
+ *
+ * Note that nothing is done with the 'zoned' property here.  It is managed
+ * in zoneadmd when the zone is actually rebooted, so that it is set
+ * automatically even when the administrator reboots the zone.
+ */
+static int
+verify_datasets(zone_dochandle_t handle)
+{
+	struct zone_dstab dstab;
+	zfs_handle_t *zhp;
+	zfs_source_t srctype;
+	char source[ZFS_MAXNAMELEN];
+	char propbuf[ZFS_MAXPROPLEN];
+	int return_code = Z_OK;
+
+	if (zonecfg_setdsent(handle) != Z_OK) {
+		(void) fprintf(stderr, gettext("cannot verify zfs datasets: "
+		    "unable to enumerate datasets\n"));
+		return (Z_ERR);
+	}
+
+	zfs_set_error_handler(zfs_error_handler);
+
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+		char *name = dstab.zone_dataset_name;
+
+		/* zfs_error_handler() names this dataset in its messages */
+		current_dataset = name;
+
+		zhp = zfs_open(name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			return_code = Z_ERR;
+			continue;
+		}
+
+		if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, propbuf,
+		    sizeof (propbuf), &srctype, source,
+		    sizeof (source), 0) == 0 &&
+		    srctype == ZFS_SRC_INHERITED) {
+			(void) fprintf(stderr, gettext("cannot verify zfs "
+			    "dataset %s: mountpoint cannot be inherited\n"),
+			    name);
+			return_code = Z_ERR;
+		} else {
+			/* a zvol is rejected, but its children still walked */
+			if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
+				(void) fprintf(stderr, gettext("cannot verify "
+				    "zfs dataset %s: volumes cannot be "
+				    "specified as a zone dataset resource\n"),
+				    name);
+				return_code = Z_ERR;
+			}
+			if (zfs_iter_children(zhp, check_zvol, NULL) != 0)
+				return_code = Z_ERR;
+		}
+
+		zfs_close(zhp);
+	}
+	(void) zonecfg_enddsent(handle);
+
+	return (return_code);
+}
+
 static int
 verify_details(int cmd_num)
 {
@@ -2009,6 +2121,8 @@ verify_details(int cmd_num)
 		return_code = Z_ERR;
 	if (!in_alt_root && verify_pool(handle) != Z_OK)
 		return_code = Z_ERR;
+	if (!in_alt_root && verify_datasets(handle) != Z_OK)
+		return_code = Z_ERR;
 	zonecfg_fini_handle(handle);
 	if (return_code == Z_ERR)
 		(void) fprintf(stderr,
diff --git a/usr/src/cmd/zoneadmd/Makefile b/usr/src/cmd/zoneadmd/Makefile
index faf58fd7a8f2..b561ecb3ec83 100644
--- a/usr/src/cmd/zoneadmd/Makefile
+++ b/usr/src/cmd/zoneadmd/Makefile
@@ -39,7 +39,7 @@ POFILES= $(OBJS:%.o=%.po)
 CFLAGS += $(CCVERBOSE)
 LINTFLAGS += -ux
 LDLIBS += -lsocket -lzonecfg -lnsl -ldevinfo -ldevice -lnvpair -lpool \
-	-lgen -lbsm -lcontract
+	-lgen -lbsm -lcontract -lzfs
 XGETFLAGS += -a -x zoneadmd.xcl
 
 .KEEP_STATE:
diff --git a/usr/src/cmd/zoneadmd/vplat.c b/usr/src/cmd/zoneadmd/vplat.c
index 75eca589611f..98dd9e67bc7a 100644
--- a/usr/src/cmd/zoneadmd/vplat.c
+++ b/usr/src/cmd/zoneadmd/vplat.c
@@ -91,6 +91,7 @@
 #include <wait.h>
 #include <limits.h>
 #include <libgen.h>
+#include <libzfs.h>
 #include <zone.h>
 #include <assert.h>
 
@@ -98,6 +99,7 @@
 #include <sys/mnttab.h>
 #include <sys/fs/autofs.h>	/* for _autofssys() */
 #include <sys/fs/lofs_info.h>
+#include <sys/fs/zfs.h>
 
 #include <pool.h>
 #include <sys/pool.h>
@@ -1418,6 +1420,14 @@ mount_filesystems(zlog_t *zlogp, boolean_t mount_cmd)
 		goto bad;
 	}
 	while (zonecfg_getfsent(handle, &fstab) == Z_OK) {
+		/*
+		 * ZFS filesystems will not be accessible under an alternate
+		 * root, since the pool will not be known.  Ignore them in this
+		 * case.
+		 */
+		if (mount_cmd && strcmp(fstab.zone_fs_type, MNTTYPE_ZFS) == 0)
+			continue;
+
 		num_fs++;
 		if ((tmp_ptr = realloc(fs_ptr,
 		    num_fs * sizeof (*tmp_ptr))) == NULL) {
@@ -2438,6 +2448,150 @@ get_zone_pool(zlog_t *zlogp, char *poolbuf, size_t bufsz)
 	return (error);
 }
 
+/*
+ * Collect the zone's dataset names into a malloc()ed, comma-separated string.
+ * Returns 0 and fills in bufp/bufsizep (NULL/0 if none are configured), or -1.
+ */
+static int
+get_datasets(zlog_t *zlogp, char **bufp, size_t *bufsizep)
+{
+	zone_dochandle_t handle;
+	struct zone_dstab dstab;
+	size_t total = 0, offset = 0;
+	int error = -1;
+	char *str = NULL;	/* stays NULL until allocated; see 'out' */
+
+	*bufp = NULL;
+	*bufsizep = 0;
+
+	if ((handle = zonecfg_init_handle()) == NULL) {
+		zerror(zlogp, B_TRUE, "getting zone configuration handle");
+		return (-1);
+	}
+	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK) {
+		zerror(zlogp, B_FALSE, "invalid configuration");
+		zonecfg_fini_handle(handle);
+		return (-1);
+	}
+
+	if (zonecfg_setdsent(handle) != Z_OK) {
+		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
+		goto out;
+	}
+
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK)
+		total += strlen(dstab.zone_dataset_name) + 1;
+	(void) zonecfg_enddsent(handle);
+
+	if (total == 0) {
+		error = 0;
+		goto out;
+	}
+
+	if ((str = malloc(total)) == NULL) {
+		zerror(zlogp, B_TRUE, "memory allocation failed");
+		goto out;
+	}
+
+	if (zonecfg_setdsent(handle) != Z_OK) {
+		zerror(zlogp, B_FALSE, "%s failed", "zonecfg_setdsent");
+		goto out;
+	}
+	/* second pass: copy, bounding each copy by the room left in 'str' */
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+		offset += strlcpy(str + offset, dstab.zone_dataset_name,
+		    total - offset);
+		if (offset != total - 1)
+			str[offset++] = ',';
+	}
+	(void) zonecfg_enddsent(handle);
+
+	error = 0;
+	*bufp = str;
+	*bufsizep = total;
+
+out:
+	if (error != 0)
+		free(str);
+	zonecfg_fini_handle(handle);
+
+	return (error);
+}
+
+/* ARGSUSED */
+static void
+zfs_error_handler(const char *fmt, va_list ap)
+{
+	/*
+	 * Deliberately empty - validate_datasets() reports failures itself.
+	 */
+}
+
+/*
+ * Open each dataset configured for the zone and turn its 'zoned' property on
+ * if it is not already set.  Returns 0 on success and -1 on the first failure.
+ */
+static int
+validate_datasets(zlog_t *zlogp)
+{
+	struct zone_dstab dstab;
+	zone_dochandle_t handle;
+	zfs_handle_t *zhp;
+
+	handle = zonecfg_init_handle();
+	if (handle == NULL) {
+		zerror(zlogp, B_TRUE, "getting zone configuration handle");
+		return (-1);
+	}
+
+	if (zonecfg_get_snapshot_handle(zone_name, handle) != Z_OK ||
+	    zonecfg_setdsent(handle) != Z_OK) {
+		zerror(zlogp, B_FALSE, "invalid configuration");
+		zonecfg_fini_handle(handle);
+		return (-1);
+	}
+
+	zfs_set_error_handler(zfs_error_handler);
+
+	/*
+	 * libzfs opens /dev/zfs during its .init routine.
+	 * zoneadmd automatically closes these files when it daemonizes,
+	 * so we cheat by re-calling the init routine.
+	 */
+	zfs_init();
+
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+		char *name = dstab.zone_dataset_name;
+
+		zhp = zfs_open(name, ZFS_TYPE_FILESYSTEM);
+		if (zhp == NULL) {
+			zerror(zlogp, B_FALSE, "cannot open ZFS dataset '%s'",
+			    name);
+			zonecfg_fini_handle(handle);
+			return (-1);
+		}
+
+		/*
+		 * Turn 'zoned' on automatically, but only when it is currently
+		 * off: we would get EPERM if it were already set.
+		 */
+		if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) == 0 &&
+		    zfs_prop_set(zhp, ZFS_PROP_ZONED, "on") != 0) {
+			zerror(zlogp, B_FALSE, "cannot set 'zoned' "
+			    "property for ZFS dataset '%s'\n", name);
+			zonecfg_fini_handle(handle);
+			zfs_close(zhp);
+			return (-1);
+		}
+
+		zfs_close(zhp);
+	}
+	(void) zonecfg_enddsent(handle);
+	zonecfg_fini_handle(handle);
+
+	return (0);
+}
+
 static int
 bind_to_pool(zlog_t *zlogp, zoneid_t zoneid)
 {
@@ -2611,6 +2765,8 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
 	char rootpath[MAXPATHLEN];
 	char *rctlbuf = NULL;
 	size_t rctlbufsz = 0;
+	char *zfsbuf = NULL;
+	size_t zfsbufsz = 0;
 	zoneid_t zoneid = -1;
 	int xerr;
 	char *kzone;
@@ -2636,6 +2792,10 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
 		zerror(zlogp, B_FALSE, "Unable to get list of rctls");
 		goto error;
 	}
+	if (get_datasets(zlogp, &zfsbuf, &zfsbufsz) != 0) {
+		zerror(zlogp, B_FALSE, "Unable to get list of ZFS datasets");
+		goto error;
+	}
 
 	kzone = zone_name;
 
@@ -2706,7 +2866,7 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
 
 	xerr = 0;
 	if ((zoneid = zone_create(kzone, rootpath, privs, rctlbuf,
-	    rctlbufsz, &xerr)) == -1) {
+	    rctlbufsz, zfsbuf, zfsbufsz, &xerr)) == -1) {
 		if (xerr == ZE_AREMOUNTS) {
 			if (zonecfg_find_mounts(rootpath, NULL, NULL) < 1) {
 				zerror(zlogp, B_FALSE,
@@ -2762,6 +2922,11 @@ vplat_create(zlog_t *zlogp, boolean_t mount_cmd)
 int
 vplat_bringup(zlog_t *zlogp, boolean_t mount_cmd)
 {
+	if (!mount_cmd && validate_datasets(zlogp) != 0) {
+		lofs_discard_mnttab();
+		return (-1);
+	}
+
 	if (create_dev_files(zlogp) != 0 ||
 	    mount_filesystems(zlogp, mount_cmd) != 0) {
 		lofs_discard_mnttab();
diff --git a/usr/src/cmd/zonecfg/Makefile b/usr/src/cmd/zonecfg/Makefile
index 606f4bf40d90..ac0ce5704702 100644
--- a/usr/src/cmd/zonecfg/Makefile
+++ b/usr/src/cmd/zonecfg/Makefile
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -33,7 +33,7 @@ include ../Makefile.cmd
 
 LFLAGS =	-t
 YFLAGS =	-d -b zonecfg_grammar
-LDLIBS +=	-lzonecfg -ll -lnsl -ltecla
+LDLIBS +=	-lzonecfg -ll -lnsl -ltecla -lzfs
 CPPFLAGS +=	-I.
 CLEANFILES +=	zonecfg_lex.c zonecfg_grammar.tab.c zonecfg_grammar.tab.h
 
diff --git a/usr/src/cmd/zonecfg/zonecfg.c b/usr/src/cmd/zonecfg/zonecfg.c
index 69e931e1a941..8c7c421fad39 100644
--- a/usr/src/cmd/zonecfg/zonecfg.c
+++ b/usr/src/cmd/zonecfg/zonecfg.c
@@ -72,6 +72,7 @@
 #include <regex.h>
 #include <signal.h>
 #include <libtecla.h>
+#include <libzfs.h>
 
 #include <libzonecfg.h>
 #include "zonecfg.h"
@@ -153,6 +154,7 @@ static char *res_types[] = {
 	"device",
 	"rctl",
 	"attr",
+	"dataset",
 	NULL
 };
 
@@ -219,6 +221,7 @@ static const char *add_cmds[] = {
 	"add device",
 	"add rctl",
 	"add attr",
+	"add dataset",
 	NULL
 };
 
@@ -229,6 +232,7 @@ static const char *select_cmds[] = {
 	"select device ",
 	"select rctl ",
 	"select attr ",
+	"select dataset ",
 	NULL
 };
 
@@ -308,6 +312,16 @@ static const char *rctl_res_scope_cmds[] = {
 	NULL
 };
 
+static const char *dataset_res_scope_cmds[] = {
+	"cancel",
+	"end",
+	"exit",
+	"help",
+	"info",
+	"set name=",	/* 'name' is the only property set in this scope */
+	NULL		/* end of list */
+};
+
 /* Global variables */
 
 /* set early in main(), never modified thereafter, used all over the place */
@@ -365,6 +379,7 @@ static struct zone_nwiftab	old_nwiftab, in_progress_nwiftab;
 static struct zone_devtab	old_devtab, in_progress_devtab;
 static struct zone_rctltab	old_rctltab, in_progress_rctltab;
 static struct zone_attrtab	old_attrtab, in_progress_attrtab;
+static struct zone_dstab	old_dstab, in_progress_dstab;
 
 static GetLine *gl;	/* The gl_get_line() resource object */
 
@@ -426,6 +441,8 @@ CPL_MATCH_FN(cmd_cpl_fn)
 		return (add_stuff(cpl, line, rctl_res_scope_cmds, word_end));
 	case RT_ATTR:
 		return (add_stuff(cpl, line, attr_res_scope_cmds, word_end));
+	case RT_DATASET:
+		return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end));
 	}
 	return (0);
 }
@@ -784,6 +801,14 @@ usage(bool verbose, uint_t flags)
 			(void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
 			    pt_to_str(PT_VALUE), gettext("<unsigned integer>"));
 			break;
+		case RT_DATASET:
+			(void) fprintf(fp, gettext("The '%s' resource scope is "
+			    "used to export ZFS datasets.\n"),
+			    rt_to_str(resource_scope));
+			(void) fprintf(fp, gettext("Valid commands:\n"));
+			(void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+			    pt_to_str(PT_NAME), gettext("<name>"));
+			break;
 		}
 		(void) fprintf(fp, gettext("And from any resource scope, you "
 		    "can:\n"));
@@ -872,6 +897,8 @@ usage(bool verbose, uint_t flags)
 		(void) fprintf(fp, "\t%s\t\t%s, %s, %s\n", rt_to_str(RT_ATTR),
 		    pt_to_str(PT_NAME), pt_to_str(PT_TYPE),
 		    pt_to_str(PT_VALUE));
+		(void) fprintf(fp, "\t%s\t\t%s\n", rt_to_str(RT_DATASET),
+		    pt_to_str(PT_NAME));
 	}
 	if (need_to_close)
 		(void) pclose(fp);
@@ -1242,6 +1269,7 @@ export_func(cmd_t *cmd)
 	struct zone_devtab devtab;
 	struct zone_attrtab attrtab;
 	struct zone_rctltab rctltab;
+	struct zone_dstab dstab;
 	struct zone_rctlvaltab *valptr;
 	int err, arg;
 	char zonepath[MAXPATHLEN], outfile[MAXPATHLEN], pool[MAXNAMELEN];
@@ -1411,6 +1439,18 @@ export_func(cmd_t *cmd)
 	}
 	(void) zonecfg_endattrent(handle);
 
+	if ((err = zonecfg_setdsent(handle)) != Z_OK) {
+		zone_perror(zone, err, FALSE);
+		goto done;
+	}
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+		(void) fprintf(of, "%s %s\n", cmd_to_str(CMD_ADD),
+		    rt_to_str(RT_DATASET));
+		export_prop(of, PT_NAME, dstab.zone_dataset_name);
+		(void) fprintf(of, "%s\n", cmd_to_str(CMD_END));
+	}
+	(void) zonecfg_enddsent(handle);
+
 done:
 	if (need_to_close)
 		(void) fclose(of);
@@ -1507,6 +1547,9 @@ add_resource(cmd_t *cmd)
 	case RT_ATTR:
 		bzero(&in_progress_attrtab, sizeof (in_progress_attrtab));
 		return;
+	case RT_DATASET:
+		bzero(&in_progress_dstab, sizeof (in_progress_dstab));
+		return;
 	default:
 		zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
 		long_usage(CMD_ADD, TRUE);
@@ -2077,6 +2120,39 @@ fill_in_attrtab(cmd_t *cmd, struct zone_attrtab *attrtab, bool fill_in_only)
 	return (err);
 }
 
+static int
+fill_in_dstab(cmd_t *cmd, struct zone_dstab *dstab, bool fill_in_only)
+{
+	property_value_ptr_t val;
+	int rc, n;
+
+	rc = initialize(TRUE);
+	if (rc != Z_OK)
+		return (rc);
+
+	/* begin with an empty name; only a 'name' pair may fill it in */
+	dstab->zone_dataset_name[0] = '\0';
+	for (n = 0; n < cmd->cmd_prop_nv_pairs; n++) {
+		val = cmd->cmd_property_ptr[n];
+		if (!(val->pv_type == PROP_VAL_SIMPLE &&
+		    val->pv_simple != NULL)) {
+			zerr(gettext("A simple value was expected here."));
+			saw_error = TRUE;
+			return (Z_INSUFFICIENT_SPEC);
+		}
+		if (cmd->cmd_prop_name[n] != PT_NAME) {
+			zone_perror(pt_to_str(cmd->cmd_prop_name[n]),
+			    Z_NO_PROPERTY_TYPE, TRUE);
+			return (Z_INSUFFICIENT_SPEC);
+		}
+		(void) strlcpy(dstab->zone_dataset_name, val->pv_simple,
+		    sizeof (dstab->zone_dataset_name));
+	}
+
+	/* unless only filling in, finish with zonecfg_lookup_ds() */
+	return (fill_in_only ? Z_OK : zonecfg_lookup_ds(handle, dstab));
+}
+
 static void
 remove_resource(cmd_t *cmd)
 {
@@ -2086,6 +2162,7 @@ remove_resource(cmd_t *cmd)
 	struct zone_devtab devtab;
 	struct zone_attrtab attrtab;
 	struct zone_rctltab rctltab;
+	struct zone_dstab dstab;
 
 	if ((type = cmd->cmd_res_type) == RT_UNKNOWN) {
 		long_usage(CMD_REMOVE, TRUE);
@@ -2164,6 +2241,16 @@ remove_resource(cmd_t *cmd)
 		else
 			need_to_commit = TRUE;
 		return;
+	case RT_DATASET:
+		if ((err = fill_in_dstab(cmd, &dstab, FALSE)) != Z_OK) {
+			z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE);
+			return;
+		}
+		if ((err = zonecfg_delete_ds(handle, &dstab)) != Z_OK)
+			z_cmd_rt_perror(CMD_REMOVE, RT_DATASET, err, TRUE);
+		else
+			need_to_commit = TRUE;
+		return;
 	default:
 		zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
 		long_usage(CMD_REMOVE, TRUE);
@@ -2398,6 +2485,14 @@ select_func(cmd_t *cmd)
 		bcopy(&old_attrtab, &in_progress_attrtab,
 		    sizeof (struct zone_attrtab));
 		return;
+	case RT_DATASET:
+		if ((err = fill_in_dstab(cmd, &old_dstab, FALSE)) != Z_OK) {
+			z_cmd_rt_perror(CMD_SELECT, RT_DATASET, err, TRUE);
+			global_scope = TRUE;
+		}
+		bcopy(&old_dstab, &in_progress_dstab,
+		    sizeof (struct zone_dstab));
+		return;
 	default:
 		zone_perror(rt_to_str(type), Z_NO_RESOURCE_TYPE, TRUE);
 		long_usage(CMD_SELECT, TRUE);
@@ -2801,6 +2896,20 @@ set_func(cmd_t *cmd)
 			return;
 		}
 		return;
+	case RT_DATASET:
+		switch (prop_type) {
+		case PT_NAME:
+			(void) strlcpy(in_progress_dstab.zone_dataset_name,
+			    prop_id,
+			    sizeof (in_progress_dstab.zone_dataset_name));
+			return;
+		default:
+			break;
+		}
+		zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE, TRUE);
+		long_usage(CMD_SET, TRUE);
+		usage(FALSE, HELP_PROPS);
+		return;
 	default:
 		zone_perror(rt_to_str(res_type), Z_NO_RESOURCE_TYPE, TRUE);
 		long_usage(CMD_SET, TRUE);
@@ -3149,6 +3258,46 @@ info_attr(zone_dochandle_t handle, FILE *fp, cmd_t *cmd)
 		    rt_to_str(RT_ATTR));
 }
 
+static void
+output_ds(FILE *fp, struct zone_dstab *dstab)
+{
+	(void) fprintf(fp, "%s:\n", rt_to_str(RT_DATASET));	/* heading */
+	output_prop(fp, PT_NAME, dstab->zone_dataset_name, B_TRUE);
+}
+
+/*
+ * Print the zone's dataset resources.  If name/value pairs were given, print
+ * only the resources matching them, and say so when there were none.
+ */
+static void
+info_ds(zone_dochandle_t handle, FILE *fp, cmd_t *cmd)
+{
+	struct zone_dstab lookup, user;
+	bool output = FALSE;
+
+	/* the dataset iterator must be set up before zonecfg_getdsent() */
+	if (zonecfg_setdsent(handle) != Z_OK)
+		return;
+	while (zonecfg_getdsent(handle, &lookup) == Z_OK) {
+		if (cmd->cmd_prop_nv_pairs == 0) {
+			output_ds(fp, &lookup);
+			continue;
+		}
+		if (fill_in_dstab(cmd, &user, TRUE) != Z_OK)
+			continue;
+		if (strlen(user.zone_dataset_name) > 0 &&
+		    strcmp(user.zone_dataset_name,
+		    lookup.zone_dataset_name) != 0)
+			continue;	/* no match */
+		output_ds(fp, &lookup);
+		output = TRUE;
+	}
+	(void) zonecfg_enddsent(handle);
+	if (!output && cmd->cmd_prop_nv_pairs > 0)
+		(void) printf(gettext("No such %s resource.\n"),
+		    rt_to_str(RT_DATASET));
+}
+
 void
 info_func(cmd_t *cmd)
 {
@@ -3192,6 +3341,9 @@ info_func(cmd_t *cmd)
 		case RT_ATTR:
 			output_attr(fp, &in_progress_attrtab);
 			break;
+		case RT_DATASET:
+			output_ds(fp, &in_progress_dstab);
+			break;
 		}
 		goto cleanup;
 	}
@@ -3208,6 +3360,7 @@ info_func(cmd_t *cmd)
 		info_dev(handle, fp, cmd);
 		info_rctl(handle, fp, cmd);
 		info_attr(handle, fp, cmd);
+		info_ds(handle, fp, cmd);
 		break;
 	case RT_ZONENAME:
 		info_zonename(handle, fp);
@@ -3239,6 +3392,9 @@ info_func(cmd_t *cmd)
 	case RT_ATTR:
 		info_attr(handle, fp, cmd);
 		break;
+	case RT_DATASET:
+		info_ds(handle, fp, cmd);
+		break;
 	default:
 		zone_perror(rt_to_str(cmd->cmd_res_type), Z_NO_RESOURCE_TYPE,
 		    TRUE);
@@ -3281,6 +3437,7 @@ verify_func(cmd_t *cmd)
 	struct zone_fstab fstab;
 	struct zone_attrtab attrtab;
 	struct zone_rctltab rctltab;
+	struct zone_dstab dstab;
 	char zonepath[MAXPATHLEN];
 	int err, ret_val = Z_OK, arg;
 	bool save = FALSE;
@@ -3391,6 +3548,29 @@ verify_func(cmd_t *cmd)
 	}
 	(void) zonecfg_endattrent(handle);
 
+	if ((err = zonecfg_setdsent(handle)) != Z_OK) {
+		zone_perror(zone, err, TRUE);
+		return;
+	}
+	while (zonecfg_getdsent(handle, &dstab) == Z_OK) {
+		if (strlen(dstab.zone_dataset_name) == 0) {
+			zerr("%s: %s %s", rt_to_str(RT_DATASET),
+			    pt_to_str(PT_NAME), gettext("not specified"));
+			saw_error = TRUE;
+			if (ret_val == Z_OK)
+				ret_val = Z_REQD_PROPERTY_MISSING;
+		} else if (!zfs_name_valid(dstab.zone_dataset_name,
+		    ZFS_TYPE_FILESYSTEM)) {
+			zerr("%s: %s %s", rt_to_str(RT_DATASET),
+			    pt_to_str(PT_NAME), gettext("invalid"));
+			saw_error = TRUE;
+			if (ret_val == Z_OK)
+				ret_val = Z_BAD_PROPERTY;
+		}
+
+	}
+	(void) zonecfg_enddsent(handle);
+
 	if (!global_scope) {
 		zerr(gettext("resource specification incomplete"));
 		saw_error = TRUE;
@@ -3442,10 +3622,12 @@ cancel_func(cmd_t *cmd)
 	zonecfg_free_fs_option_list(in_progress_fstab.zone_fs_options);
 	bzero(&in_progress_fstab, sizeof (in_progress_fstab));
 	bzero(&in_progress_nwiftab, sizeof (in_progress_nwiftab));
+	bzero(&in_progress_ipdtab, sizeof (in_progress_ipdtab));
 	bzero(&in_progress_devtab, sizeof (in_progress_devtab));
 	zonecfg_free_rctl_value_list(in_progress_rctltab.zone_rctl_valptr);
 	bzero(&in_progress_rctltab, sizeof (in_progress_rctltab));
 	bzero(&in_progress_attrtab, sizeof (in_progress_attrtab));
+	bzero(&in_progress_dstab, sizeof (in_progress_dstab));
 }
 
 static int
@@ -3539,6 +3721,7 @@ end_func(cmd_t *cmd)
 	struct zone_devtab tmp_devtab;
 	struct zone_rctltab tmp_rctltab;
 	struct zone_attrtab tmp_attrtab;
+	struct zone_dstab tmp_dstab;
 	int err, arg;
 
 	assert(cmd != NULL);
@@ -3808,6 +3991,37 @@ end_func(cmd_t *cmd)
 			    &in_progress_attrtab);
 		}
 		break;
+	case RT_DATASET:
+		/* First make sure everything was filled in. */
+		if (strlen(in_progress_dstab.zone_dataset_name) == 0) {
+			zerr("%s %s", pt_to_str(PT_NAME),
+			    gettext("not specified"));
+			saw_error = TRUE;
+			validation_failed = TRUE;
+		}
+		if (validation_failed)
+			return;
+		if (end_op == CMD_ADD) {
+			/* Make sure there isn't already one like this. */
+			bzero(&tmp_dstab, sizeof (tmp_dstab));
+			(void) strlcpy(tmp_dstab.zone_dataset_name,
+			    in_progress_dstab.zone_dataset_name,
+			    sizeof (tmp_dstab.zone_dataset_name));
+			err = zonecfg_lookup_ds(handle, &tmp_dstab);
+			if (err == Z_OK) {
+				zerr(gettext("A %s resource "
+				    "with the %s '%s' already exists."),
+				    rt_to_str(RT_DATASET), pt_to_str(PT_NAME),
+				    in_progress_dstab.zone_dataset_name);
+				saw_error = TRUE;
+				return;
+			}
+			err = zonecfg_add_ds(handle, &in_progress_dstab);
+		} else {
+			err = zonecfg_modify_ds(handle, &old_dstab,
+			    &in_progress_dstab);
+		}
+		break;
 	default:
 		zone_perror(rt_to_str(resource_scope), Z_NO_RESOURCE_TYPE,
 		    TRUE);
diff --git a/usr/src/cmd/zonecfg/zonecfg.h b/usr/src/cmd/zonecfg/zonecfg.h
index e0fca7a02d74..2c37de8a19ae 100644
--- a/usr/src/cmd/zonecfg/zonecfg.h
+++ b/usr/src/cmd/zonecfg/zonecfg.h
@@ -79,9 +79,10 @@ typedef int bool;
 #define	RT_DEVICE	8
 #define	RT_RCTL		9
 #define	RT_ATTR		10
+#define	RT_DATASET	11
 
 #define	RT_MIN		RT_UNKNOWN
-#define	RT_MAX		RT_ATTR
+#define	RT_MAX		RT_DATASET
 
 /* property types: increment PT_MAX when expanding this list */
 #define	PT_UNKNOWN	0
diff --git a/usr/src/cmd/zonecfg/zonecfg_grammar.y b/usr/src/cmd/zonecfg/zonecfg_grammar.y
index abca323bed26..4f7f2d6c2305 100644
--- a/usr/src/cmd/zonecfg/zonecfg_grammar.y
+++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y
@@ -61,7 +61,7 @@ extern void yyerror(char *s);
 %token COMMIT REVERT EXIT SEMICOLON TOKEN ZONENAME ZONEPATH AUTOBOOT POOL NET
 %token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL
 %token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
-%token OPEN_PAREN CLOSE_PAREN COMMA
+%token OPEN_PAREN CLOSE_PAREN COMMA DATASET
 
 %type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
     property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val
@@ -668,6 +668,7 @@ resource_type: NET	{ $$ = RT_NET; }
 	| DEVICE	{ $$ = RT_DEVICE; }
 	| RCTL		{ $$ = RT_RCTL; }
 	| ATTR		{ $$ = RT_ATTR; }
+	| DATASET	{ $$ = RT_DATASET; }
 
 property_name: SPECIAL	{ $$ = PT_SPECIAL; }
 	| RAW		{ $$ = PT_RAW; }
diff --git a/usr/src/cmd/zonecfg/zonecfg_lex.l b/usr/src/cmd/zonecfg/zonecfg_lex.l
index 1a5de3659ed4..3c3f1c0da068 100644
--- a/usr/src/cmd/zonecfg/zonecfg_lex.l
+++ b/usr/src/cmd/zonecfg/zonecfg_lex.l
@@ -159,6 +159,8 @@ char *safe_strdup(char *s);
 <TSTATE>zonename	{ return ZONENAME; }
 <CSTATE>zonename	{ return ZONENAME; }
 
+<TSTATE>dataset	{ return DATASET; }
+
 <TSTATE>zonepath	{ return ZONEPATH; }
 <CSTATE>zonepath	{ return ZONEPATH; }
 
diff --git a/usr/src/cmd/zpool/Makefile b/usr/src/cmd/zpool/Makefile
new file mode 100644
index 000000000000..818c7b7fe8a4
--- /dev/null
+++ b/usr/src/cmd/zpool/Makefile
@@ -0,0 +1,80 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG= zpool
+OBJS= zpool_main.o zpool_vdev.o zpool_iter.o zpool_util.o zpool_dataset.o
+SRCS= $(OBJS:%.o=%.c)
+POFILES= zpool_main.po zpool_vdev.po zpool_iter.po zpool_util.po \
+	zpool_dataset.po
+POFILE= zpool.po
+
+include ../Makefile.cmd
+
+LDLIBS += -lzfs -lnvpair -ldevid -lefi -ldiskmgt -luutil -lumem
+
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT
+
+# lint complains that the _umem_* hooks in zpool_main.c are defined but unused
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2
+LINTFLAGS64 += -xerroff=E_NAME_DEF_NOT_USED2
+
+CACHEDIR= $(ROOTETC)/zfs
+
+ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
+
+.KEEP_STATE:
+
+.PARALLEL:
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+	$(LINK.c) -o $@ $(OBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+install: all $(ROOTSBINPROG) $(CACHEDIR) $(ROOTUSRSBINLINKS)
+
+$(CACHEDIR):
+	$(INS.dir)
+
+$(POFILE): $(POFILES)
+	$(RM) $@
+	cat $(POFILES) > $@
+
+clean:
+	$(RM) $(OBJS)
+
+lint:	lint_SRCS
+
+# Links from /usr/sbin to /sbin
+$(ROOTUSRSBINLINKS):
+	-$(RM) $@; $(SYMLINK) ../../sbin/$(@F) $@
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/zpool/zpool_dataset.c b/usr/src/cmd/zpool/zpool_dataset.c
new file mode 100644
index 000000000000..0432f536522c
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_dataset.c
@@ -0,0 +1,148 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <libintl.h>
+#include <libzfs.h>
+#include <sys/mount.h>
+
+#include "zpool_util.h"
+
+/*
+ * For export and destroy, we have to support iterating over all datasets and
+ * unmounting and/or destroying them.  This file contains the routines to
+ * support this.
+ */
+typedef struct cbdata {
+	int	cb_force;	/* nonzero: do_unmount() passes MS_FORCE */
+	int	cb_failed;	/* set to 1 when a mount/unmount call fails */
+	const char *cb_mntopts;	/* option string handed to zfs_mount() */
+} cbdata_t;
+
+/*
+ * Unmount one dataset; failures are recorded in cb_failed, never returned.
+ */
+int
+do_unmount(zfs_handle_t *zfsp, void *data)
+{
+	cbdata_t *state = data;
+	int flags = state->cb_force ? MS_FORCE : 0;
+
+	if (zfs_unmount(zfsp, NULL, flags) != 0)
+		state->cb_failed = 1;
+	return (0);
+}
+
+/*
+ * Unmount all datasets within the given pool.
+ *
+ * XXZFS it would be much more efficient, and correct, to iterate over
+ * mountpoints based on /etc/mnttab.
+ */
+int
+unmount_datasets(zpool_handle_t *zhp, int force)
+{
+	cbdata_t cb = { 0 };
+	zfs_handle_t *root;
+	int ret = -1;
+
+	/* An unavailable pool is skipped and reported as success. */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL)
+		return (0);
+
+	root = zfs_open(zpool_get_name(zhp), ZFS_TYPE_FILESYSTEM);
+	if (root == NULL)
+		return (-1);
+
+	cb.cb_force = force;
+
+	/*
+	 * Dependents are handled before the pool's root dataset; the root is
+	 * only attempted once every dependent has been unmounted cleanly.
+	 */
+	if (zfs_iter_dependents(root, do_unmount, &cb) == 0 &&
+	    cb.cb_failed == 0 &&
+	    do_unmount(root, &cb) == 0 && cb.cb_failed == 0)
+		ret = 0;
+
+	zfs_close(root);
+
+	return (ret);
+}
+
+/*
+ * Mount a single filesystem, then recurse into its children so that a parent
+ * is always attempted ahead of them.  Non-filesystem datasets are skipped.
+ */
+static int
+do_mount(zfs_handle_t *zfsp, void *data)
+{
+	cbdata_t *state = data;
+
+	if (zfs_get_type(zfsp) == ZFS_TYPE_FILESYSTEM) {
+		/* Record a failed mount but still visit the children. */
+		if (zfs_mount(zfsp, state->cb_mntopts, 0) != 0)
+			state->cb_failed = 1;
+
+		return (zfs_iter_children(zfsp, do_mount, data));
+	}
+
+	return (0);
+}
+
+
+/*
+ * Mount every filesystem in the pool, walking down from the pool's root
+ * dataset so that parents are mounted before their children.  A complete
+ * fix would gather all mountpoints, sort them, and mount them in lexical order.
+ * There are many more problems if you start to have nested filesystems - we
+ * just want to get inherited filesystems right.
+ */
+int
+mount_datasets(zpool_handle_t *zhp, const char *options)
+{
+	cbdata_t cb = { 0 };
+	zfs_handle_t *root;
+	int ret;
+
+	/* Nothing is mounted for an unavailable pool; that is not an error. */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL)
+		return (0);
+
+	root = zfs_open(zpool_get_name(zhp), ZFS_TYPE_FILESYSTEM);
+	if (root == NULL)
+		return (-1);
+
+	cb.cb_mntopts = options;
+
+	/* A callback error or any recorded mount failure fails the call. */
+	ret = (do_mount(root, &cb) != 0 || cb.cb_failed != 0) ? -1 : 0;
+
+	zfs_close(root);
+
+	return (ret);
+}
diff --git a/usr/src/cmd/zpool/zpool_iter.c b/usr/src/cmd/zpool/zpool_iter.c
new file mode 100644
index 000000000000..f99396da817d
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_iter.c
@@ -0,0 +1,241 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+/*
+ * Private interface for iterating over pools specified on the command line.
+ * Most consumers will call for_each_pool, but in order to support iostat, we
+ * allow fine-grained control through the zpool_list_t interface.
+ */
+
+typedef struct zpool_node {
+	zpool_handle_t	*zn_handle;	/* open pool; closed on removal */
+	uu_avl_node_t	zn_avlnode;	/* linkage into zl_avl */
+	int		zn_mark;	/* unused within zpool_iter.c */
+} zpool_node_t;
+
+struct zpool_list {
+	int		zl_findall;	/* TRUE if no pools were named */
+	uu_avl_t	*zl_avl;	/* nodes ordered by pool name */
+	uu_avl_pool_t	*zl_pool;	/* uu_avl pool backing zl_avl */
+};
+
+/* ARGSUSED */
+static int
+zpool_compare(const void *larg, const void *rarg, void *unused)
+{
+	const zpool_node_t *lnode = larg;
+	const zpool_node_t *rnode = rarg;
+
+	/* AVL ordering callback: nodes sort by the name of their pool. */
+	return (strcmp(zpool_get_name(lnode->zn_handle),
+	    zpool_get_name(rnode->zn_handle)));
+}
+
+/*
+ * Pool iteration callback, also called directly by pool_list_get().  Takes
+ * over 'zhp': it is either linked into the tree or closed as a duplicate of
+ * a pool that is already present.  Always returns 0.
+ */
+static int
+add_pool(zpool_handle_t *zhp, void *data)
+{
+	zpool_list_t *list = data;
+	zpool_node_t *entry = safe_malloc(sizeof (*entry));
+	uu_avl_index_t where;
+
+	entry->zn_handle = zhp;
+	uu_avl_node_init(entry, &entry->zn_avlnode, list->zl_pool);
+	if (uu_avl_find(list->zl_avl, entry, NULL, &where) != NULL) {
+		zpool_close(zhp);
+		free(entry);
+		return (0);
+	}
+	uu_avl_insert(list->zl_avl, entry, where);
+	return (0);
+}
+
+/*
+ * Create a list of pools based on the given arguments.  With no arguments,
+ * every pool in the system is added to the AVL tree; otherwise only the pools
+ * named on the command line are.  '*err' is set to TRUE when a named pool
+ * cannot be opened and is otherwise left as the caller initialized it.
+ */
+zpool_list_t *
+pool_list_get(int argc, char **argv, int *err)
+{
+	zpool_list_t *zlp;
+
+	zlp = safe_malloc(sizeof (zpool_list_t));
+
+	zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t),
+	    offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT);
+
+	if (zlp->zl_pool == NULL)
+		no_memory();
+
+	if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		no_memory();
+
+	/* Set on both paths rather than relying on safe_malloc() to zero it. */
+	zlp->zl_findall = (argc == 0);
+	if (zlp->zl_findall) {
+		(void) zpool_iter(add_pool, zlp);
+	} else {
+		int i;
+
+		for (i = 0; i < argc; i++) {
+			zpool_handle_t *zhp;
+
+			if ((zhp = zpool_open_canfail(argv[i])) != NULL)
+				(void) add_pool(zhp, zlp);
+			else
+				*err = TRUE;
+		}
+	}
+	return (zlp);
+}
+
+/*
+ * Rescan the system for pools, but only if the list was built without any
+ * pools being named; an explicitly specified list stays fixed.
+ */
+void
+pool_list_update(zpool_list_t *zlp)
+{
+	if (!zlp->zl_findall)
+		return;
+	(void) zpool_iter(add_pool, zlp);
+}
+
+/*
+ * Run 'func' on each listed pool and OR together whatever it returns.
+ */
+int
+pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
+    void *data)
+{
+	zpool_node_t *cur = uu_avl_first(zlp->zl_avl);
+	int status = 0;
+
+	while (cur != NULL) {
+		zpool_node_t *following = uu_avl_next(zlp->zl_avl, cur);
+		/* Unavailable pools are visited only when 'unavail' is set. */
+		if (zpool_get_state(cur->zn_handle) != POOL_STATE_UNAVAIL ||
+		    unavail)
+			status |= func(cur->zn_handle, data);
+		cur = following;
+	}
+	return (status);
+}
+
+/*
+ * Remove and close the list entry whose pool name matches 'zhp', if any.
+ */
+void
+pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
+{
+	zpool_node_t key, *found;
+
+	key.zn_handle = zhp;
+	found = uu_avl_find(zlp->zl_avl, &key, NULL, NULL);
+	if (found == NULL)
+		return;
+	uu_avl_remove(zlp->zl_avl, found);
+	zpool_close(found->zn_handle);
+	free(found);
+}
+
+/*
+ * Close every pool handle held by the list, then free the list itself.
+ */
+void
+pool_list_free(zpool_list_t *zlp)
+{
+	uu_avl_walk_t *walk;
+	zpool_node_t *node;
+
+	if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(zlp->zl_avl, node);
+		zpool_close(node->zn_handle);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(zlp->zl_avl);
+	uu_avl_pool_destroy(zlp->zl_pool);
+
+	free(zlp);
+}
+
+/* Report how many pools are currently held in the list. */
+int
+pool_list_count(zpool_list_t *zlp)
+{
+	uu_avl_t *tree = zlp->zl_avl;
+
+	return (uu_avl_numnodes(tree));
+}
+
+/*
+ * Build the pool list for the command-line arguments, run 'func' over it, and
+ * free it.  Nonzero is returned if opening a pool or any callback failed.
+ */
+int
+for_each_pool(int argc, char **argv, int unavail, zpool_iter_f func,
+    void *data)
+{
+	zpool_list_t *pools;
+	int failed = 0;
+
+	pools = pool_list_get(argc, argv, &failed);
+	if (pools == NULL)
+		return (1);
+
+	if (pool_list_iter(pools, unavail, func, data) != 0)
+		failed = 1;
+	pool_list_free(pools);
+
+	return (failed);
+}
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
new file mode 100644
index 000000000000..0a2f2d6cd385
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -0,0 +1,2471 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <priv.h>
+
+#include <sys/stat.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+static int zpool_do_create(int, char **);
+static int zpool_do_destroy(int, char **);
+
+static int zpool_do_add(int, char **);
+
+static int zpool_do_list(int, char **);
+static int zpool_do_iostat(int, char **);
+static int zpool_do_status(int, char **);
+
+static int zpool_do_online(int, char **);
+static int zpool_do_offline(int, char **);
+
+static int zpool_do_attach(int, char **);
+static int zpool_do_detach(int, char **);
+static int zpool_do_replace(int, char **);
+
+static int zpool_do_scrub(int, char **);
+
+static int zpool_do_import(int, char **);
+static int zpool_do_export(int, char **);
+
+/*
+ * These libumem hooks supply defaults for the allocator's debugging
+ * facilities, standing in for the $UMEM_DEBUG and $UMEM_LOGGING settings.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+typedef struct zpool_command {
+	const char	*name;	/* subcommand name; NULL = blank usage line */
+	int		(*func)(int, char **);	/* handler for the command */
+	const char	*usage;	/* usage text printed by usage() */
+} zpool_command_t;
+
+/*
+ * Master command table.  Each zpool command has a name, associated function,
+ * and usage message.  These commands are organized according to how they are
+ * displayed in the usage message.  An empty command (one with a NULL name)
+ * indicates an empty line in the generic usage message.
+ */
+static zpool_command_t command_table[] = {
+	{ "create",	zpool_do_create,
+	    "\tcreate  [-fn] [-R root] [-m mountpoint] <pool> <vdev> ...\n" },
+	{ "destroy",	zpool_do_destroy,
+	    "\tdestroy [-f] <pool>\n"					},
+
+
+	{ NULL },
+
+	{ "add",	zpool_do_add,
+	    "\tadd [-fn] <pool> <vdev> ...\n"				},
+
+	{ NULL },
+
+	{ "list",	zpool_do_list,
+	    "\tlist [-H] [-o field[,field]*] [pool] ...\n"		},
+	{ "iostat",	zpool_do_iostat,
+	    "\tiostat [-v] [pool] ... [interval [count]]\n"		},
+	{ "status",	zpool_do_status,
+	    "\tstatus [-vx] [pool] ...\n"				},
+
+	{ NULL },
+
+	{ "online",	zpool_do_online,
+	    "\tonline <pool> <device>\n"				},
+	{ "offline",	zpool_do_offline,
+	    "\toffline <pool> <device>\n"				},
+
+	{ NULL },
+
+	{ "attach",	zpool_do_attach,
+	    "\tattach [-f] <pool> <device> <new_device>\n"		},
+	{ "detach",	zpool_do_detach,
+	    "\tdetach <pool> <device>\n"				},
+	{ "replace",	zpool_do_replace,
+	    "\treplace [-f] <pool> <device> [new_device]\n"		},
+
+	{ NULL },
+
+	{ "scrub",	zpool_do_scrub,
+	    "\tscrub [-s] <pool> ...\n"					},
+
+	{ NULL },
+
+	{ "import",	zpool_do_import,
+	    "\timport [-d dir]\n"
+	    "\timport [-d dir] [-f] [-o opts] [-R root] -a\n"
+	    "\timport [-d dir] [-f] [-o opts] [-R root] <pool | id> "
+	    "[newpool]\n"						},
+	{ "export",	zpool_do_export,
+	    "\texport [-f] <pool> ...\n"				},
+
+	{ NULL }
+};
+
+#define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
+
+zpool_command_t *current_command;
+
+/*
+ * Fields available for 'zpool list'.
+ */
+typedef enum {	/* same order as column_table[] and column_subopts[] */
+	ZPOOL_FIELD_NAME,
+	ZPOOL_FIELD_SIZE,
+	ZPOOL_FIELD_USED,
+	ZPOOL_FIELD_AVAILABLE,
+	ZPOOL_FIELD_CAPACITY,
+	ZPOOL_FIELD_HEALTH,
+	ZPOOL_FIELD_ROOT
+} zpool_field_t;
+
+#define	MAX_FIELDS	10
+
+typedef struct column_def {
+	const char	*cd_title;	/* column heading */
+	size_t		cd_width;	/* width; presumably in characters */
+	enum {
+		left_justify,
+		right_justify
+	}		cd_justify;	/* alignment within the column */
+} column_def_t;
+
+static column_def_t column_table[] = {	/* title, width, justification */
+	{ "NAME",	20,	left_justify	},
+	{ "SIZE",	6,	right_justify	},
+	{ "USED",	6,	right_justify	},
+	{ "AVAIL",	6,	right_justify	},
+	{ "CAP",	5,	right_justify	},
+	{ "HEALTH",	9,	left_justify	},
+	{ "ALTROOT",	15,	left_justify	}
+};
+
+static char *column_subopts[] = {	/* field names listed by usage() */
+	"name",
+	"size",
+	"used",
+	"available",
+	"capacity",
+	"health",
+	"root",
+	NULL	/* terminator: usage() stops at the first NULL entry */
+};
+
+/*
+ * Display usage message.  If we're inside a command, display only the usage for
+ * that command; otherwise display the usage of every command in the table.
+ * A requested usage goes to stdout and exits 0; an unrequested one goes to
+ * stderr and exits 2.  This function does not return.
+ */
+void
+usage(int requested)
+{
+	int i;
+	FILE *fp = requested ? stdout : stderr;
+
+	if (current_command == NULL) {
+		(void) fprintf(fp, gettext("usage: zpool command args ...\n"));
+		(void) fprintf(fp,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		for (i = 0; i < NCOMMAND; i++) {
+			if (command_table[i].name == NULL)
+				(void) fprintf(fp, "\n");
+			else
+				(void) fprintf(fp, "%s",
+				    command_table[i].usage);
+		}
+	} else {
+		(void) fprintf(fp, gettext("usage:\n"));
+		/* The usage text is data, so never use it as the format. */
+		(void) fprintf(fp, "%s", current_command->usage);
+
+		if (strcmp(current_command->name, "list") == 0) {
+			(void) fprintf(fp, gettext("\nwhere 'field' is one "
+			    "of the following:\n\n"));
+
+			for (i = 0; column_subopts[i] != NULL; i++)
+				(void) fprintf(fp, "\t%s\n", column_subopts[i]);
+		}
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+/* Map a vdev state to the localized name displayed for it. */
+const char *
+state_to_name(int state)
+{
+	/* Closed and unopenable vdevs are both reported as faulted. */
+	if (state == VDEV_STATE_CLOSED || state == VDEV_STATE_CANT_OPEN)
+		return (gettext("FAULTED"));
+	if (state == VDEV_STATE_OFFLINE)
+		return (gettext("OFFLINE"));
+	if (state == VDEV_STATE_DEGRADED)
+		return (gettext("DEGRADED"));
+	if (state == VDEV_STATE_HEALTHY)
+		return (gettext("ONLINE"));
+
+	/* Any state not listed above. */
+	return (gettext("UNKNOWN"));
+}
+
+/* Print 'name' if given, then each child of 'nv' two columns deeper. */
+void
+print_vdev_tree(const char *name, nvlist_t *nv, int indent)
+{
+	nvlist_t **kids;
+	uint_t i, nkids;
+
+	if (name != NULL)
+		(void) printf("\t%*s%s\n", indent, "", name);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0)
+		for (i = 0; i < nkids; i++)
+			print_vdev_tree(vdev_get_name(kids[i]), kids[i],
+			    indent + 2);
+}
+
+/*
+ * zpool add [-fn] <pool> <vdev> ...
+ *
+ *	-f	Force addition of devices, even if they appear in use
+ *	-n	Do not add the devices, but display the resulting layout if
+ *		they were to be added.
+ *
+ * Adds the given vdevs to 'pool'.  As with create, the bulk of this work is
+ * handled by make_root_vdev(), which constructs the nvlist needed to pass to
+ * libzfs.  Returns 0 on success and 1 on failure.
+ */
+int
+zpool_do_add(int argc, char **argv)
+{
+	int force = FALSE;
+	int dryrun = FALSE;
+	int c;
+	nvlist_t *nvroot;
+	char *poolname;
+	int ret;
+	zpool_handle_t *zhp;
+	nvlist_t *config;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "fn")) != -1) {
+		switch (c) {
+		case 'f':
+			force = TRUE;
+			break;
+		case 'n':
+			dryrun = TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing vdev specification\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+
+	argc--;
+	argv++;
+
+	if ((zhp = zpool_open(poolname)) == NULL)
+		return (1);
+
+	if ((config = zpool_get_config(zhp)) == NULL) {
+		(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+		    poolname);
+		zpool_close(zhp);
+		return (1);
+	}
+
+	/* pass off to make_root_vdev for processing */
+	nvroot = make_root_vdev(config, force, !force, argc, argv);
+	if (nvroot == NULL) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	if (dryrun) {
+		nvlist_t *poolnvroot;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &poolnvroot) == 0);
+
+		(void) printf(gettext("would update '%s' to the following "
+		    "configuration:\n"), zpool_get_name(zhp));
+		print_vdev_tree(poolname, poolnvroot, 0);
+		print_vdev_tree(NULL, nvroot, 0);
+		ret = 0;
+	} else {
+		ret = (zpool_add(zhp, nvroot) != 0);
+	}
+
+	/* Release the vdev spec and the pool handle on the normal path too. */
+	nvlist_free(nvroot);
+	zpool_close(zhp);
+	return (ret);
+}
+
+/*
+ * zpool create [-fn] [-R root] [-m mountpoint] <pool> <dev> ...
+ *
+ *	-f	Force creation, even if devices appear in use
+ *	-n	Do not create the pool, but display the resulting layout if it
+ *		were to be created.
+ *	-R	Create a pool under an alternate root
+ *	-m	Set default mountpoint for the root dataset.  By default it's
+ *		'/<pool>'
+ *
+ * Creates the named pool according to the given vdev specification.  The
+ * bulk of the vdev processing is done by make_root_vdev().  Once we get the
+ * nvlist back from it, we either print out the contents (if '-n' was
+ * specified), or pass it to libzfs to do the creation.
+ */
+int
+zpool_do_create(int argc, char **argv)
+{
+	int force = FALSE;
+	int dryrun = FALSE;
+	int c;
+	nvlist_t *nvroot;
+	char *poolname;
+	int ret;
+	char *altroot = NULL;
+	char *mountpoint = NULL;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":fnR:m:")) != -1) {
+		switch (c) {
+		case 'f':
+			force = TRUE;
+			break;
+		case 'n':
+			dryrun = TRUE;
+			break;
+		case 'R':
+			altroot = optarg;
+			break;
+		case 'm':
+			mountpoint = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing vdev specification\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+
+	/*
+	 * As a special case, check for use of '/' in the name, and direct the
+	 * user to use 'zfs create' instead.
+	 */
+	if (strchr(poolname, '/') != NULL) {
+		(void) fprintf(stderr, gettext("cannot create '%s': invalid "
+		    "character '/' in pool name\n"), poolname);
+		(void) fprintf(stderr, gettext("use 'zfs create' to "
+		    "create a dataset\n"));
+		return (1);
+	}
+
+	/* pass off to make_root_vdev for bulk processing */
+	nvroot = make_root_vdev(NULL, force, !force, argc - 1, argv + 1);
+	if (nvroot == NULL)
+		return (1);
+
+	if (altroot != NULL && altroot[0] != '/') {
+		(void) fprintf(stderr, gettext("invalid alternate root '%s': "
+		    "must be an absolute path\n"), altroot);
+		nvlist_free(nvroot);
+		return (1);
+	}
+
+	/*
+	 * Check the validity of the mountpoint and direct the user to use the
+	 * '-m' mountpoint option if it looks like it's in use.
+	 */
+	if (mountpoint == NULL ||
+	    (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) {
+		char buf[MAXPATHLEN];
+		struct stat64 statbuf;
+
+		if (mountpoint && mountpoint[0] != '/') {
+			(void) fprintf(stderr, gettext("invalid mountpoint "
+			    "'%s': must be an absolute path, 'legacy', or "
+			    "'none'\n"), mountpoint);
+			nvlist_free(nvroot);
+			return (1);
+		}
+
+		if (mountpoint == NULL) {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s/%s",
+				    altroot, poolname);
+			else
+				(void) snprintf(buf, sizeof (buf), "/%s",
+				    poolname);
+		} else {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s%s",
+				    altroot, mountpoint);
+			else
+				(void) snprintf(buf, sizeof (buf), "%s",
+				    mountpoint);
+		}
+
+		if (stat64(buf, &statbuf) == 0 &&
+		    statbuf.st_nlink != 2) {
+			if (mountpoint == NULL)
+				(void) fprintf(stderr, gettext("default "
+				    "mountpoint '%s' exists and is not "
+				    "empty\n"), buf);
+			else
+				(void) fprintf(stderr, gettext("mountpoint "
+				    "'%s' exists and is not empty\n"), buf);
+			(void) fprintf(stderr, gettext("use '-m' "
+			    "option to provide a different default\n"));
+			nvlist_free(nvroot);
+			return (1);
+		}
+	}
+
+	if (dryrun) {
+		/*
+		 * For a dry run invocation, print out a basic message and run
+		 * through all the vdevs in the list and print out in an
+		 * appropriate hierarchy.
+		 *
+		 * XXZFS find out if we can create the pool?
+		 */
+		(void) printf(gettext("would create '%s' with the "
+		    "following layout:\n\n"), poolname);
+
+		print_vdev_tree(poolname, nvroot, 0);
+		ret = 0;
+	} else {
+		ret = 1;
+		/*
+		 * Hand off to libzfs.
+		 */
+		if (zpool_create(poolname, nvroot, altroot) == 0) {
+			zfs_handle_t *pool = zfs_open(poolname,
+			    ZFS_TYPE_FILESYSTEM);
+			if (pool != NULL) {
+				if (mountpoint != NULL)
+					verify(zfs_prop_set(pool,
+					    ZFS_PROP_MOUNTPOINT,
+					    mountpoint) == 0);
+				if (zfs_mount(pool, NULL, 0) == 0)
+					ret = zfs_share(pool);
+				zfs_close(pool);
+			}
+		}
+	}
+
+	nvlist_free(nvroot);
+
+	return (ret);
+}
+
+/*
+ * zpool destroy [-f] <pool>
+ *
+ *	-f	Forcefully unmount any datasets
+ *
+ * Destroy the given pool.  Automatically unmounts any datasets in the pool.
+ */
+int
+zpool_do_destroy(int argc, char **argv)
+{
+	int force = FALSE;
+	int c;
+	char *pool;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	pool = argv[0];
+
+	if ((zhp = zpool_open_canfail(pool)) == NULL) {
+		/*
+		 * As a special case, check for use of '/' in the name, and
+		 * direct the user to use 'zfs destroy' instead.
+		 */
+		if (strchr(pool, '/') != NULL)
+			(void) fprintf(stderr, gettext("use 'zfs destroy' to "
+			    "destroy a dataset\n"));
+		return (1);
+	}
+
+	if (unmount_datasets(zhp, force) != 0) {
+		(void) fprintf(stderr, gettext("could not destroy '%s': "
+		    "could not unmount datasets\n"), zpool_get_name(zhp));
+		zpool_close(zhp);
+		return (1);
+	}
+
+	ret = (zpool_destroy(zhp) != 0);
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool export [-f] <pool> ...
+ *
+ *	-f	Forcefully unmount datasets
+ *
+ * Export the given pools.  By default, the command will attempt to cleanly
+ * unmount any active datasets within the pool.  If the '-f' flag is specified,
+ * then the datasets will be forcefully unmounted.
+ *
+ * Every pool named is attempted even if an earlier one fails; the return
+ * value is 1 if any pool could not be opened, unmounted or exported.
+ */
+int
+zpool_do_export(int argc, char **argv)
+{
+	int force = FALSE;
+	int ret = 0;
+	int c;
+	int i;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		if (c == 'f') {
+			force = TRUE;
+		} else if (c == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(FALSE);
+	}
+
+	for (i = 0; i < argc; i++) {
+		zpool_handle_t *zhp = zpool_open_canfail(argv[i]);
+
+		if (zhp == NULL) {
+			ret = 1;
+			continue;
+		}
+
+		/*
+		 * The export is attempted only if every dataset could be
+		 * unmounted; either failure marks the command as failed.
+		 */
+		if (unmount_datasets(zhp, force) != 0 ||
+		    zpool_export(zhp) != 0)
+			ret = 1;
+
+		zpool_close(zhp);
+	}
+
+	return (ret);
+}
+
+/*
+ * Return the larger of 'max' and the widest "depth + name length" found in
+ * the vdev tree rooted at 'nv'; each level of children adds two to 'depth'.
+ */
+static int
+max_width(nvlist_t *nv, int depth, int max)
+{
+	size_t width = strlen(vdev_get_name(nv)) + depth;
+	nvlist_t **kids;
+	uint_t i, nkids;
+
+	if (width > max)
+		max = width;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return (max);
+
+	for (i = 0; i < nkids; i++) {
+		int sub = max_width(kids[i], depth + 2, max);
+		if (sub > max)
+			max = sub;
+	}
+	return (max);
+}
+
+
+/*
+ * Print one line for the vdev 'nv' -- its name padded to 'namewidth' and its
+ * state -- then recurse into its children.  Missing vdevs print nothing.
+ */
+void
+print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
+{
+	nvlist_t **kids;
+	uint_t i, nkids, nstats;
+	vdev_stat_t *stats;
+	const char *reason;
+	char *type;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+	if (strcmp(type, VDEV_TYPE_MISSING) == 0)
+		return;
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+
+	(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
+
+	if (stats->vs_aux == 0) {
+		(void) printf("  %s", state_to_name(stats->vs_state));
+	} else {
+		/* Pad the state so that the reason starts in a fixed column. */
+		(void) printf("  %-8s  ", state_to_name(stats->vs_state));
+
+		switch (stats->vs_aux) {
+		case VDEV_AUX_OPEN_FAILED:
+			reason = gettext("cannot open");
+			break;
+		case VDEV_AUX_BAD_GUID_SUM:
+			reason = gettext("missing device");
+			break;
+		case VDEV_AUX_NO_REPLICAS:
+			reason = gettext("insufficient replicas");
+			break;
+		default:
+			reason = gettext("corrupted data");
+			break;
+		}
+		(void) printf(reason);
+	}
+	(void) printf("\n");
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return;
+
+	for (i = 0; i < nkids; i++)
+		print_import_config(vdev_get_name(kids[i]), kids[i],
+		    namewidth, depth + 2);
+}
+
+/*
+ * Display the name, id, state, status, suggested action and vdev layout of
+ * an importable pool described by 'config'.
+ */
+static void
+show_import(nvlist_t *config)
+{
+	uint64_t pool_state;
+	vdev_stat_t *vs;
+	char *name;
+	uint64_t guid;
+	char *msgid;
+	nvlist_t *nvroot;
+	int reason;
+	char *health;
+	uint_t vsc;
+	int namewidth;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &guid) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &pool_state) == 0);
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_HEALTH,
+	    &health) == 0);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
+	reason = zpool_import_status(config, &msgid);
+
+	(void) printf("  pool: %s\n", name);
+	(void) printf("    id: %llu\n", (unsigned long long)guid);
+	(void) printf(" state: %s\n", health);
+
+	switch (reason) {
+	case ZPOOL_STATUS_MISSING_DEV_R:
+	case ZPOOL_STATUS_MISSING_DEV_NR:
+	case ZPOOL_STATUS_BAD_GUID_SUM:
+		(void) printf(gettext("status: One or more devices are missing "
+		    "from the system.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_R:
+	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+		(void) printf(gettext("status: One or more devices contains "
+		    "corrupted data.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_DATA:
+		(void) printf(gettext("status: The pool data is corrupted.\n"));
+		break;
+
+	default:
+		/*
+		 * No other status can be seen when importing pools.
+		 */
+		assert(reason == ZPOOL_STATUS_OK);
+	}
+
+	/*
+	 * Print out an action according to the overall state of the pool.
+	 */
+	if (strcmp(health, gettext("ONLINE")) == 0) {
+		(void) printf(gettext("action: The pool can be imported"
+		    " using its name or numeric identifier."));
+		if (pool_state != POOL_STATE_EXPORTED)
+			(void) printf(gettext("  The\n\tpool may be active on "
+			    "another system, but can be imported using\n\t"
+			    "the '-f' flag.\n"));
+		else
+			(void) printf("\n");
+	} else if (strcmp(health, gettext("DEGRADED")) == 0) {
+		(void) printf(gettext("action: The pool can be imported "
+		    "despite missing or damaged devices.  The\n\tfault "
+		    "tolerance of the pool may be compromised if imported."));
+		if (pool_state != POOL_STATE_EXPORTED)
+			(void) printf(gettext("  The\n\tpool may be active on "
+			    "another system, but can be imported using\n\t"
+			    "the '-f' flag.\n"));
+		else
+			(void) printf("\n");
+	} else {
+		if (reason == ZPOOL_STATUS_MISSING_DEV_R ||
+		    reason == ZPOOL_STATUS_MISSING_DEV_NR ||
+		    reason == ZPOOL_STATUS_BAD_GUID_SUM)
+			(void) printf(gettext("action: The pool cannot be "
+			    "imported. Attach the missing\n\tdevices and try "
+			    "again.\n"));
+		else
+			(void) printf(gettext("action: The pool cannot be "
+			    "imported due to damaged devices or data.\n"));
+	}
+
+	if (msgid != NULL)
+		(void) printf(gettext("   see: http://www.sun.com/msg/%s\n"),
+		    msgid);
+
+	(void) printf(gettext("config:\n\n"));
+
+	namewidth = max_width(nvroot, 0, 0);
+	if (namewidth < 10)
+		namewidth = 10;
+	print_import_config(name, nvroot, namewidth, 0);
+
+	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
+		(void) printf(gettext("\n\tAdditional devices are known to "
+		    "be part of this pool, though their\n\texact "
+		    "configuration cannot be determined.\n"));
+	}
+}
+
+/*
+ * Import the pool described by 'config' and mount its datasets.
+ *
+ * The import itself is delegated to zpool_import(); afterwards the pool is
+ * opened (under 'newname' when one was given) and handed to mount_datasets().
+ * A pool that is not in the exported state is refused unless 'force' is set.
+ * Returns 0 on success and 1 on any failure.
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+    const char *altroot, int force)
+{
+	zpool_handle_t *zhp;
+	char *poolname;
+	char *openname;
+	uint64_t pool_state;
+	int ret;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &poolname) == 0);
+
+	verify(nvlist_lookup_uint64(config,
+	    ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0);
+
+	if (!force && pool_state != POOL_STATE_EXPORTED) {
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "may be in use from other system\n"), poolname);
+		(void) fprintf(stderr, gettext("use '-f' to import anyway\n"));
+		return (1);
+	}
+
+	if (zpool_import(config, newname, altroot) != 0)
+		return (1);
+
+	/* After a renaming import the pool is known by its new name. */
+	openname = (newname != NULL) ? (char *)newname : poolname;
+
+	zhp = zpool_open(openname);
+	verify(zhp != NULL);
+
+	ret = (mount_datasets(zhp, mntopts) != 0) ? 1 : 0;
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+/*
+ * zpool import [-d dir]
+ *       import [-R root] [-d dir] [-f] -a
+ *       import [-R root] [-d dir] [-f] <pool | id> [newpool]
+ *
+ *       -d	Scan in a specific directory, other than /dev/dsk.  More than
+ *		one directory can be specified using multiple '-d' options.
+ *
+ *       -R	Temporarily import the pool, with all mountpoints relative to
+ *		the given root.  The pool will remain exported when the machine
+ *		is rebooted.
+ *
+ *       -f	Force import, even if it appears that the pool is active.
+ *
+ *       -a	Import all pools found.
+ *
+ *       -o	Mount options, passed through to the dataset mount step.
+ *
+ * The import command scans for pools to import, and imports pools based on
+ * pool name and GUID.  The pool can also be renamed as part of the import
+ * process.
+ *
+ * Returns 0 on success and 1 if any step failed.
+ */
+int
+zpool_do_import(int argc, char **argv)
+{
+	char **searchdirs = NULL;
+	int nsearch = 0;
+	int c;
+	int err;
+	nvlist_t *pools;
+	int do_all = FALSE;
+	char *altroot = NULL;
+	char *mntopts = NULL;
+	int do_force = FALSE;
+	nvpair_t *elem;
+	nvlist_t *config;
+	uint64_t searchguid = 0;
+	char *searchname = NULL;
+	nvlist_t *found_config = NULL;
+	int first;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":fd:R:ao:")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = TRUE;
+			break;
+		case 'd':
+			/* Grow the directory array by one slot per '-d'. */
+			if (searchdirs == NULL) {
+				searchdirs = safe_malloc(sizeof (char *));
+			} else {
+				char **tmp = safe_malloc((nsearch + 1) *
+				    sizeof (char *));
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				free(searchdirs);
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = optarg;
+			break;
+		case 'f':
+			do_force = TRUE;
+			break;
+		case 'o':
+			mntopts = optarg;
+			break;
+		case 'R':
+			altroot = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* With no '-d' at all, scan the default device directory. */
+	if (searchdirs == NULL) {
+		searchdirs = safe_malloc(sizeof (char *));
+		searchdirs[0] = "/dev/dsk";
+		nsearch = 1;
+	}
+
+	/* check argument count */
+	if (do_all) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(FALSE);
+		}
+	} else {
+		if (argc > 2) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(FALSE);
+		}
+
+		/*
+		 * Check for the SYS_CONFIG privilege.  We do this explicitly
+		 * here because otherwise any attempt to discover pools will
+		 * silently fail.
+		 */
+		if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "discover pools: permission denied\n"));
+			free(searchdirs);
+			return (1);
+		}
+	}
+
+	if ((pools = zpool_find_import(nsearch, searchdirs)) == NULL) {
+		free(searchdirs);
+		return (1);
+	}
+
+	/*
+	 * We now have a list of all available pools in the given directories.
+	 * Depending on the arguments given, we do one of the following:
+	 *
+	 *	<none>	Iterate through all pools and display information about
+	 *		each one.
+	 *
+	 *	-a	Iterate through all pools and try to import each one.
+	 *
+	 *	<id>	Find the pool that corresponds to the given GUID/pool
+	 *		name and import that one.
+	 */
+	if (argc != 0) {
+		char *endptr;
+
+		/*
+		 * The argument is a GUID only if it is a non-empty, fully
+		 * numeric, in-range string; anything else is a pool name.
+		 */
+		errno = 0;
+		searchguid = strtoull(argv[0], &endptr, 10);
+		if (errno != 0 || endptr == argv[0] || *endptr != '\0')
+			searchname = argv[0];
+	}
+
+	err = 0;
+	elem = NULL;
+	first = TRUE;
+	while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+
+		verify(nvpair_value_nvlist(elem, &config) == 0);
+
+		if (argc == 0) {
+			/* Separate consecutive pools with a blank line. */
+			if (first)
+				first = FALSE;
+			else
+				(void) printf("\n");
+
+			if (do_all)
+				err |= do_import(config, NULL, mntopts,
+				    altroot, do_force);
+			else
+				show_import(config);
+		} else if (searchname != NULL) {
+			char *name;
+
+			/*
+			 * We are searching for a pool based on name.
+			 */
+			verify(nvlist_lookup_string(config,
+			    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+
+			if (strcmp(name, searchname) == 0) {
+				if (found_config != NULL) {
+					(void) fprintf(stderr, gettext(
+					    "cannot import '%s': more than "
+					    "one matching pool\n"), searchname);
+					(void) fprintf(stderr, gettext(
+					    "import by numeric ID instead\n"));
+					err = TRUE;
+				}
+				found_config = config;
+			}
+		} else {
+			uint64_t guid;
+
+			/*
+			 * Search for a pool by guid.
+			 */
+			verify(nvlist_lookup_uint64(config,
+			    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+
+			if (guid == searchguid)
+				found_config = config;
+		}
+	}
+
+	/*
+	 * If we were searching for a specific pool, verify that we found a
+	 * pool, and then do the import.
+	 */
+	if (argc != 0 && err == 0) {
+		if (found_config == NULL) {
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "no such pool available\n"), argv[0]);
+			err = TRUE;
+		} else {
+			err |= do_import(found_config, argc == 1 ? NULL :
+			    argv[1], mntopts, altroot, do_force);
+		}
+	}
+
+	/*
+	 * If we were just looking for pools, report an error if none were
+	 * found.
+	 */
+	if (argc == 0 && first)
+		(void) fprintf(stderr,
+		    gettext("no pools available to import\n"));
+
+	nvlist_free(pools);
+	/* Only the array is ours; the strings are optarg or a literal. */
+	free(searchdirs);
+
+	return (err ? 1 : 0);
+}
+
+/*
+ * State shared between zpool_do_iostat() and its per-pool callbacks.
+ */
+typedef struct iostat_cbdata {
+	zpool_list_t *cb_list;	/* pools being reported on */
+	int cb_verbose;		/* nonzero: also print child vdev rows */
+	int cb_iteration;	/* pass number; 1 on the first pass */
+	int cb_namewidth;	/* width of the name column */
+} iostat_cbdata_t;
+
+/*
+ * Print the dashed rule used under the iostat header and between groups of
+ * pools: cb_namewidth dashes followed by one dashed cell per statistic.
+ */
+static void
+print_iostat_separator(iostat_cbdata_t *cb)
+{
+	int col;
+
+	for (col = cb->cb_namewidth; col > 0; col--)
+		(void) printf("-");
+
+	(void) printf("  -----  -----  -----  -----  -----  -----\n");
+}
+
+/*
+ * Print the two-line iostat column header followed by a separator rule.  The
+ * name column is padded to cb_namewidth.
+ */
+static void
+print_iostat_header(iostat_cbdata_t *cb)
+{
+	int width = cb->cb_namewidth;
+
+	(void) printf("%*s     capacity     operations    bandwidth\n",
+	    width, "");
+	(void) printf("%-*s   used  avail   read  write   read  write\n",
+	    width, "pool");
+
+	print_iostat_separator(cb);
+}
+
+/*
+ * Display a single statistic: 'value' is rendered by zfs_nicenum() and
+ * printed right-justified in a five-character cell preceded by two spaces.
+ */
+void
+print_one_stat(uint64_t value)
+{
+	char text[64];
+
+	zfs_nicenum(value, text, sizeof (text));
+
+	(void) printf("  %5s", text);
+}
+
+/*
+ * Print out all the statistics for the given vdev.  This can either be the
+ * toplevel configuration, or called recursively for child vdevs when verbose
+ * output is enabled.
+ *
+ *	name	label printed in the name column; must not be NULL
+ *	oldnv	previous snapshot of this vdev, or NULL, in which case an
+ *		all-zero baseline is used
+ *	newnv	current snapshot of this vdev
+ *	cb	iostat state (name column width, verbose flag)
+ *	depth	indentation, in columns, applied before the name
+ *
+ * Operation and bandwidth figures are the difference between the two
+ * snapshots, scaled by NANOSEC over the vs_timestamp delta.
+ */
+void
+print_vdev_stats(const char *name, nvlist_t *oldnv, nvlist_t *newnv,
+	iostat_cbdata_t *cb, int depth)
+{
+	nvlist_t **oldchild, **newchild;
+	uint_t c, children;
+	uint_t oldchildren = 0;
+	vdev_stat_t *oldvs, *newvs;
+	vdev_stat_t zerovs = { 0 };
+	uint64_t tdelta;
+	double scale;
+
+	if (oldnv != NULL) {
+		verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
+		    (uint64_t **)&oldvs, &c) == 0);
+	} else {
+		oldvs = &zerovs;
+	}
+
+	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&newvs, &c) == 0);
+
+	/* Pad the name out to the column width unless it already overflows. */
+	if (strlen(name) + depth > cb->cb_namewidth)
+		(void) printf("%*s%s", depth, "", name);
+	else
+		(void) printf("%*s%s%*s", depth, "", name,
+		    (int)(cb->cb_namewidth - strlen(name) - depth), "");
+
+	tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
+
+	/* Avoid dividing by zero when both snapshots carry the same time. */
+	if (tdelta == 0)
+		scale = 1.0;
+	else
+		scale = (double)NANOSEC / tdelta;
+
+	/* only toplevel vdevs have capacity stats */
+	if (newvs->vs_space == 0) {
+		(void) printf("      -      -");
+	} else {
+		print_one_stat(newvs->vs_alloc);
+		print_one_stat(newvs->vs_space - newvs->vs_alloc);
+	}
+
+	print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_READ] -
+	    oldvs->vs_ops[ZIO_TYPE_READ])));
+
+	print_one_stat((uint64_t)(scale * (newvs->vs_ops[ZIO_TYPE_WRITE] -
+	    oldvs->vs_ops[ZIO_TYPE_WRITE])));
+
+	print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_READ] -
+	    oldvs->vs_bytes[ZIO_TYPE_READ])));
+
+	print_one_stat((uint64_t)(scale * (newvs->vs_bytes[ZIO_TYPE_WRITE] -
+	    oldvs->vs_bytes[ZIO_TYPE_WRITE])));
+
+	(void) printf("\n");
+
+	if (!cb->cb_verbose)
+		return;
+
+	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
+	    &newchild, &children) != 0)
+		return;
+
+	if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
+	    &oldchild, &oldchildren) != 0)
+		return;
+
+	/*
+	 * Never index the old child array beyond its own length; a child
+	 * with no old counterpart is reported against the zero baseline.
+	 */
+	for (c = 0; c < children; c++)
+		print_vdev_stats(vdev_get_name(newchild[c]),
+		    (oldnv != NULL && c < oldchildren) ? oldchild[c] : NULL,
+		    newchild[c], cb, depth + 2);
+}
+
+/*
+ * Per-pool iostat callback.  Refreshes the pool's statistics and prints one
+ * row (plus child rows and a separator in verbose mode).  A pool whose stats
+ * can no longer be refreshed is dropped from the list.  Always returns 0.
+ */
+int
+print_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cb = data;
+	nvlist_t *prevconfig, *curconfig;
+	nvlist_t *prevroot = NULL;
+	nvlist_t *curroot;
+	uint64_t prevtxg, curtxg;
+
+	if (zpool_refresh_stats(zhp, &prevconfig, &curconfig) != 0) {
+		/*
+		 * The pool went away underneath us; forget about it and
+		 * carry on with the others.
+		 */
+		pool_list_remove(cb->cb_list, zhp);
+		return (0);
+	}
+
+	/* The first pass never uses a previous snapshot. */
+	if (cb->cb_iteration == 1 && prevconfig != NULL) {
+		nvlist_free(prevconfig);
+		prevconfig = NULL;
+	}
+
+	verify(nvlist_lookup_uint64(newconfig_key_check(curconfig),
+	    ZPOOL_CONFIG_POOL_TXG, &curtxg) == 0);
+	verify(nvlist_lookup_nvlist(curconfig, ZPOOL_CONFIG_VDEV_TREE,
+	    &curroot) == 0);
+
+	/*
+	 * The previous vdev tree is only used as a baseline when it exists
+	 * and was taken at the same txg as the current one.
+	 */
+	if (prevconfig != NULL &&
+	    nvlist_lookup_uint64(prevconfig, ZPOOL_CONFIG_POOL_TXG,
+	    &prevtxg) == 0 && prevtxg == curtxg) {
+		if (nvlist_lookup_nvlist(prevconfig, ZPOOL_CONFIG_VDEV_TREE,
+		    &prevroot) != 0)
+			prevroot = NULL;
+	}
+
+	/*
+	 * Print out the statistics for the pool.
+	 */
+	print_vdev_stats(zpool_get_name(zhp), prevroot, curroot, cb, 0);
+
+	if (cb->cb_verbose)
+		print_iostat_separator(cb);
+
+	if (prevconfig != NULL)
+		nvlist_free(prevconfig);
+
+	return (0);
+}
+
+/*
+ * Per-pool callback that widens cb_namewidth to fit this pool.
+ *
+ * In non-verbose mode the candidate width is the length of the pool name; in
+ * verbose mode it is whatever max_width() reports for the pool's vdev tree.
+ * The running value is only ever increased, so after iterating over every
+ * pool it holds the maximum across all of them.  Always returns 0.
+ */
+int
+get_namewidth(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cb = data;
+	nvlist_t *config, *nvroot;
+	int width;
+
+	if ((config = zpool_get_config(zhp)) != NULL) {
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+		if (!cb->cb_verbose)
+			width = (int)strlen(zpool_get_name(zhp));
+		else
+			width = max_width(nvroot, 0, 0);
+
+		/* Keep the widest value seen so far, not the latest one. */
+		if (width > cb->cb_namewidth)
+			cb->cb_namewidth = width;
+	}
+
+	/*
+	 * The width must fall into the range [10,38].  The upper limit is the
+	 * maximum we can have and still fit in 80 columns.
+	 */
+	if (cb->cb_namewidth < 10)
+		cb->cb_namewidth = 10;
+	if (cb->cb_namewidth > 38)
+		cb->cb_namewidth = 38;
+
+	return (0);
+}
+
+/*
+ * zpool iostat [-v] [pool] ... [interval [count]]
+ *
+ *	-v	Display statistics for individual vdevs
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes.  The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
+ * on pool_list_update() to detect the addition of new pools.  Configuration
+ * changes are all handled within libzfs.
+ *
+ * Returns the status reported through pool_list_get(), or 1 when no pool
+ * could be found.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+	int c;
+	int ret;
+	int npools;
+	unsigned long interval = 0, count = 0;
+	zpool_list_t *list;
+	int verbose = FALSE;
+	iostat_cbdata_t cb;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Determine if the last argument is an integer or a pool name
+	 */
+	if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
+		char *end;
+
+		errno = 0;
+		interval = strtoul(argv[argc - 1], &end, 10);
+
+		if (*end == '\0' && errno == 0) {
+			if (interval == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(FALSE);
+			}
+
+			/*
+			 * Ignore the last parameter
+			 */
+			argc--;
+		} else {
+			/*
+			 * If this is not a valid number, just plow on.  The
+			 * user will get a more informative error message later
+			 * on.
+			 */
+			interval = 0;
+		}
+	}
+
+	/*
+	 * If the last remaining argument is also an integer, then we have
+	 * both an interval and a count: the number parsed above was really
+	 * the count.  Parse into a temporary so that a malformed argument
+	 * leaves the already-accepted interval untouched.
+	 */
+	if (argc > 0 && isdigit((unsigned char)argv[argc - 1][0])) {
+		char *end;
+		unsigned long value;
+
+		errno = 0;
+		value = strtoul(argv[argc - 1], &end, 10);
+
+		if (*end == '\0' && errno == 0) {
+			if (value == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(FALSE);
+			}
+
+			count = interval;
+			interval = value;
+
+			/*
+			 * Ignore the last parameter
+			 */
+			argc--;
+		}
+	}
+
+	/*
+	 * Construct the list of all interesting pools.
+	 */
+	ret = 0;
+	if ((list = pool_list_get(argc, argv, &ret)) == NULL)
+		return (1);
+
+	if (pool_list_count(list) == 0 && argc != 0) {
+		pool_list_free(list);
+		return (1);
+	}
+
+	if (pool_list_count(list) == 0 && interval == 0) {
+		(void) fprintf(stderr, gettext("no pools available\n"));
+		pool_list_free(list);
+		return (1);
+	}
+
+	/*
+	 * Enter the main iostat loop.
+	 */
+	cb.cb_list = list;
+	cb.cb_verbose = verbose;
+	cb.cb_iteration = 0;
+	cb.cb_namewidth = 0;
+
+	for (;;) {
+		pool_list_update(list);
+
+		if ((npools = pool_list_count(list)) == 0)
+			break;
+
+		/*
+		 * Iterate over all pools to determine the maximum width
+		 * for the pool / device name column across all pools.
+		 */
+		cb.cb_namewidth = 0;
+		(void) pool_list_iter(list, FALSE, get_namewidth, &cb);
+
+		/*
+		 * If it's the first time, or verbose mode, print the header.
+		 */
+		if (++cb.cb_iteration == 1 || verbose)
+			print_iostat_header(&cb);
+
+		(void) pool_list_iter(list, FALSE, print_iostat, &cb);
+
+		/*
+		 * If there's more than one pool, and we're not in verbose mode
+		 * (which prints a separator for us), then print a separator.
+		 */
+		if (npools > 1 && !verbose)
+			print_iostat_separator(&cb);
+
+		if (verbose)
+			(void) printf("\n");
+
+		/* No interval means a single report. */
+		if (interval == 0)
+			break;
+
+		/* A count of zero means "repeat forever". */
+		if (count != 0 && --count == 0)
+			break;
+
+		(void) sleep(interval);
+	}
+
+	pool_list_free(list);
+
+	return (ret);
+}
+
+/*
+ * State passed to list_callback() by zpool_do_list().
+ */
+typedef struct list_cbdata {
+	int	cb_scripted;	/* -H: no header, tab-separated fields */
+	int	cb_first;	/* TRUE until the first pool has been listed */
+	int	cb_fields[MAX_FIELDS];	/* column_table indices to display */
+	int	cb_fieldcount;	/* number of valid entries in cb_fields */
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ *
+ *	fields	array of indices into column_table
+ *	count	number of entries in 'fields'
+ *
+ * Columns are separated by two spaces.  Every column but the last is padded
+ * to its cd_width; the last one is printed at its natural length so that no
+ * trailing blanks are emitted.
+ */
+void
+print_header(int *fields, size_t count)
+{
+	size_t i;
+	column_def_t *col;
+	const char *fmt;
+	int width;
+
+	for (i = 0; i < count; i++) {
+		col = &column_table[fields[i]];
+		if (i != 0)
+			(void) printf("  ");
+		if (col->cd_justify == left_justify)
+			fmt = "%-*s";
+		else
+			fmt = "%*s";
+
+		/* A '*' field width must be passed as an int, not a size_t. */
+		if (i == count - 1)
+			width = (int)strlen(col->cd_title);
+		else
+			width = (int)col->cd_width;
+
+		(void) printf(fmt, width, col->cd_title);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Per-pool callback for 'zpool list'.  Prints the header once (unless in
+ * scripted mode) and then one row containing the requested fields for 'zhp'.
+ * Pools in the POOL_STATE_UNAVAIL state, or without a config, show "-" for
+ * every field that needs the config and "FAULTED" for health.  Always
+ * returns 0.
+ */
+int
+list_callback(zpool_handle_t *zhp, void *data)
+{
+	list_cbdata_t *cbp = data;
+	nvlist_t *config;
+	int i;
+	char buf[ZPOOL_MAXNAMELEN];
+	uint64_t total = 0;
+	uint64_t used = 0;
+	const char *fmt;
+	column_def_t *col;
+	int width;
+
+	if (cbp->cb_first) {
+		if (!cbp->cb_scripted)
+			print_header(cbp->cb_fields, cbp->cb_fieldcount);
+		cbp->cb_first = FALSE;
+	}
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		config = NULL;
+	} else {
+		config = zpool_get_config(zhp);
+		total = zpool_get_space_total(zhp);
+		used = zpool_get_space_used(zhp);
+	}
+
+	for (i = 0; i < cbp->cb_fieldcount; i++) {
+		if (i != 0) {
+			if (cbp->cb_scripted)
+				(void) printf("\t");
+			else
+				(void) printf("  ");
+		}
+
+		col = &column_table[cbp->cb_fields[i]];
+
+		switch (cbp->cb_fields[i]) {
+		case ZPOOL_FIELD_NAME:
+			(void) strlcpy(buf, zpool_get_name(zhp), sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_SIZE:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(total, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_USED:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(used, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_AVAILABLE:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else
+				zfs_nicenum(total - used, buf, sizeof (buf));
+			break;
+
+		case ZPOOL_FIELD_CAPACITY:
+			if (config == NULL) {
+				(void) strlcpy(buf, "-", sizeof (buf));
+			} else {
+				/* An empty pool is 0% full, not a divide by 0. */
+				uint64_t capacity = (total == 0 ? 0 :
+				    (used * 100 / total));
+				(void) snprintf(buf, sizeof (buf), "%llu%%",
+				    (u_longlong_t)capacity);
+			}
+			break;
+
+		case ZPOOL_FIELD_HEALTH:
+			if (config == NULL) {
+				(void) strlcpy(buf, "FAULTED", sizeof (buf));
+			} else {
+				nvlist_t *nvroot;
+				vdev_stat_t *vs;
+				uint_t vsc;
+
+				verify(nvlist_lookup_nvlist(config,
+				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+				verify(nvlist_lookup_uint64_array(nvroot,
+				    ZPOOL_CONFIG_STATS, (uint64_t **)&vs,
+				    &vsc) == 0);
+				(void) strlcpy(buf, state_to_name(vs->vs_state),
+				    sizeof (buf));
+			}
+			break;
+
+		case ZPOOL_FIELD_ROOT:
+			if (config == NULL)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			else if (zpool_get_root(zhp, buf, sizeof (buf)) != 0)
+				(void) strlcpy(buf, "-", sizeof (buf));
+			break;
+
+		default:
+			/* Never print an uninitialized buffer. */
+			(void) strlcpy(buf, "-", sizeof (buf));
+			break;
+		}
+
+		if (cbp->cb_scripted)
+			(void) printf("%s", buf);
+		else {
+			if (col->cd_justify == left_justify)
+				fmt = "%-*s";
+			else
+				fmt = "%*s";
+
+			/*
+			 * The last column is not padded.  A '*' field width
+			 * must be passed as an int, not a size_t.
+			 */
+			if (i == cbp->cb_fieldcount - 1)
+				width = (int)strlen(buf);
+			else
+				width = (int)col->cd_width;
+
+			(void) printf(fmt, width, buf);
+		}
+	}
+
+	(void) printf("\n");
+
+	return (0);
+}
+
+/*
+ * zpool list [-H] [-o field[,field]*] [pool] ...
+ *
+ *	-H	Scripted mode.  Don't display headers, and separate fields by
+ *		a single tab.
+ *	-o	List of fields to display.  Defaults to all fields, or
+ *		"name,size,used,available,capacity,health,root"
+ *
+ * List all pools in the system, whether or not they're healthy.  Output space
+ * statistics for each one, as well as health status summary.
+ */
+int
+zpool_do_list(int argc, char **argv)
+{
+	static char default_fields[] =
+	    "name,size,used,available,capacity,health,root";
+	list_cbdata_t cb = { 0 };
+	char *fields = default_fields;
+	char *value;
+	int opt;
+	int ret;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, ":Ho:")) != -1) {
+		if (opt == 'H') {
+			cb.cb_scripted = TRUE;
+		} else if (opt == 'o') {
+			fields = optarg;
+		} else if (opt == ':') {
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(FALSE);
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Translate the comma-separated field list into column indices, one
+	 * suboption at a time.
+	 */
+	for (; *fields != '\0'; cb.cb_fieldcount++) {
+		int field;
+
+		if (cb.cb_fieldcount == MAX_FIELDS) {
+			(void) fprintf(stderr, gettext("too many "
+			    "properties given to -o option\n"));
+			usage(FALSE);
+		}
+
+		field = getsubopt(&fields, column_subopts, &value);
+		cb.cb_fields[cb.cb_fieldcount] = field;
+
+		if (field == -1) {
+			(void) fprintf(stderr, gettext("invalid property "
+			    "'%s'\n"), value);
+			usage(FALSE);
+		}
+	}
+
+	cb.cb_first = TRUE;
+
+	ret = for_each_pool(argc, argv, TRUE, list_callback, &cb);
+
+	/* cb_first still set means the callback never ran. */
+	if (argc == 0 && cb.cb_first) {
+		(void) printf(gettext("no pools available\n"));
+		return (0);
+	}
+
+	return (ret);
+}
+
+/*
+ * Depth-first search of the vdev tree rooted at 'nv' for the leaf whose path
+ * matches 'name'.  A leading "/dev/dsk/" is ignored on both sides of the
+ * comparison.  Returns the matching leaf nvlist, or NULL if there is none.
+ */
+static nvlist_t *
+zpool_get_vdev_by_name(nvlist_t *nv, char *name)
+{
+	static const char dskdir[] = "/dev/dsk/";
+	const size_t dsklen = sizeof (dskdir) - 1;
+	nvlist_t **kids;
+	uint_t nkids, i;
+	nvlist_t *found;
+	char *path;
+
+	/* Interior vdev: look through each child in turn. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		for (i = 0; i < nkids; i++) {
+			found = zpool_get_vdev_by_name(kids[i], name);
+			if (found != NULL)
+				return (found);
+		}
+		return (NULL);
+	}
+
+	/* Leaf vdev: compare paths with the device directory stripped. */
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+
+	if (strncmp(name, dskdir, dsklen) == 0)
+		name += dsklen;
+	if (strncmp(path, dskdir, dsklen) == 0)
+		path += dsklen;
+
+	return (strcmp(name, path) == 0 ? nv : NULL);
+}
+
+/*
+ * Common implementation of 'zpool attach' and 'zpool replace'.
+ *
+ *	[-f] <pool> <device> <new_device>
+ *
+ * When 'replacing' is nonzero <new_device> may be omitted, in which case
+ * <device> is used as its own replacement.  The new device specification is
+ * turned into a vdev tree by make_root_vdev() and handed to
+ * zpool_vdev_attach().  Returns that call's result, or 1 if the pool cannot
+ * be opened or the vdev specification cannot be built.
+ */
+static int
+zpool_do_attach_or_replace(int argc, char **argv, int replacing)
+{
+	int force = FALSE;
+	int c;
+	int ret;
+	nvlist_t *nvroot;
+	char *poolname, *old_disk, *new_disk;
+	zpool_handle_t *zhp;
+	nvlist_t *config;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(FALSE);
+	}
+
+	old_disk = argv[1];
+
+	/*
+	 * Advance argv so that it starts at the new device, which is what
+	 * make_root_vdev() is given below.
+	 */
+	if (argc < 3) {
+		if (!replacing) {
+			(void) fprintf(stderr,
+			    gettext("missing <new_device> specification\n"));
+			usage(FALSE);
+		}
+		new_disk = old_disk;
+		argc -= 1;
+		argv += 1;
+	} else {
+		new_disk = argv[2];
+		argc -= 2;
+		argv += 2;
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(FALSE);
+	}
+
+	if ((zhp = zpool_open(poolname)) == NULL)
+		return (1);
+
+	if ((config = zpool_get_config(zhp)) == NULL) {
+		(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+		    poolname);
+		zpool_close(zhp);
+		return (1);
+	}
+
+	nvroot = make_root_vdev(config, force, B_FALSE, argc, argv);
+	if (nvroot == NULL) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
+
+	/* Release the vdev specification and the pool handle on all paths. */
+	nvlist_free(nvroot);
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool replace [-f] <pool> <device> [new_device]
+ *
+ *	-f	Force the replacement, even if <new_device> appears to be in
+ *		use.
+ *
+ * Replace <device> with <new_device>.  If <new_device> is omitted, <device>
+ * is used as its own replacement.
+ */
+/* ARGSUSED */
+int
+zpool_do_replace(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
+}
+
+/*
+ * zpool attach [-f] <pool> <device> <new_device>
+ *
+ *	-f	Force attach, even if <new_device> appears to be in use.
+ *
+ * Attach <new_device> to the mirror containing <device>.  If <device> is not
+ * part of a mirror, then <device> will be transformed into a mirror of
+ * <device> and <new_device>.  In either case, <new_device> will begin life
+ * with a DTL of [0, now], and will immediately begin to resilver itself.
+ *
+ * Argument parsing and the attach itself are done by
+ * zpool_do_attach_or_replace(); passing B_FALSE for 'replacing' makes
+ * <new_device> mandatory.
+ */
+int
+zpool_do_attach(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
+}
+
+/*
+ * zpool detach [-f] <pool> <device>
+ *
+ *	-f	Force detach of <device>, even if DTLs argue against it
+ *		(not supported yet)
+ *
+ * Detach a device from a mirror.  The operation will be refused if <device>
+ * is the last device in the mirror, or if the DTLs indicate that this device
+ * has the only valid copy of some data.
+ *
+ * Returns the result of zpool_vdev_detach(), or 1 if the pool cannot be
+ * opened.
+ */
+/* ARGSUSED */
+int
+zpool_do_detach(int argc, char **argv)
+{
+	int c;
+	int ret;
+	char *poolname, *path;
+	zpool_handle_t *zhp;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			/*
+			 * Recognized but not implemented.  optopt is only
+			 * meaningful when getopt() returns '?', so report the
+			 * option character itself.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+	path = argv[1];
+
+	if ((zhp = zpool_open(poolname)) == NULL)
+		return (1);
+
+	ret = zpool_vdev_detach(zhp, path);
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool online [-t] <pool> <device> ...
+ *
+ *	-t	Only bring the device on-line temporarily.  The online
+ *		state will not be persistent across reboots.
+ *		(not supported yet)
+ *
+ * Every device named after the pool is brought online in turn.  Returns 0 if
+ * all of them succeeded and 1 otherwise.
+ */
+/* ARGSUSED */
+int
+zpool_do_online(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "t")) != -1) {
+		switch (c) {
+		case 't':
+			/*
+			 * Recognized but not implemented.  optopt is only
+			 * meaningful when getopt() returns '?', so report the
+			 * option character itself.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(poolname)) == NULL)
+		return (1);
+
+	/* Keep going after a failure so every device is attempted. */
+	for (i = 1; i < argc; i++) {
+		if (zpool_vdev_online(zhp, argv[i]) == 0)
+			(void) printf(gettext("Bringing device %s online\n"),
+			    argv[i]);
+		else
+			ret = 1;
+	}
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool offline [-ft] <pool> <device> ...
+ *
+ *	-f	Force the device into the offline state, even if doing
+ *		so would appear to compromise pool availability.
+ *		(not supported yet)
+ *
+ *	-t	Only take the device off-line temporarily.  The offline
+ *		state will not be persistent across reboots.
+ *		(not supported yet)
+ *
+ * Every device named after the pool is taken offline in turn.  Returns 0 if
+ * all of them succeeded and 1 otherwise.
+ */
+/* ARGSUSED */
+int
+zpool_do_offline(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "ft")) != -1) {
+		switch (c) {
+		case 'f':
+		case 't':
+			/*
+			 * Recognized but not implemented.  optopt is only
+			 * meaningful when getopt() returns '?', so report the
+			 * option character itself.
+			 */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    c);
+			usage(FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(poolname)) == NULL)
+		return (1);
+
+	/* Keep going after a failure so every device is attempted. */
+	for (i = 1; i < argc; i++) {
+		if (zpool_vdev_offline(zhp, argv[i]) == 0)
+			(void) printf(gettext("Bringing device %s offline\n"),
+			    argv[i]);
+		else
+			ret = 1;
+	}
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * Argument handed to scrub_callback() for each pool.
+ */
+typedef struct scrub_cbdata {
+	int	cb_type;	/* scrub type passed to zpool_scrub() */
+} scrub_cbdata_t;
+
+/*
+ * Per-pool callback for 'zpool scrub': applies the requested scrub type to
+ * 'zhp'.  Returns 1 if zpool_scrub() reported failure and 0 otherwise.
+ */
+int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+	scrub_cbdata_t *cb = data;
+	int failed;
+
+	failed = (zpool_scrub(zhp, cb->cb_type) != 0);
+
+	return (failed);
+}
+
+/*
+ * zpool scrub [-s] <pool> ...
+ *
+ *	-s	Stop.  Stops any in-progress scrub.
+ *
+ * Runs scrub_callback() on every named pool; at least one pool is required.
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+	scrub_cbdata_t cb;
+	int opt;
+
+	/* Scrub everything unless '-s' asks us to stop instead. */
+	cb.cb_type = POOL_SCRUB_EVERYTHING;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "s")) != -1) {
+		if (opt == 's') {
+			cb.cb_type = POOL_SCRUB_NONE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(FALSE);
+	}
+
+	return (for_each_pool(argc, argv, TRUE, scrub_callback, &cb));
+}
+
+/*
+ * State passed to status_callback() for each pool.
+ */
+typedef struct status_cbdata {
+	int	cb_verbose;	/* NOTE(review): presumably '-v'; confirm */
+	int	cb_explain;	/* nonzero: skip pools whose status is OK */
+	int	cb_count;	/* number of pools visited by the callback */
+	int	cb_first;	/* TRUE until the first pool is printed */
+} status_cbdata_t;
+
+/*
+ * Print out detailed scrub status for the vdev tree rooted at 'nvroot'.
+ *
+ * One of three lines is printed: "none requested" when no scrub has ever
+ * run, a completion/stop summary with the error count and end time when the
+ * last scrub has finished, or a progress line with the percentage done and
+ * an estimate of the time remaining.
+ */
+void
+print_scrub_status(nvlist_t *nvroot)
+{
+	vdev_stat_t *stats;
+	uint_t nstats;
+	time_t started, finished, now;
+	uint64_t examined, total, minutes_left;
+	double fraction_done;
+	char *kind;
+
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+
+	/*
+	 * Nothing has ever been scrubbed, so there is nothing to report.
+	 */
+	if (stats->vs_scrub_end == 0 &&
+	    stats->vs_scrub_type == POOL_SCRUB_NONE) {
+		(void) printf(gettext("none requested\n"));
+		return;
+	}
+
+	if (stats->vs_scrub_type == POOL_SCRUB_RESILVER)
+		kind = "resilver";
+	else
+		kind = "scrub";
+
+	started = stats->vs_scrub_start;
+	finished = stats->vs_scrub_end;
+
+	/* A nonzero end time means the scrub is over; ctime() adds the \n. */
+	if (finished != 0) {
+		(void) printf(gettext("%s %s with %llu errors on %s"),
+		    kind, stats->vs_scrub_complete ? "completed" : "stopped",
+		    (u_longlong_t)stats->vs_scrub_errors, ctime(&finished));
+		return;
+	}
+
+	now = time(NULL);
+	examined = stats->vs_scrub_examined;
+	total = stats->vs_alloc;
+
+	/* Keep the ratio below well-defined and no greater than one. */
+	if (examined == 0)
+		examined = 1;
+	if (examined > total)
+		total = examined;
+
+	fraction_done = (double)examined / total;
+	minutes_left = (uint64_t)((now - started) *
+	    (1 - fraction_done) / fraction_done / 60);
+
+	(void) printf(gettext("%s in progress, %.2f%% done, %lluh%um to go\n"),
+	    kind, 100 * fraction_done,
+	    (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
+}
+
+/*
+ * Print out configuration state as requested by status_callback.
+ *
+ * Emits one row for the vdev 'nv' (name indented by 'depth' within a column
+ * of 'namewidth', state, and read/write/checksum error counts), followed by
+ * either an auxiliary fault description or, for leaf devices, the amount of
+ * data repaired.  Children are then printed recursively, two columns deeper.
+ */
+void
+print_status_config(const char *name, nvlist_t *nv, int namewidth, int depth)
+{
+	nvlist_t **kids;
+	uint_t nkids, nstats, i;
+	vdev_stat_t *stats;
+	char rbuf[6], wbuf[6], cbuf[6], repaired[6];
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+
+	/* A vdev without a children array is a leaf. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		nkids = 0;
+
+	(void) printf("\t%*s%-*s  %-8s", depth, "", namewidth - depth,
+	    name, state_to_name(stats->vs_state));
+
+	zfs_nicenum(stats->vs_read_errors, rbuf, sizeof (rbuf));
+	zfs_nicenum(stats->vs_write_errors, wbuf, sizeof (wbuf));
+	zfs_nicenum(stats->vs_checksum_errors, cbuf, sizeof (cbuf));
+	(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+
+	if (stats->vs_aux == 0) {
+		if (stats->vs_scrub_repaired != 0 && nkids == 0) {
+			/*
+			 * Leaf devices report how much data was
+			 * resilvered or repaired.
+			 */
+			zfs_nicenum(stats->vs_scrub_repaired, repaired,
+			    sizeof (repaired));
+			(void) printf(gettext("  %s %s"), repaired,
+			    (stats->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+			    "resilvered" : "repaired");
+		}
+	} else {
+		(void) printf("  ");
+
+		if (stats->vs_aux == VDEV_AUX_OPEN_FAILED)
+			(void) printf(gettext("cannot open"));
+		else if (stats->vs_aux == VDEV_AUX_BAD_GUID_SUM)
+			(void) printf(gettext("missing device"));
+		else if (stats->vs_aux == VDEV_AUX_NO_REPLICAS)
+			(void) printf(gettext("insufficient replicas"));
+		else
+			(void) printf(gettext("corrupted data"));
+	}
+
+	(void) printf("\n");
+
+	for (i = 0; i < nkids; i++)
+		print_status_config(vdev_get_name(kids[i]), kids[i],
+		    namewidth, depth + 2);
+}
+
+/*
+ * zpool_iter_f callback for 'zpool status'.  Prints a summary such as:
+ *
+ *	  pool: tank
+ *	 state: DEGRADED
+ *	status: One or more devices ...
+ *	action: ...
+ *	   see: http://www.sun.com/msg/ZFS-xxxx-01
+ *	 scrub: ...
+ *	config: NAME  STATE  READ WRITE CKSUM rows, one per vdev
+ *
+ * 'data' is a status_cbdata_t.  With cb_explain ('-x') set, pools whose status
+ * is ZPOOL_STATUS_OK are counted but not printed.  The 'see:' line appears
+ * only when zpool_get_status() supplies a message ID.  Always returns 0.
+ */
+int
+status_callback(zpool_handle_t *zhp, void *data)
+{
+	status_cbdata_t *cbp = data;
+	nvlist_t *config, *nvroot;
+	char *msgid;
+	int reason;
+	char *health;
+
+	config = zpool_get_config(zhp);
+	reason = zpool_get_status(zhp, &msgid);
+
+	cbp->cb_count++;
+
+	/*
+	 * If we were given 'zpool status -x', only report those pools with
+	 * problems.
+	 */
+	if (reason == ZPOOL_STATUS_OK && cbp->cb_explain)
+		return (0);
+
+	if (cbp->cb_first)
+		cbp->cb_first = FALSE;
+	else
+		(void) printf("\n");
+	/* NOTE(review): 'config' is used here, ahead of its NULL check below. */
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_HEALTH,
+	    &health) == 0);
+
+	(void) printf(gettext("  pool: %s\n"), zpool_get_name(zhp));
+	(void) printf(gettext(" state: %s\n"), health);
+
+	switch (reason) {
+	case ZPOOL_STATUS_MISSING_DEV_R:
+		(void) printf(gettext("status: One or more devices could not "
+		    "be opened.  Sufficient replicas exist for\n\tthe pool to "
+		    "continue functioning in a degraded state.\n"));
+		(void) printf(gettext("action: Attach the missing device and "
+		    "online it using 'zpool online'.\n"));
+		break;
+
+	case ZPOOL_STATUS_MISSING_DEV_NR:
+		(void) printf(gettext("status: One or more devices could not "
+		    "be opened.  There are insufficient\n\treplicas for the "
+		    "pool to continue functioning.\n"));
+		(void) printf(gettext("action: Attach the missing device and "
+		    "online it using 'zpool online'.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_R:
+		(void) printf(gettext("status: One or more devices could not "
+		    "be used because the label is missing or\n\tinvalid.  "
+		    "Sufficient replicas exist for the pool to continue\n\t"
+		    "functioning in a degraded state.\n"));
+		(void) printf(gettext("action: Replace the device using "
+		    "'zpool replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+		(void) printf(gettext("status: One or more devices could not "
+		    "be used because the label is missing\n\tor invalid.  "
+		    "There are insufficient replicas for the pool to "
+		    "continue\n\tfunctioning.\n"));
+		(void) printf(gettext("action: Destroy and re-create the pool "
+		    "from a backup source.\n"));
+		break;
+
+	case ZPOOL_STATUS_FAILING_DEV:
+		(void) printf(gettext("status: One or more devices has "
+		    "experienced an unrecoverable error.  An\n\tattempt was "
+		    "made to correct the error.  Applications are "
+		    "unaffected.\n"));
+		(void) printf(gettext("action: Determine if the device needs "
+		    "to be replaced, and clear the errors\n\tusing "
+		    "'zpool online' or replace the device with 'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_OFFLINE_DEV:
+		(void) printf(gettext("status: One or more devices has "
+		    "been taken offline by the administrator.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning in "
+		    "a\n\tdegraded state.\n"));
+		(void) printf(gettext("action: Online the device using "
+		    "'zpool online' or replace the device with\n\t'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_RESILVERING:
+		(void) printf(gettext("status: One or more devices is "
+		    "currently being resilvered.  The pool will\n\tcontinue "
+		    "to function, possibly in a degraded state.\n"));
+		(void) printf(gettext("action: Wait for the resilver to "
+		    "complete.\n"));
+		break;
+
+	default:
+		/*
+		 * The remaining errors can't actually be generated, yet.
+		 */
+		assert(reason == ZPOOL_STATUS_OK);
+	}
+
+	if (msgid != NULL)
+		(void) printf(gettext("   see: http://www.sun.com/msg/%s\n"),
+		    msgid);
+
+	if (config != NULL) {
+		int namewidth;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+
+		(void) printf(gettext(" scrub: "));
+		print_scrub_status(nvroot);
+
+		namewidth = max_width(nvroot, 0, 0);
+		if (namewidth < 10)
+			namewidth = 10;
+
+		(void) printf(gettext("config:\n\n"));
+		(void) printf(gettext("\t%-*s  %-8s %5s %5s %5s\n"), namewidth,
+		    "NAME", "STATE", "READ", "WRITE", "CKSUM");
+		print_status_config(zpool_get_name(zhp), nvroot, namewidth, 0);
+	} else {
+		(void) printf(gettext("config: The configuration cannot be "
+		    "determined.\n"));
+	}
+
+	return (0);
+}
+
+/*
+ * zpool status [-vx] [pool] ...
+ *
+ *	-v	Display complete error logs (sets cb_verbose)
+ *	-x	Display only pools whose status is not ZPOOL_STATUS_OK
+ *
+ * Describes the health status of all pools or some subset.
+ */
+int
+zpool_do_status(int argc, char **argv)
+{
+	int c;
+	int ret;
+	status_cbdata_t cb = { 0 };
+
+	/* check options */
+	while ((c = getopt(argc, argv, "vx")) != -1) {
+		switch (c) {
+		case 'v':
+			cb.cb_verbose = TRUE;
+			break;
+		case 'x':
+			cb.cb_explain = TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	cb.cb_first = TRUE;	/* cleared once a pool is printed */
+
+	ret = for_each_pool(argc, argv, TRUE, status_callback, &cb);
+	/* cb_first still TRUE means status_callback printed no pool at all. */
+	if (argc == 0 && cb.cb_count == 0)
+		(void) printf(gettext("no pools available\n"));
+	else if (cb.cb_explain && cb.cb_first) {
+		if (argc == 0) {
+			(void) printf(gettext("all pools are healthy\n"));
+		} else {
+			int i;
+			for (i = 0; i < argc; i++)
+				(void) printf(gettext("pool '%s' is healthy\n"),
+				    argv[i]);
+		}
+	}
+
+	return (ret);	/* for_each_pool() result */
+}
+
+/*
+ * Entry point.  Dispatches argv[1] to the matching command_table entry and
+ * returns that command's result; also handles '-?' and debug-only 'freeze'.
+ */
+int
+main(int argc, char **argv)
+{
+	int i, ret = 1;
+	char *cmdname;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	opterr = 0;
+
+	/* Make sure the user has specified some command. */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing command\n"));
+		usage(FALSE);
+	}
+
+	cmdname = argv[1];
+
+	/* Special case '-?' */
+	if (strcmp(cmdname, "-?") == 0)
+		usage(TRUE);
+
+	/* Run the appropriate command. */
+	for (i = 0; i < NCOMMAND; i++) {
+		if (command_table[i].name == NULL)
+			continue;
+
+		if (strcmp(cmdname, command_table[i].name) == 0) {
+			current_command = &command_table[i];
+			ret = command_table[i].func(argc - 1, argv + 1);
+			break;
+		}
+	}
+
+	/* 'freeze' is a vile debugging abomination, so we treat it as such. */
+	if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
+		char buf[8192];
+		int fd;
+
+		/* Fail cleanly instead of overrunning buf or using a bad fd. */
+		if (strlen(argv[2]) >= sizeof (buf) ||
+		    (fd = open("/dev/zpoolctl", O_RDWR)) < 0)
+			return (1);
+		(void) strcpy(buf, argv[2]);
+		return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf));
+	}
+
+	if (i == NCOMMAND) {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(FALSE);
+	}
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
diff --git a/usr/src/cmd/zpool/zpool_util.c b/usr/src/cmd/zpool/zpool_util.c
new file mode 100644
index 000000000000..a9e7bb600cf7
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_util.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+#include "zpool_util.h"
+
+/*
+ * Allocate 'size' zero-filled bytes; print an error and exit(1) on failure.
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf != NULL)
+		return (buf);
+
+	(void) fprintf(stderr, "internal error: out of memory\n");
+	exit(1);
+	/* NOTREACHED */
+}
+
+/*
+ * Duplicate 'str' with strdup(); print an error and exit(1) if that fails.
+ */
+char *
+safe_strdup(const char *str)
+{
+	char *copy = strdup(str);
+
+	if (copy != NULL)
+		return (copy);
+
+	(void) fprintf(stderr, "internal error: out of memory\n");
+	exit(1);
+	/* NOTREACHED */
+}
+
+/*
+ * Report an out-of-memory condition on stderr and exit with status 1.  Callers
+ * are expected to arrive here with errno == ENOMEM; that is asserted first.
+ */
+void
+no_memory(void)
+{
+	assert(errno == ENOMEM);
+	(void) fputs(gettext("internal error: out of memory\n"), stderr);
+	exit(1);
+}
+
+/*
+ * Return the display name for vdev 'nv': its ZPOOL_CONFIG_PATH with a leading
+ * "/dev/dsk/" removed when present, otherwise its ZPOOL_CONFIG_TYPE string.
+ */
+const char *
+vdev_get_name(nvlist_t *nv)
+{
+	static const char dskdir[] = "/dev/dsk/";
+	char *str;
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &str) != 0) {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &str) == 0);
+		return (str);
+	}
+
+	if (strncmp(str, dskdir, sizeof (dskdir) - 1) == 0)
+		str += sizeof (dskdir) - 1;
+	return (str);
+}
diff --git a/usr/src/cmd/zpool/zpool_util.h b/usr/src/cmd/zpool/zpool_util.h
new file mode 100644
index 000000000000..7287a96cb376
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_util.h
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	ZPOOL_UTIL_H
+#define	ZPOOL_UTIL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <libnvpair.h>
+#include <libzfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Basic utility functions
+ */
+void *safe_malloc(size_t);		/* zeroed allocation; exits on failure */
+char *safe_strdup(const char *);	/* strdup(); exits on failure */
+void no_memory(void);			/* prints an error and exits */
+/* Display name: path minus any leading "/dev/dsk/", else the vdev type. */
+const char *vdev_get_name(nvlist_t *nv);
+
+/*
+ * Virtual device functions
+ */
+nvlist_t *make_root_vdev(nvlist_t *poolconfig, int force, int check_rep,
+    int argc, char **argv);
+
+/*
+ * Pool list functions
+ */
+int for_each_pool(int, char **, int unavail, zpool_iter_f, void *);
+
+typedef struct zpool_list zpool_list_t;
+
+zpool_list_t *pool_list_get(int, char **, int *);
+void pool_list_update(zpool_list_t *);
+int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
+void pool_list_free(zpool_list_t *);
+int pool_list_count(zpool_list_t *);
+void pool_list_remove(zpool_list_t *, zpool_handle_t *);
+
+/*
+ * Dataset functions
+ */
+int unmount_datasets(zpool_handle_t *, int);
+int mount_datasets(zpool_handle_t *, const char *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* ZPOOL_UTIL_H */
diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c
new file mode 100644
index 000000000000..669807285da4
--- /dev/null
+++ b/usr/src/cmd/zpool/zpool_vdev.c
@@ -0,0 +1,1395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration.  Each entry in the list can be one of:
+ *
+ * 	Device vdevs
+ * 		disk=(path=..., devid=...)
+ * 		file=(path=...)
+ *
+ * 	Group vdevs
+ * 		raidz=(...)
+ * 		mirror=(...)
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs.  All userland verification of devices is contained within
+ * this file.  If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * The only function exported by this file is 'get_vdev_spec'.  The function
+ * performs several passes:
+ *
+ * 	1. Construct the vdev specification.  Performs syntax validation and
+ *         makes sure each device is valid.
+ * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
+ *         devices are already in use.  Some can be overridden using the
+ *         'force' flag, others cannot.
+ * 	3. Check for replication errors if the 'force' flag is not specified.
+ *         Validates that the replication level is consistent across the
+ *         entire pool.
+ * 	4. Label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <devid.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libdiskmgt.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+#include <sys/stat.h>
+#include <sys/vtoc.h>
+#include <sys/mntent.h>
+
+#include <libzfs.h>
+
+#include "zpool_util.h"
+
+#define	DISK_ROOT	"/dev/dsk"
+#define	RDISK_ROOT	"/dev/rdsk"
+#define	BACKUP_SLICE	"s2"
+
+/*
+ * A single vdev specification can produce several errors.  vdev_error() prints
+ * a one-time header before the first of them ('error_seen' records that), with
+ * advice that depends on 'is_force', then the caller's printf-style message.
+ */
+int error_seen;
+int is_force;
+
+void
+vdev_error(const char *fmt, ...)
+{
+	va_list args;
+
+	if (error_seen == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
+		if (is_force)
+			(void) fprintf(stderr, gettext("the following errors "
+			    "must be manually repaired:\n"));
+		else
+			(void) fprintf(stderr, gettext("use '-f' to override "
+			    "the following errors:\n"));
+		error_seen = TRUE;
+	}
+
+	va_start(args, fmt);
+	(void) vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Fatal libdiskmgt error handler; use it through the libdskmgt_error() macro.
+ * An 'err' of 0 is treated as out of memory.  Every path ends the process.
+ */
+void
+_libdskmgt_error(int err, const char *file, int line)
+{
+	if (err == 0)
+		no_memory();
+	if (err != EACCES) {
+		(void) fprintf(stderr, gettext("internal error: disk "
+		    "configuration error %d at line %d of file %s\n"),
+		    err, line, file);
+		abort();
+	}
+
+	/* libdiskmgt needs root to examine some devices; bail out gracefully. */
+	(void) fprintf(stderr, gettext("cannot determine disk "
+	    "configuration: permission denied\n"));
+	exit(1);
+}
+
+#define	libdskmgt_error(err)	(_libdskmgt_error((err), __FILE__, __LINE__))
+
+/*
+ * Checks whether a single slice overlaps with any of the slices in the provided
+ * list.  Called by check_overlapping().  Returns 1 with a copy of the other
+ * slice's name in '*overlaps_with' (caller frees), 0 if none is found, or
+ * -1 with '*error' set when a libdiskmgt call or attribute lookup fails.
+ */
+int
+is_overlapping(dm_descriptor_t slice, dm_descriptor_t media,
+	dm_descriptor_t *slice_list, int *error, char **overlaps_with)
+{
+	int 		i = 0;
+	uint32_t	in_snum;
+	uint64_t 	start_block = 0;
+	uint64_t 	end_block = 0;
+	uint64_t 	media_size = 0;
+	uint64_t 	size = 0;
+	nvlist_t 	*media_attrs;
+	nvlist_t 	*slice_attrs;
+
+	media_attrs = dm_get_attributes(media, error);
+	if (*error != 0) {
+		return (-1);
+	}
+
+	if (media_attrs == NULL) {
+		return (0);
+	}
+
+	*error = nvlist_lookup_uint64(media_attrs, DM_NACCESSIBLE, &media_size);
+	if (*error != 0) {
+		nvlist_free(media_attrs);
+		return (-1);
+	}
+
+	slice_attrs = dm_get_attributes(slice, error);
+	if (*error != 0) {
+		nvlist_free(media_attrs);
+		return (-1);
+	}
+	/*
+	 * Not really possible, but the error above would catch any system
+	 * errors.
+	 */
+	if (slice_attrs == NULL) {
+		nvlist_free(media_attrs);
+		return (0);
+	}
+
+	*error = nvlist_lookup_uint64(slice_attrs, DM_START, &start_block);
+	if (*error != 0) {
+		nvlist_free(media_attrs);
+		nvlist_free(slice_attrs);
+		return (-1);
+	}
+
+	*error = nvlist_lookup_uint64(slice_attrs, DM_SIZE, &size);
+	if (*error != 0) {
+		nvlist_free(media_attrs);
+		nvlist_free(slice_attrs);
+		return (-1);
+	}
+	*error = nvlist_lookup_uint32(slice_attrs, DM_INDEX, &in_snum);
+	if (*error != 0) {
+		nvlist_free(media_attrs);
+		nvlist_free(slice_attrs);
+		return (-1);
+	}
+
+	end_block = (start_block + size) - 1;
+
+	for (i = 0; slice_list[i]; i ++) {
+		uint64_t other_start;
+		uint64_t other_end;
+		uint64_t other_size;
+		uint32_t snum;
+
+		nvlist_t *other_attrs = dm_get_attributes(slice_list[i], error);
+		if (*error != 0) {
+			nvlist_free(media_attrs);
+			nvlist_free(slice_attrs);
+			return (-1);
+		}
+
+		if (other_attrs == NULL)
+			continue;
+
+		*error = nvlist_lookup_uint64(other_attrs, DM_START,
+			&other_start);
+		if (*error) {
+		    nvlist_free(media_attrs);
+		    nvlist_free(slice_attrs);
+		    nvlist_free(other_attrs);
+		    return (-1);
+		}
+
+		*error = nvlist_lookup_uint64(other_attrs, DM_SIZE,
+			&other_size);
+
+		if (*error) {
+		    nvlist_free(media_attrs);
+		    nvlist_free(slice_attrs);
+		    nvlist_free(other_attrs);
+		    return (-1);
+		}
+
+		other_end = (other_size + other_start) - 1;
+
+		*error = nvlist_lookup_uint32(other_attrs, DM_INDEX,
+			&snum);
+
+		if (*error) {
+		    nvlist_free(media_attrs);
+		    nvlist_free(slice_attrs);
+		    nvlist_free(other_attrs);
+		    return (-1);
+		}
+
+		/*
+		 * Check to see if there are > 2 overlapping regions
+		 * on this media in the same region as this slice.
+		 * This is done by assuming the following:
+		 *   	Slice 2 is the backup slice if it is the size
+		 *	of the whole disk
+		 * If slice 2 is the overlap and slice 2 is the size of
+		 * the whole disk, continue. If another slice is found
+		 * that overlaps with our slice, return it.
+		 * There is the potential that there is more than one slice
+		 * that our slice overlaps with, however, we only return
+		 * the first overlapping slice we find.
+		 *
+		 */
+
+		if (start_block >= other_start && start_block <= other_end) {
+			if (!(snum == 2 && other_size == media_size) &&
+				snum != in_snum) {
+				char *str = dm_get_name(slice_list[i], error);
+				if (*error != 0) {
+					nvlist_free(media_attrs);
+					nvlist_free(slice_attrs);
+					nvlist_free(other_attrs);
+					return (-1);
+				}
+				*overlaps_with = safe_strdup(str);
+				dm_free_name(str);
+				nvlist_free(media_attrs);
+				nvlist_free(slice_attrs);
+				nvlist_free(other_attrs);
+				return (1);
+			}
+		} else if (other_start >= start_block &&
+			other_start <= end_block) {
+			if (!(snum == 2 && other_size == media_size) &&
+				snum != in_snum) {
+				char *str = dm_get_name(slice_list[i], error);
+				if (*error != 0) {
+					nvlist_free(media_attrs);
+					nvlist_free(slice_attrs);
+					nvlist_free(other_attrs);
+					return (-1);
+				}
+				*overlaps_with = safe_strdup(str);
+				dm_free_name(str);
+				nvlist_free(media_attrs);
+				nvlist_free(slice_attrs);
+				nvlist_free(other_attrs);
+				return (1);
+			}
+		}
+		nvlist_free(other_attrs);
+	}
+	nvlist_free(media_attrs);
+	nvlist_free(slice_attrs);
+	return (0);
+}
+
+/*
+ * Check to see whether the given slice overlaps with any other slices.  Get the
+ * associated slice information and pass on to is_overlapping().  Returns -1
+ * after reporting an overlap through vdev_error(), and 0 if there is none.
+ */
+int
+check_overlapping(const char *slicename, dm_descriptor_t slice)
+{
+	dm_descriptor_t *media;
+	dm_descriptor_t *slices;
+	int error, status;
+	char *overlaps = NULL;
+
+	/*
+	 * Get the list of slices by fetching the associated media, and then all
+	 * associated slices.
+	 */
+	media = dm_get_associated_descriptors(slice, DM_MEDIA, &error);
+	if (media == NULL || *media == NULL || error != 0)
+		libdskmgt_error(error);
+
+	slices = dm_get_associated_descriptors(*media, DM_SLICE, &error);
+	if (slices == NULL || *slices == NULL || error != 0)
+		libdskmgt_error(error);
+
+	/* A negative result is a failed lookup, not an overlap. */
+	status = is_overlapping(slice, *media, slices, &error, &overlaps);
+	if (status < 0)
+		libdskmgt_error(error);
+	if (status > 0) {
+		vdev_error(gettext("device '%s' overlaps with '%s'\n"),
+		    slicename, overlaps);
+	}
+
+	free(overlaps);
+	dm_free_descriptors(slices);
+	dm_free_descriptors(media);
+
+	return (status > 0 ? -1 : 0);
+}
+
+/*
+ * Validate the given slice.  If 'force' is set, then the user specified '-f'
+ * and we only want to report error for completely forbidden uses.  If
+ * 'overlap' is set and 'force' is not, overlapping slices are checked as well.
+ * Returns -1 if anything was reported through vdev_error(), and 0 otherwise.
+ */
+int
+check_slice(const char *slicename, dm_descriptor_t slice, int force,
+    int overlap)
+{
+	nvlist_t *stats;
+	int err;
+	nvpair_t *nvwhat, *nvdesc;
+	char *what, *desc, *name;
+	int found = FALSE;
+	int found_zfs = FALSE;
+	int fd;
+
+	if ((stats = dm_get_stats(slice, DM_SLICE_STAT_USE, &err)) == NULL)
+		libdskmgt_error(err);
+
+	/*
+	 * Always check to see if this is used by an active ZFS pool.
+	 */
+	if ((fd = open(slicename, O_RDONLY)) >= 0) {
+		if (zpool_in_use(fd, &desc, &name)) {
+
+			if (!force) {
+				vdev_error(gettext("%s is part of %s pool "
+				    "'%s'\n"), slicename, desc, name);
+				found = found_zfs = TRUE;
+			}
+
+			free(desc);
+			free(name);
+		}
+
+		(void) close(fd);
+	}
+
+	/*
+	 * This slice is in use.  Print out a descriptive message describing who
+	 * is using it.  The 'used_by' nvlist is formatted as:
+	 *
+	 * 	(used_by=what, used_name=desc, ...)
+	 *
+	 * Each 'used_by' must be accompanied by a 'used_name'.
+	 */
+	nvdesc = NULL;
+	for (;;) {
+		nvwhat = nvlist_next_nvpair(stats, nvdesc);
+		nvdesc = nvlist_next_nvpair(stats, nvwhat);
+
+		if (nvwhat == NULL || nvdesc == NULL)
+			break;
+
+		assert(strcmp(nvpair_name(nvwhat), DM_USED_BY) == 0);
+		assert(strcmp(nvpair_name(nvdesc), DM_USED_NAME) == 0);
+
+		verify(nvpair_value_string(nvwhat, &what) == 0);
+		verify(nvpair_value_string(nvdesc, &desc) == 0);
+
+		/*
+		 * For currently mounted filesystems, filesystems in
+		 * /etc/vfstab, or dedicated dump devices, we can never use
+		 * them, even if '-f' is specified.  The rest of the errors
+		 * indicate that a filesystem was detected on disk, which can be
+		 * overridden with '-f'.
+		 */
+		if (strcmp(what, DM_USE_MOUNT) == 0 ||
+		    strcmp(what, DM_USE_VFSTAB) == 0 ||
+		    strcmp(what, DM_USE_DUMP) == 0) {
+			found = TRUE;
+			if (strcmp(what, DM_USE_MOUNT) == 0) {
+				vdev_error(gettext("%s is "
+				    "currently mounted on %s\n"),
+				    slicename, desc);
+			} else if (strcmp(what, DM_USE_VFSTAB) == 0) {
+				vdev_error(gettext("%s is usually "
+				    "mounted at %s in /etc/vfstab\n"),
+				    slicename, desc);
+			} else if (strcmp(what, DM_USE_DUMP) == 0) {
+				vdev_error(gettext("%s is the "
+				    "dedicated dump device\n"), slicename);
+			}
+		} else if (!force) {
+			found = TRUE;
+			if (strcmp(what, DM_USE_SVM) == 0) {
+				vdev_error(gettext("%s is part of "
+				    "SVM volume %s\n"), slicename, desc);
+			} else if (strcmp(what, DM_USE_LU) == 0) {
+				vdev_error(gettext("%s is in use "
+				    "for live upgrade %s\n"), slicename, desc);
+			} else if (strcmp(what, DM_USE_VXVM) == 0) {
+				vdev_error(gettext("%s is part of "
+				    "VxVM volume %s\n"), slicename, desc);
+			} else if (strcmp(what, DM_USE_FS) == 0) {
+				/*
+				 * We should have already caught ZFS in-use
+				 * filesystems above.  If the ZFS version is
+				 * different, or there was some other critical
+				 * failure, it's possible for fstyp to report it
+				 * as in-use, but zpool_open_by_dev() to fail.
+				 */
+				if (strcmp(desc, MNTTYPE_ZFS) != 0)
+					vdev_error(gettext("%s contains a %s "
+					    "filesystem\n"), slicename, desc);
+				else if (!found_zfs)
+					vdev_error(gettext("%s is part of an "
+					    "outdated or damaged ZFS "
+					    "pool\n"), slicename);
+			} else {
+				vdev_error(gettext("%s is used by %s as %s\n"),
+				    slicename, what, desc);
+			}
+		}
+	}
+
+	/*
+	 * Perform any overlap checking if requested to do so.
+	 */
+	if (overlap && !force)
+		found |= (check_overlapping(slicename, slice) != 0);
+
+	nvlist_free(stats);
+	return (found ? -1 : 0);
+}
+
+/*
+ * Validate a whole disk.  Iterate over all slices on the disk and make sure
+ * that none is in use by calling check_slice().  Returns -1 if the drive has
+ * no media or any slice is rejected, and 0 otherwise.
+ */
+int
+check_disk(const char *name, dm_descriptor_t disk, int force)
+{
+	dm_descriptor_t *drive, *media, *slice;
+	int err = 0;
+	int i;
+	int ret = 0;
+
+	/*
+	 * Get the drive associated with this disk.  This should never fail,
+	 * because we already have an alias handle open for the device.
+	 */
+	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
+	    &err)) == NULL || *drive == NULL)
+		libdskmgt_error(err);
+
+	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
+	    &err)) == NULL)
+		libdskmgt_error(err);
+	dm_free_descriptors(drive);
+
+	/*
+	 * It is possible that the user has specified a removable media drive,
+	 * and the media is not present.
+	 */
+	if (*media == NULL) {
+		vdev_error(gettext("'%s' has no media in drive\n"), name);
+		dm_free_descriptors(media);
+		return (-1);
+	}
+
+	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
+	    &err)) == NULL)
+		libdskmgt_error(err);
+	dm_free_descriptors(media);
+
+	/*
+	 * Iterate over all slices and report any errors.  We don't care about
+	 * overlapping slices because we are using the whole disk.
+	 */
+	for (i = 0; slice[i] != NULL; i++) {
+		char *slicename = dm_get_name(slice[i], &err);
+
+		if (slicename == NULL || err != 0)
+			libdskmgt_error(err);
+		if (check_slice(slicename, slice[i], force, FALSE) != 0)
+			ret = -1;
+		dm_free_name(slicename);
+	}
+
+	dm_free_descriptors(slice);
+	return (ret);
+}
+
+
+/*
+ * Validate a device.  Determines whether the device is a disk, slice, or
+ * partition, and passes it off to an appropriate function.
+ */
+int
+check_device(const char *path, int force)
+{
+	dm_descriptor_t desc;
+	int err, ret;
+	char *dev, rpath[MAXPATHLEN];
+
+	/* For whole disks, libdiskmgt does not include the leading dev path. */
+	dev = strrchr(path, '/');
+	assert(dev != NULL);
+	dev++;
+	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
+		ret = check_disk(path, desc, force);
+		dm_free_descriptor(desc);
+		return (ret);
+	}
+
+	/* Any error other than ENODEV is unexpected and therefore fatal. */
+	if (err != ENODEV)
+		libdskmgt_error(err);
+
+	/* Determine if this is a slice. */
+	if ((desc = dm_get_descriptor_by_name(DM_SLICE, (char *)path, &err))
+	    != NULL) {
+		ret = check_slice(path, desc, force, TRUE);
+		dm_free_descriptor(desc);
+		return (ret);
+	}
+
+	if (err != ENODEV)
+		libdskmgt_error(err);
+
+	/*
+	 * Check for a partition.  libdiskmgt expects path of /dev/rdsk when
+	 * dealing with partitions, so convert it.
+	 */
+	(void) snprintf(rpath, sizeof (rpath), "/dev/rdsk/%s", dev);
+	if ((desc = dm_get_descriptor_by_name(DM_PARTITION, rpath, &err))
+	    != NULL) {
+		/* XXZFS perform checking on partitions */
+		dm_free_descriptor(desc);
+		return (0);
+	}
+
+	if (err != ENODEV)
+		libdskmgt_error(err);
+
+	/*
+	 * At this point, libdiskmgt failed to find the device as either a whole
+	 * disk or a slice.  Ignore these errors, as we know that it is at least
+	 * a block device.  The user may have provided us with some unknown
+	 * device that libdiskmgt doesn't know about.
+	 */
+	return (0);
+}
+
+/*
+ * Validate a file vdev.  The only check possible is whether the file already
+ * belongs to a pool; a file that cannot be opened passes.  Returns 0 or -1.
+ */
+int
+check_file(const char *file, int force)
+{
+	char *state, *pool;
+	int result = 0;
+	int fd = open(file, O_RDONLY);
+
+	if (fd < 0)
+		return (0);
+
+	if (zpool_in_use(fd, &state, &pool)) {
+		/* '-f' overrides everything except an "active" pool. */
+		if (!force || strcmp(state, gettext("active")) == 0) {
+			vdev_error(gettext("%s is part of %s pool '%s'\n"),
+			    file, state, pool);
+			result = -1;
+		}
+
+		free(state);
+		free(pool);
+	}
+
+	(void) close(fd);
+	return (result);
+}
+
+static int
+is_whole_disk(const char *arg, struct stat64 *statbuf)
+{
+	char slice_path[MAXPATHLEN];
+
+	/* A whole disk is one whose 'arg' + BACKUP_SLICE path can be stat'ed. */
+	(void) snprintf(slice_path, sizeof (slice_path), "%s%s", arg,
+	    BACKUP_SLICE);
+
+	return (stat64(slice_path, statbuf) == 0 ? TRUE : FALSE);
+}
+
+/*
+ * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
+ * device, fill in the device id to make a complete nvlist.  Returns NULL after
+ * printing an error if 'arg' cannot be used.  Valid forms for a leaf vdev are:
+ *
+ * 	/dev/dsk/xxx	Complete disk path
+ * 	/xxx		Full path to file
+ * 	xxx		Shorthand for /dev/dsk/xxx
+ */
+nvlist_t *
+make_leaf_vdev(const char *arg)
+{
+	char path[MAXPATHLEN];
+	struct stat64 statbuf;
+	nvlist_t *vdev = NULL;
+	char *type = NULL;
+	int wholedisk = FALSE;
+
+	/*
+	 * Determine what type of vdev this is, and put the full path into
+	 * 'path'.  We detect whether this is a device or file afterwards by
+	 * checking the st_mode of the file.
+	 */
+	if (arg[0] == '/') {
+		/*
+		 * Complete device or file path.  Exact type is determined by
+		 * examining 'statbuf' afterwards (the backup slice's, if whole).
+		 */
+		if (is_whole_disk(arg, &statbuf)) {
+			wholedisk = TRUE;
+		} else if (stat64(arg, &statbuf) != 0) {
+			(void) fprintf(stderr,
+			    gettext("cannot open '%s': %s\n"),
+			    arg, strerror(errno));
+			return (NULL);
+		}
+
+		(void) strlcpy(path, arg, sizeof (path));
+	} else {
+		/*
+		 * This may be a short path for a device, or it could be total
+		 * gibberish.  Check to see if it's a known device in
+		 * /dev/dsk/.  As part of this check, see if we've been given
+		 * an entire disk (minus the slice number).
+		 */
+		(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT,
+		    arg);
+		if (is_whole_disk(path, &statbuf)) {
+			wholedisk = TRUE;
+		} else if (stat64(path, &statbuf) != 0) {
+			/*
+			 * If we got ENOENT, then the user gave us
+			 * gibberish, so try to direct them with a
+			 * reasonable error message.  Otherwise,
+			 * regurgitate strerror() since it's the best we
+			 * can do.
+			 */
+			if (errno == ENOENT) {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': no such "
+				    "device in %s\n"), arg, DISK_ROOT);
+				(void) fprintf(stderr,
+				    gettext("must be a full path or "
+				    "shorthand device name\n"));
+				return (NULL);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': %s\n"),
+				    path, strerror(errno));
+				return (NULL);
+			}
+		}
+	}
+
+	/*
+	 * Determine whether this is a device or a file.
+	 */
+	if (S_ISBLK(statbuf.st_mode)) {
+		type = VDEV_TYPE_DISK;
+	} else if (S_ISREG(statbuf.st_mode)) {
+		type = VDEV_TYPE_FILE;
+	} else {
+		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
+		    "block device or regular file\n"), path);
+		return (NULL);
+	}
+
+	/*
+	 * Finally, we have the complete device or file, and we know that it is
+	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
+	 * vdevs have a 'path' element, and devices also have a 'devid' element.
+	 */
+	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
+
+	/*
+	 * For a whole disk, defer getting its devid until after labeling it.
+	 */
+	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
+		/*
+		 * Get the devid for the device.
+		 */
+		int fd;
+		ddi_devid_t devid;
+		char *minor = NULL, *devid_str = NULL;
+
+		if ((fd = open(path, O_RDONLY)) < 0) {
+			(void) fprintf(stderr, gettext("cannot open '%s': "
+			    "%s\n"), path, strerror(errno));
+			nvlist_free(vdev);
+			return (NULL);
+		}
+
+		if (devid_get(fd, &devid) == 0) {
+			if (devid_get_minor_name(fd, &minor) == 0 &&
+			    (devid_str = devid_str_encode(devid, minor)) !=
+			    NULL) {
+				verify(nvlist_add_string(vdev,
+				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
+			}
+			if (devid_str != NULL)
+				devid_str_free(devid_str);
+			if (minor != NULL)
+				devid_str_free(minor);
+			devid_free(devid);
+		}
+		/* No devid is not an error: the vdev simply goes without one. */
+		(void) close(fd);
+	}
+
+	return (vdev);
+}
+
+/*
+ * Go through and verify the replication level of the pool is consistent.
+ * Performs the following checks:
+ *
+ * 	For the new spec, verifies that devices in mirrors and raidz are the
+ * 	same size.
+ *
+ * 	If the current configuration already has inconsistent replication
+ * 	levels, ignore any other potential problems in the new spec.
+ *
+ * 	Otherwise, make sure that the current spec (if there is one) and the new
+ * 	spec have consistent replication levels.
+ */
+typedef struct replication_level {
+	char	*type;		/* type string of the toplevel vdevs */
+	int	level;		/* children per toplevel vdev; 1 if a leaf */
+} replication_level_t;
+
+/*
+ * Given the root of a vdev tree, return the replication level shared by its
+ * toplevel vdevs: the vdev type plus the number of children (1 for a plain
+ * file or disk).  The caller frees the result; its 'type' is not a copy.
+ *
+ * NULL is returned if the config is inconsistent: a mirror or raidz mixes
+ * files with devices or devices of different sizes, or the toplevel vdevs
+ * differ in type or width.  If 'fatal' is set, each inconsistency found is
+ * reported through vdev_error(); otherwise we return at the first one.
+ */
+replication_level_t *
+get_replication(nvlist_t *nvroot, int fatal)
+{
+	nvlist_t **top;
+	uint_t t, toplevels;
+	nvlist_t **child;
+	uint_t c, children;
+	nvlist_t *nv;
+	char *type;
+	replication_level_t lastrep, rep, *ret;
+	int dontreport;
+
+	ret = safe_malloc(sizeof (replication_level_t));
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+
+	/* Give 'rep' a defined value in case there are no toplevel vdevs. */
+	rep.type = NULL;
+	rep.level = 0;
+	lastrep.type = NULL;
+	for (t = 0; t < toplevels; t++) {
+		nv = top[t];
+
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+		rep.type = type;
+
+		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+		    &child, &children) != 0) {
+			/* This is a 'file' or 'disk' vdev. */
+			rep.level = 1;
+		} else {
+			uint64_t vdev_size = -1ULL;
+			char *lasttype = NULL;
+
+			/*
+			 * This is a mirror or RAID-Z vdev.  Go through and make
+			 * sure the contents are all the same (files vs. disks),
+			 * keeping track of the number of elements in the
+			 * process.  We also check that the size of each vdev
+			 * (if it can be determined) is the same.
+			 */
+			rep.level = 0;
+
+			/*
+			 * The 'dontreport' variable indicates that we've
+			 * already reported an error for this spec, so don't
+			 * bother doing it again.
+			 */
+			dontreport = 0;
+			for (c = 0; c < children; c++) {
+				nvlist_t *cnv = child[c];
+				char *path;
+				struct stat64 statbuf;
+				uint64_t size;
+				char *childtype;
+				int fd, err;
+
+				rep.level++;
+
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_PATH, &path) == 0);
+
+				/*
+				 * If we have a raidz/mirror that combines disks
+				 * with files, report it as an error.
+				 */
+				if (!dontreport && lasttype != NULL &&
+				    strcmp(lasttype, childtype) != 0) {
+					free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "mismatched replication "
+						    "level: %s contains both "
+						    "files and devices\n"),
+						    rep.type);
+					else
+						return (NULL);
+					dontreport = TRUE;
+				}
+
+				/*
+				 * Record the type before the size check below
+				 * can 'continue', so that a child of unknown
+				 * size still takes part in this comparison.
+				 */
+				lasttype = childtype;
+
+				/*
+				 * According to stat(2), the value of 'st_size'
+				 * is undefined for block devices and character
+				 * devices.  But there is no effective way to
+				 * determine the real size in userland.
+				 * Instead, we'll take advantage of an
+				 * implementation detail of spec_size().  If the
+				 * device is currently open, then we (should)
+				 * return a valid size.
+				 * If we still don't get a valid size (indicated
+				 * by a size of 0 or MAXOFFSET_T), then leave
+				 * this device out of the size comparison.
+				 */
+				if ((fd = open(path, O_RDONLY)) >= 0) {
+					err = fstat64(fd, &statbuf);
+					(void) close(fd);
+				} else {
+					err = stat64(path, &statbuf);
+				}
+
+				if (err != 0 ||
+				    statbuf.st_size == 0 ||
+				    statbuf.st_size == MAXOFFSET_T)
+					continue;
+
+				size = statbuf.st_size;
+
+				/*
+				 * Also check the size of each device.  If they
+				 * differ, then report an error.
+				 */
+				if (!dontreport && vdev_size != -1ULL &&
+				    size != vdev_size) {
+					free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "%s contains devices of "
+						    "different sizes\n"),
+						    rep.type);
+					else
+						return (NULL);
+					dontreport = TRUE;
+				}
+
+				vdev_size = size;
+			}
+		}
+
+		/*
+		 * At this point, we have the replication of the last toplevel
+		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it's
+		 * different.
+		 */
+		if (lastrep.type != NULL) {
+			if (strcmp(lastrep.type, rep.type) != 0) {
+				free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication "
+					    "level: both %s and %s vdevs are "
+					    "present\n"),
+					    lastrep.type, rep.type);
+				else
+					return (NULL);
+			} else if (lastrep.level != rep.level) {
+				free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication "
+					    "level: %d-way %s and %d-way %s "
+					    "vdevs are present\n"),
+					    lastrep.level, lastrep.type,
+					    rep.level, rep.type);
+				else
+					return (NULL);
+			}
+		}
+		lastrep = rep;
+	}
+
+	if (ret != NULL)
+		*ret = rep;
+
+	return (ret);
+}
+
+/*
+ * Compare the replication level of a new vdev spec against that of the pool
+ * it is being added to.  The new spec must be self-consistent according to
+ * get_replication().  If the existing pool is itself inconsistent there is
+ * nothing meaningful to compare against, so the check passes trivially.
+ * Returns 0 on success and -1 if a problem was found.
+ */
+int
+check_replication(nvlist_t *config, nvlist_t *newroot)
+{
+	replication_level_t *pool_rep = NULL;
+	replication_level_t *spec_rep;
+	int mismatch;
+
+	/*
+	 * A NULL 'config' means there is no existing pool.  Otherwise fetch
+	 * the pool's replication level, without reporting anything.
+	 */
+	if (config != NULL) {
+		nvlist_t *poolroot;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &poolroot) == 0);
+		pool_rep = get_replication(poolroot, FALSE);
+		if (pool_rep == NULL)
+			return (0);
+	}
+
+	/*
+	 * Now the new spec; any inconsistency within it is reported to the
+	 * user and fails the check.
+	 */
+	spec_rep = get_replication(newroot, TRUE);
+	if (spec_rep == NULL) {
+		free(pool_rep);
+		return (-1);
+	}
+
+	/*
+	 * With both levels in hand, the type and the width have to agree.
+	 */
+	mismatch = (pool_rep != NULL &&
+	    (strcmp(pool_rep->type, spec_rep->type) != 0 ||
+	    pool_rep->level != spec_rep->level));
+	if (mismatch) {
+		vdev_error(gettext(
+		    "mismatched replication level: pool uses %d-way %s "
+		    "and new vdev uses %d-way %s\n"),
+		    pool_rep->level, pool_rep->type, spec_rep->level,
+		    spec_rep->type);
+	}
+
+	free(spec_rep);
+	free(pool_rep);
+
+	return (mismatch ? -1 : 0);
+}
+
+/*
+ * Label an individual disk.  The name provided is the short name, stripped of
+ * any leading /dev path.  Returns 0 on success, -1 (with a message) on failure.
+ */
+int
+label_disk(char *name)
+{
+	char path[MAXPATHLEN];
+	struct dk_gpt *vtoc;
+	int fd, err;
+	size_t resv = 16384;
+
+	(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
+	    BACKUP_SLICE);
+
+	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
+		/*
+		 * This shouldn't happen.  We've long since verified that this
+		 * is a valid device.
+		 */
+		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+		    path, strerror(errno));
+		return (-1);
+	}
+
+	if (efi_alloc_and_init(fd, 9, &vtoc) != 0) {
+		/*
+		 * The only way this can fail is if we run out of memory, or we
+		 * were unable to read the disk geometry.
+		 */
+		if (errno == ENOMEM)
+			no_memory();
+		(void) fprintf(stderr, gettext("cannot label '%s': unable to "
+		    "read disk geometry\n"), name);
+		(void) close(fd);
+		return (-1);
+	}
+
+	vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba;
+	vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 -
+	    vtoc->efi_first_u_lba - resv;
+
+	/*
+	 * Why we use V_USR: V_BACKUP confuses users, and is considered
+	 * disposable by some EFI utilities (since EFI doesn't have a backup
+	 * slice).  V_UNASSIGNED is supposed to be used only for zero size
+	 * partitions, and efi_write() will fail if we use it.  V_ROOT, V_BOOT,
+	 * etc. were all pretty specific.  V_USR is as close to reality as we
+	 * can get, in the absence of V_OTHER.
+	 */
+	vtoc->efi_parts[0].p_tag = V_USR;
+	(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
+
+	vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv;
+	vtoc->efi_parts[8].p_size = resv;
+	vtoc->efi_parts[8].p_tag = V_RESERVED;
+
+	/* Release the label and the device before looking at the result. */
+	err = efi_write(fd, vtoc);
+	efi_free(vtoc);
+	(void) close(fd);
+	if (err != 0) {
+		/*
+		 * Currently, EFI labels are not supported for IDE disks, and it
+		 * is likely that they will not be supported on other drives for
+		 * some time.  Print out a helpful error message directing the
+		 * user to manually label the disk and give a specific slice.
+		 */
+		(void) fprintf(stderr, gettext("cannot label '%s': failed to "
+		    "write EFI label\n"), name);
+		(void) fprintf(stderr, gettext("use fdisk(1M) to partition "
+		    "the disk, and provide a specific slice\n"));
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Walk the vdev specification and label every whole disk found in it.  The
+ * devid of a whole disk is not filled in when its leaf vdev is created; once
+ * the disk has been labelled we open slice 0, obtain the devid from it and add
+ * it to the vdev here.  Vdevs that are not whole disks are left untouched.
+ * Returns 0 on success, or -1 as soon as a disk cannot be labelled or its
+ * slice 0 cannot be opened.
+ */
+int
+make_disks(nvlist_t *nv)
+{
+	nvlist_t **kids;
+	uint_t i, nkids;
+	char *vtype, *devpath, *shortname;
+	char slice0[MAXPATHLEN];
+	struct stat64 sb;
+	ddi_devid_t devid;
+	char *minor_name = NULL;
+	char *encoded = NULL;
+	int fd;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vtype) == 0);
+
+	/*
+	 * An interior vdev: recurse into each child in turn, giving up at the
+	 * first failure.
+	 */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) == 0) {
+		for (i = 0; i < nkids; i++) {
+			int err = make_disks(kids[i]);
+
+			if (err != 0)
+				return (err);
+		}
+		return (0);
+	}
+
+	/*
+	 * A leaf vdev.  Only disks are of interest, and of those only the ones
+	 * that is_whole_disk() recognizes from the path stored in the vdev.
+	 */
+	if (strcmp(vtype, VDEV_TYPE_DISK) != 0)
+		return (0);
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &devpath) == 0);
+	if (!is_whole_disk(devpath, &sb))
+		return (0);
+
+	/* label_disk() wants the name without its leading directories. */
+	shortname = strrchr(devpath, '/');
+	assert(shortname != NULL);
+	if (label_disk(shortname + 1) != 0)
+		return (-1);
+
+	/*
+	 * The disk is labelled, so open slice 0 and fill in the devid.
+	 */
+	(void) snprintf(slice0, sizeof (slice0), "%ss0", devpath);
+	fd = open(slice0, O_RDONLY);
+	if (fd < 0) {
+		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+		    slice0, strerror(errno));
+		return (-1);
+	}
+
+	if (devid_get(fd, &devid) == 0) {
+		if (devid_get_minor_name(fd, &minor_name) == 0) {
+			encoded = devid_str_encode(devid, minor_name);
+			if (encoded != NULL) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_DEVID, encoded) == 0);
+				devid_str_free(encoded);
+			}
+		}
+		if (minor_name != NULL)
+			devid_str_free(minor_name);
+		devid_free(devid);
+	}
+
+	(void) close(fd);
+	return (0);
+}
+
+/*
+ * Go through and find any devices that are in use.  We rely on libdiskmgt for
+ * the majority of this task.  Returns the first non-zero result of
+ * check_device() or check_file(), or 0 if there is none.
+ */
+int
+check_in_use(nvlist_t *nv, int force)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	int ret = 0;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+
+		/* 'ret' stays 0 for a leaf that is neither disk nor file. */
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			ret = check_device(path, force);
+		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
+			ret = check_file(path, force);
+
+		return (ret);
+	}
+
+	for (c = 0; c < children; c++)
+		if ((ret = check_in_use(child[c], force)) != 0)
+			return (ret);
+
+	return (0);
+}
+
+/*
+ * Build the root vdev nvlist described by the command line arguments.  Each
+ * mirror or raidz keyword collects the arguments that follow it, up to the
+ * next such keyword, as its children; any other argument is a toplevel leaf.
+ * Every leaf is created by make_leaf_vdev().  Returns NULL on error.
+ * Note: we don't bother freeing anything in the error paths
+ * because the program is just going to exit anyway.
+ */
+nvlist_t *
+construct_spec(int argc, char **argv)
+{
+	nvlist_t *root, *vdev;
+	nvlist_t **toplist = NULL;
+	int ntop = 0;
+	int i;
+
+	while (argc > 0) {
+		int is_group = (strcmp(argv[0], VDEV_TYPE_MIRROR) == 0 ||
+		    strcmp(argv[0], VDEV_TYPE_RAIDZ) == 0);
+
+		vdev = NULL;
+		if (!is_group) {
+			/*
+			 * A plain device or file: make_leaf_vdev() builds the
+			 * nvlist that describes it.
+			 */
+			vdev = make_leaf_vdev(argv[0]);
+			if (vdev == NULL)
+				return (NULL);
+			argc--;
+			argv++;
+		} else {
+			/*
+			 * A mirror or raidz.  Its leaves are the arguments that
+			 * follow, up to the next mirror or raidz keyword.
+			 */
+			char *kind = argv[0];
+			nvlist_t **leaves = NULL;
+			int nleaves = 0;
+			int a;
+
+			for (a = 1; a < argc; a++) {
+				if (strcmp(argv[a], VDEV_TYPE_MIRROR) == 0 ||
+				    strcmp(argv[a], VDEV_TYPE_RAIDZ) == 0)
+					break;
+				leaves = realloc(leaves,
+				    (nleaves + 1) * sizeof (nvlist_t *));
+				if (leaves == NULL)
+					no_memory();
+				vdev = make_leaf_vdev(argv[a]);
+				if (vdev == NULL)
+					return (NULL);
+				leaves[nleaves++] = vdev;
+			}
+
+			argc -= a;
+			argv += a;
+
+			/*
+			 * Mirrors and RAID-Z devices require at least
+			 * two components.
+			 */
+			if (nleaves < 2) {
+				(void) fprintf(stderr,
+				    gettext("invalid vdev specification: "
+				    "%s requires at least 2 devices\n"), kind);
+				return (NULL);
+			}
+
+			verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+			verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
+			    kind) == 0);
+			verify(nvlist_add_nvlist_array(vdev,
+			    ZPOOL_CONFIG_CHILDREN, leaves, nleaves) == 0);
+
+			/* The leaf nvlists are released once added to 'vdev'. */
+			for (a = 0; a < nleaves; a++)
+				nvlist_free(leaves[a]);
+			free(leaves);
+		}
+
+		toplist = realloc(toplist, (ntop + 1) * sizeof (nvlist_t *));
+		if (toplist == NULL)
+			no_memory();
+		toplist[ntop++] = vdev;
+	}
+
+	/* Finally, create the root and add all toplevel vdevs to it. */
+	verify(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	verify(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    toplist, ntop) == 0);
+
+	for (i = 0; i < ntop; i++)
+		nvlist_free(toplist[i]);
+	free(toplist);
+
+	return (root);
+}
+
+/*
+ * Build and validate the vdev specification given by 'argc' and 'argv'.  The
+ * nvlist returned is well-formed, all of its devices exist, and they are not
+ * in use by any other known consumer.  Whole disks in it have been labelled.
+ *
+ * 'poolconfig' is the current configuration of the pool when devices are
+ * being added to an existing pool, and is used for the replication check; it
+ * is NULL for a new pool.  'check_rep' says whether to do that check at all.
+ * 'force' controls whether devices should be forcefully added, even if they
+ * appear in use.  On any failure NULL is returned.
+ */
+nvlist_t *
+make_root_vdev(nvlist_t *poolconfig, int force, int check_rep,
+    int argc, char **argv)
+{
+	nvlist_t *spec;
+
+	is_force = force;
+
+	/*
+	 * Start by constructing the vdev specification.  Once that succeeds
+	 * we know that the specification is valid, and that all of its
+	 * devices can be opened.
+	 */
+	spec = construct_spec(argc, argv);
+	if (spec == NULL)
+		return (NULL);
+
+	/*
+	 * Every device has to be checked for use by another subsystem.  This
+	 * happens even if 'force' is set, because there are some uses (such
+	 * as a dedicated dump device) that even '-f' cannot override.  A
+	 * non-zero result from check_in_use() fails the whole specification.
+	 */
+	if (check_in_use(spec, force) != 0)
+		goto fail;
+
+	/*
+	 * When asked to, check the replication level of the given vdevs and
+	 * report any errors found.  The existing pool spec, if any, is passed
+	 * along so that changes against its replication level are caught.
+	 */
+	if (check_rep && check_replication(poolconfig, spec) != 0)
+		goto fail;
+
+	/*
+	 * Finally, run through the specification and label any whole disks.
+	 */
+	if (make_disks(spec) != 0)
+		goto fail;
+
+	return (spec);
+
+fail:
+	nvlist_free(spec);
+	return (NULL);
+}
diff --git a/usr/src/cmd/ztest/Makefile b/usr/src/cmd/ztest/Makefile
new file mode 100644
index 000000000000..52e17eb41340
--- /dev/null
+++ b/usr/src/cmd/ztest/Makefile
@@ -0,0 +1,59 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG:sh=	basename `pwd`
+
+include ../Makefile.cmd
+
+$(INTEL_BLD)SUBDIRS	= $(MACH)
+$(BUILD64)SUBDIRS	+= $(MACH64)
+
+all	:=	TARGET = all
+install	:=	TARGET = install
+clean	:=	TARGET = clean
+clobber	:=	TARGET = clobber
+lint	:=	TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint:	$(SUBDIRS)
+
+#
+# This should really be $(LN) instead of $(CP), but protocmp detects link
+# inconsistencies between isaexec (which we ship) and ztest (which we don't).
+#
+install:	$(SUBDIRS)
+	-$(RM) $(ROOTPROG)
+	-$(CP) $(ISAEXEC) $(ROOTPROG)
+
+$(SUBDIRS):	FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/ztest/Makefile.com b/usr/src/cmd/ztest/Makefile.com
new file mode 100644
index 000000000000..c5695f003765
--- /dev/null
+++ b/usr/src/cmd/ztest/Makefile.com
@@ -0,0 +1,61 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PROG= ztest
+SRCS= ../$(PROG).c
+
+include ../../Makefile.cmd
+
+INCS += -I../../../lib/libzpool/common 
+INCS += -I../../../uts/common/fs/zfs 
+
+LDLIBS += -lumem -lzpool -lm -lnvpair
+
+C99MODE= -xc99=%all
+C99LMODE= -Xc99=%all
+CFLAGS += -g $(CCVERBOSE)
+CFLAGS64 += -g $(CCVERBOSE)
+CPPFLAGS += -D_LARGEFILE64_SOURCE=1 -D_REENTRANT $(INCS)
+
+# lint complains about unused _umem_* functions
+LINTFLAGS += -xerroff=E_NAME_DEF_NOT_USED2 
+LINTFLAGS64 += -xerroff=E_NAME_DEF_NOT_USED2  
+
+.KEEP_STATE:
+
+all: $(PROG)
+
+$(PROG): $(SRCS)
+	$(LINK.c) -o $(PROG) $(SRCS) $(LDLIBS)
+	$(POST_PROCESS)
+
+clean:
+
+lint:	lint_SRCS
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/ztest/amd64/Makefile b/usr/src/cmd/ztest/amd64/Makefile
new file mode 100644
index 000000000000..f259815d6ee3
--- /dev/null
+++ b/usr/src/cmd/ztest/amd64/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTPROG64)
diff --git a/usr/src/cmd/ztest/i386/Makefile b/usr/src/cmd/ztest/i386/Makefile
new file mode 100644
index 000000000000..8ca4d0b7637f
--- /dev/null
+++ b/usr/src/cmd/ztest/i386/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTPROG32)
diff --git a/usr/src/cmd/ztest/inc.flg b/usr/src/cmd/ztest/inc.flg
new file mode 100644
index 000000000000..bb65300ccae9
--- /dev/null
+++ b/usr/src/cmd/ztest/inc.flg
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+find_files "s.*" usr/src/uts/common/fs/zfs/sys
+echo_file usr/src/uts/common/sys/fs/zfs.h
diff --git a/usr/src/cmd/ztest/sparc/Makefile b/usr/src/cmd/ztest/sparc/Makefile
new file mode 100644
index 000000000000..8ca4d0b7637f
--- /dev/null
+++ b/usr/src/cmd/ztest/sparc/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTPROG32)
diff --git a/usr/src/cmd/ztest/sparcv9/Makefile b/usr/src/cmd/ztest/sparcv9/Makefile
new file mode 100644
index 000000000000..f259815d6ee3
--- /dev/null
+++ b/usr/src/cmd/ztest/sparcv9/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+install: all $(ROOTPROG64)
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
new file mode 100644
index 000000000000..ce870c5a119e
--- /dev/null
+++ b/usr/src/cmd/ztest/ztest.c
@@ -0,0 +1,3303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * The objective of this program is to provide a DMU/ZAP/SPA stress test
+ * that runs entirely in userland, is easy to use, and easy to extend.
+ *
+ * The overall design of the ztest program is as follows:
+ *
+ * (1) For each major functional area (e.g. adding vdevs to a pool,
+ *     creating and destroying datasets, reading and writing objects, etc)
+ *     we have a simple routine to test that functionality.  These
+ *     individual routines do not have to do anything "stressful".
+ *
+ * (2) We turn these simple functionality tests into a stress test by
+ *     running them all in parallel, with as many threads as desired,
+ *     and spread across as many datasets, objects, and vdevs as desired.
+ *
+ * (3) While all this is happening, we inject faults into the pool to
+ *     verify that self-healing data really works.
+ *
+ * (4) Every time we open a dataset, we change its checksum and compression
+ *     functions.  Thus even individual objects vary from block to block
+ *     in which checksum they use and whether they're compressed.
+ *
+ * (5) To verify that we never lose on-disk consistency after a crash,
+ *     we run the entire test in a child of the main process.
+ *     At random times, the child self-immolates with a SIGKILL.
+ *     This is the software equivalent of pulling the power cord.
+ *     The parent then runs the test again, using the existing
+ *     storage pool, as many times as desired.
+ *
+ * (6) To verify that we don't have future leaks or temporal incursions,
+ *     many of the functional tests record the transaction group number
+ *     as part of their data.  When reading old data, they verify that
+ *     the transaction group number is less than the current, open txg.
+ *     If you add a new test, please do this if applicable.
+ *
+ * When run with no arguments, ztest runs for about five minutes and
+ * produces no output if successful.  To get a little bit of information,
+ * specify -V.  To get more information, specify -VV, and so on.
+ *
+ * To turn this into an overnight stress test, use -T to specify run time.
+ *
+ * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
+ * to increase the pool capacity, fanout, and overall stress level.
+ *
+ * The -N(okill) option will suppress kills, so each child runs to completion.
+ * This can be useful when you're trying to distinguish temporal incursions
+ * from plain old race conditions.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/zap.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/poll.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_prop.h>
+#include <sys/refcount.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <umem.h>
+#include <dlfcn.h>
+#include <ctype.h>
+#include <math.h>
+#include <sys/fs/zfs.h>
+
+static char cmdname[] = "ztest";
+static char *zopt_pool = cmdname;	/* -p: pool name */
+
+static uint64_t zopt_vdevs = 5;		/* -v: number of vdevs */
+static uint64_t zopt_vdevtime;		/* zopt_time / zopt_vdevs */
+static int zopt_mirrors = 2;		/* -m: mirror copies */
+static int zopt_raidz = 4;		/* -r: raidz disks */
+static size_t zopt_vdev_size = SPA_MINDEVSIZE;	/* -s: size of each vdev */
+static int zopt_dirs = 7;		/* -d: datasets */
+static int zopt_threads = 23;		/* -t: threads */
+static uint64_t zopt_passtime = 60;	/* -P: 60 seconds per pass */
+static uint64_t zopt_killrate = 70;	/* -k: 70% kill rate */
+static int zopt_verbose = 0;		/* -V: incremented per occurrence */
+static int zopt_init = 1;		/* -i: init count; -E sets it to 0 */
+static char *zopt_dir = "/tmp";		/* -f: directory for vdev files */
+static uint64_t zopt_time = 300;	/* -T: 5 minutes total run time */
+static int zopt_maxfaults;		/* derived in process_options() */
+
+typedef struct ztest_args {
+	char		*za_pool;	/* pool name */
+	objset_t	*za_os;		/* objset under test */
+	zilog_t		*za_zilog;
+	thread_t	za_thread;
+	uint64_t	za_instance;	/* used in dataset/snapshot names */
+	uint64_t	za_random;
+	uint64_t	za_diroff;
+	uint64_t	za_diroff_shared;
+	hrtime_t	za_start;
+	hrtime_t	za_stop;
+	hrtime_t	za_kill;
+	traverse_handle_t *za_th;	/* read by ztest_blk_cb() */
+} ztest_args_t;
+
+typedef void ztest_func_t(ztest_args_t *);	/* every test's signature */
+
+/*
+ * Note: these aren't static because we want dladdr() to work.
+ */
+ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_write_parallel;
+ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_traverse;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_spa_create_destroy;
+ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_LUN_growth;
+ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_spa_rename;
+
+typedef struct ztest_info {
+	ztest_func_t	*zi_func;	/* test function */
+	uint64_t	*zi_interval;	/* run every *zi_interval seconds */
+	uint64_t	zi_calls;	/* per-pass count */
+	uint64_t	zi_call_time;	/* per-pass time */
+	uint64_t	zi_call_total;	/* cumulative total */
+	uint64_t	zi_call_target;	/* target cumulative total */
+} ztest_info_t;
+
+uint64_t zopt_always = 0;		/* all the time */
+uint64_t zopt_often = 1;		/* every second */
+uint64_t zopt_sometimes = 10;		/* every 10 seconds */
+uint64_t zopt_rarely = 60;		/* every 60 seconds */
+
+ztest_info_t ztest_info[] = {		/* counters start out zeroed */
+	{ ztest_dmu_read_write,			&zopt_always	},
+	{ ztest_dmu_write_parallel,		&zopt_always	},
+	{ ztest_dmu_object_alloc_free,		&zopt_always	},
+	{ ztest_zap,				&zopt_always	},
+	{ ztest_zap_parallel,			&zopt_always	},
+	{ ztest_traverse,			&zopt_often	},
+	{ ztest_dsl_prop_get_set,		&zopt_sometimes	},
+	{ ztest_dmu_objset_create_destroy,	&zopt_sometimes	},
+	{ ztest_dmu_snapshot_create_destroy,	&zopt_sometimes	},
+	{ ztest_spa_create_destroy,		&zopt_sometimes	},
+	{ ztest_fault_inject,			&zopt_sometimes	},
+	{ ztest_spa_rename,			&zopt_rarely	},
+	{ ztest_vdev_attach_detach,		&zopt_rarely	},
+	{ ztest_vdev_LUN_growth,		&zopt_rarely	},
+	{ ztest_vdev_add_remove,		&zopt_vdevtime	},
+	{ ztest_scrub,				&zopt_vdevtime	},
+};
+
+#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
+
+#define	ZTEST_SYNC_LOCKS	16
+
+/*
+ * Stuff we need to share writably between parent and child.
+ */
+typedef struct ztest_shared {
+	mutex_t		zs_vdev_lock;	/* serializes the vdev tests */
+	rwlock_t	zs_name_lock;	/* read-held by name-using tests */
+	uint64_t	zs_vdev_primaries; /* next vdev file index */
+	uint64_t	zs_enospc_count; /* bumped by ztest_record_enospc() */
+	hrtime_t	zs_start_time;
+	hrtime_t	zs_stop_time;
+	uint64_t	zs_alloc;
+	uint64_t	zs_space;
+	ztest_info_t	zs_info[ZTEST_FUNCS];
+	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
+	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
+} ztest_shared_t;
+
+typedef struct ztest_block_tag {
+	uint64_t	bt_objset;
+	uint64_t	bt_object;
+	uint64_t	bt_offset;
+	uint64_t	bt_txg;
+	uint64_t	bt_thread;
+	uint64_t	bt_seq;
+} ztest_block_tag_t;	/* its size is passed when claiming ZTEST_DIROBJ */
+
+static char ztest_dev_template[] = "%s/%s.%llua";	/* dir/pool.<n>a */
+static ztest_shared_t *ztest_shared;
+
+static int ztest_random_fd;		/* read by ztest_random() */
+static int ztest_dump_core = 1;		/* fatal() aborts when nonzero */
+
+extern uint64_t zio_gang_bang;		/* -g: gang block threshold */
+
+#define	ZTEST_DIROBJ		1	/* claimed in ztest_create_cb() */
+#define	ZTEST_MICROZAP_OBJ	2	/* claimed in ztest_create_cb() */
+#define	ZTEST_FATZAP_OBJ	3	/* claimed in ztest_create_cb() */
+
+#define	ZTEST_DIROBJ_BLOCKSIZE	(1 << 10)
+#define	ZTEST_DIRSIZE		256
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* value used for $UMEM_DEBUG */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* value used for $UMEM_LOGGING */
+}
+
+#define	FATAL_MSG_SZ	1024
+
+char *fatal_msg;
+
+/* Print a "ztest: "-prefixed message to stderr, then abort() or exit(3). */
+static void
+fatal(int do_perror, char *message, ...)
+{
+	va_list args;
+	int save_errno = errno;		/* before stdio can clobber it */
+	char buf[FATAL_MSG_SZ];
+
+	(void) fflush(stdout);
+	va_start(args, message);
+	(void) snprintf(buf, sizeof (buf), "ztest: ");
+	/* LINTED */
+	(void) vsnprintf(buf + strlen(buf), sizeof (buf) - strlen(buf),
+	    message, args);	/* bounded: long messages are truncated */
+	va_end(args);
+	if (do_perror)
+		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+		    ": %s", strerror(save_errno));
+	(void) fprintf(stderr, "%s\n", buf);
+	fatal_msg = buf;			/* to ease debugging */
+	if (ztest_dump_core)
+		abort();
+	exit(3);
+}
+
+/* Map a size suffix ("", "K", "MB", ...) to a shift count; fatal() if bad. */
+static int
+str2shift(const char *buf)
+{
+	const char *ends = "BKMGTPEZ";
+	const char *match;
+
+	if (buf[0] == '\0')
+		return (0);
+
+	/* ctype functions require an unsigned char value, not a plain char */
+	match = strchr(ends, toupper((unsigned char)buf[0]));
+	if (match == NULL)
+		fatal(0, "invalid bytes suffix: %s", buf);
+	if (buf[1] == '\0' ||
+	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0'))
+		return (10 * (int)(match - ends));
+	fatal(0, "invalid bytes suffix: %s", buf);
+	return (-1);
+}
+
+/* Parse "<number>[.<fraction>][suffix]" (e.g. "1.5G") into a uint64_t. */
+static uint64_t
+nicenumtoull(const char *buf)
+{
+	char *end;
+	uint64_t val = strtoull(buf, &end, 0);
+
+	if (end == buf) {
+		fatal(0, "bad numeric value: %s", buf);
+	} else if (end[0] == '.') {
+		double fval = strtod(buf, &end);
+		fval *= pow(2, str2shift(end));
+		if (fval >= (double)UINT64_MAX)	/* i.e. 2^64: will not fit */
+			fatal(0, "value too large: %s", buf);
+		val = (uint64_t)fval;
+	} else {
+		int shift = str2shift(end);
+		if (shift >= 64 || (val << shift) >> shift != val)
+			fatal(0, "value too large: %s", buf);
+		val <<= shift;
+	}
+	return (val);
+}
+
+/* Print the option summary, with current defaults, and exit(1). */
+static void
+usage(void)
+{
+	char nice_vdev_size[10];
+	char nice_gang_bang[10];
+
+	nicenum(zopt_vdev_size, nice_vdev_size);
+	nicenum(zio_gang_bang, nice_gang_bang);
+
+	(void) printf("Usage: %s\n"
+	    "\t[-v vdevs (default: %llu)]\n"
+	    "\t[-s size_of_each_vdev (default: %s)]\n"
+	    "\t[-m mirror_copies (default: %d)]\n"
+	    "\t[-r raidz_disks (default: %d)]\n"
+	    "\t[-d datasets (default: %d)]\n"
+	    "\t[-t threads (default: %d)]\n"
+	    "\t[-g gang_block_threshold (default: %s)]\n"
+	    "\t[-i initialize pool i times (default: %d)]\n"
+	    "\t[-k kill percentage (default: %llu%%)]\n"
+	    "\t[-p pool_name (default: %s)]\n"
+	    "\t[-f file directory for vdev files (default: %s)]\n"
+	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
+	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
+	    "\t[-I(mport)] (discover and import existing pools)\n"
+	    "\t[-T time] total run time (default: %llu sec)\n"
+	    "\t[-P passtime] time per pass (default: %llu sec)\n",
+	    cmdname,
+	    (u_longlong_t)zopt_vdevs,		/* -v */
+	    nice_vdev_size,			/* -s */
+	    zopt_mirrors,			/* -m */
+	    zopt_raidz,				/* -r */
+	    zopt_dirs,				/* -d */
+	    zopt_threads,			/* -t */
+	    nice_gang_bang,			/* -g */
+	    zopt_init,				/* -i */
+	    (u_longlong_t)zopt_killrate,	/* -k */
+	    zopt_pool,				/* -p */
+	    zopt_dir,				/* -f */
+	    (u_longlong_t)zopt_time,		/* -T */
+	    (u_longlong_t)zopt_passtime);	/* -P */
+	exit(1);
+}
+
+/* Random value in [0, range) from ztest_random_fd; 0 when range is 0. */
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t rnd;
+
+	if (range != 0) {
+		if (read(ztest_random_fd, &rnd, sizeof (rnd)) != sizeof (rnd))
+			fatal(1, "short read from /dev/urandom");
+		return (rnd % range);
+	}
+	return (0);
+}
+
+static void
+ztest_record_enospc(char *s)
+{
+	dprintf("ENOSPC doing: %s\n", (s != NULL) ? s : "<unknown>");
+	ztest_shared->zs_enospc_count += 1;	/* no lock taken here */
+}
+
+static void
+process_options(int argc, char **argv)
+{
+	int opt;
+	uint64_t value;
+	/* Parse argv into the zopt_* globals; usage() exits on bad input. */
+	/* By default, test gang blocks for blocks 32K and greater */
+	zio_gang_bang = 32 << 10;
+
+	while ((opt = getopt(argc, argv,
+	    "v:s:m:r:c:d:t:g:i:k:p:f:VEIT:P:S")) != EOF) {
+		value = 0;
+		switch (opt) {			/* pass 1: parse numeric arguments */
+		    case 'v':
+		    case 's':
+		    case 'm':
+		    case 'r':
+		    case 'c':		/* NOTE(review): no handler below */
+		    case 'd':
+		    case 't':
+		    case 'g':
+		    case 'i':
+		    case 'k':
+		    case 'T':
+		    case 'P':
+			value = nicenumtoull(optarg);
+		}
+		switch (opt) {			/* pass 2: store the option */
+		    case 'v':
+			zopt_vdevs = value;
+			break;
+		    case 's':
+			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
+			break;
+		    case 'm':
+			zopt_mirrors = value;
+			break;
+		    case 'r':
+			zopt_raidz = MAX(1, value);
+			break;
+		    case 'd':
+			zopt_dirs = MAX(1, value);
+			break;
+		    case 't':
+			zopt_threads = MAX(1, value);
+			break;
+		    case 'g':
+			zio_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
+			break;
+		    case 'i':
+			zopt_init = value;
+			break;
+		    case 'k':
+			zopt_killrate = value;
+			break;
+		    case 'p':
+			zopt_pool = strdup(optarg);
+			break;
+		    case 'f':
+			zopt_dir = strdup(optarg);
+			break;
+		    case 'V':
+			zopt_verbose++;
+			break;
+		    case 'E':
+			zopt_init = 0;
+			break;
+		    case 'T':
+			zopt_time = value;
+			break;
+		    case 'P':
+			zopt_passtime = MAX(1, value);
+			break;
+		    case '?':
+		    default:		/* also reached by -c, -I and -S */
+			usage();
+			break;
+		}
+	}
+	/* Derive zopt_vdevtime and zopt_maxfaults from the options above. */
+	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
+	zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz >= 2 ? 2 : 1) - 1;
+}
+
+/* Return a file vdev nvlist; if size != 0, create its backing file too. */
+static nvlist_t *
+make_vdev_file(size_t size)
+{
+	char dev_name[MAXPATHLEN];
+	int fd;
+	nvlist_t *file;
+
+	if (size == 0) {
+		(void) snprintf(dev_name, sizeof (dev_name), "%s",
+		    "/dev/bogus");
+	} else {
+		(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+		    zopt_dir, zopt_pool,
+		    (u_longlong_t)ztest_shared->zs_vdev_primaries++);
+
+		fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666);
+		if (fd == -1)
+			fatal(1, "can't open %s", dev_name);
+		if (ftruncate(fd, size) != 0)
+			fatal(1, "can't ftruncate %s", dev_name);
+		(void) close(fd);
+	}
+
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+
+	return (file);
+}
+
+/*
+ * Build a raidz vdev of r file children; a plain file vdev when r < 2.
+ */
+static nvlist_t *
+make_vdev_raidz(size_t size, int r)
+{
+	nvlist_t *nv, **kid;
+	size_t kidsz = r * sizeof (nvlist_t *);
+	int i;
+
+	if (r < 2)
+		return (make_vdev_file(size));
+
+	kid = umem_alloc(kidsz, UMEM_NOFAIL);
+	for (i = 0; i < r; i++)
+		kid[i] = make_vdev_file(size);
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_RAIDZ) == 0);
+	VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, kid, r) == 0);
+
+	for (i = 0; i < r; i++)
+		nvlist_free(kid[i]);
+	umem_free(kid, kidsz);
+
+	return (nv);
+}
+
+/*
+ * Build an m-way mirror of raidz vdevs; just one raidz vdev when m < 1.
+ */
+static nvlist_t *
+make_vdev_mirror(size_t size, int r, int m)
+{
+	nvlist_t *nv, **kid;
+	size_t kidsz = m * sizeof (nvlist_t *);
+	int i;
+
+	if (m < 1)
+		return (make_vdev_raidz(size, r));
+
+	kid = umem_alloc(kidsz, UMEM_NOFAIL);
+	for (i = 0; i < m; i++)
+		kid[i] = make_vdev_raidz(size, r);
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR) == 0);
+	VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, kid, m) == 0);
+
+	for (i = 0; i < m; i++)
+		nvlist_free(kid[i]);
+	umem_free(kid, kidsz);
+
+	return (nv);
+}
+
+/*
+ * Build a root vdev nvlist holding t top-level mirror vdevs.
+ */
+static nvlist_t *
+make_vdev_root(size_t size, int r, int m, int t)
+{
+	nvlist_t *nv, **kid;
+	size_t kidsz = t * sizeof (nvlist_t *);
+	int i;
+
+	ASSERT(t > 0);
+	kid = umem_alloc(kidsz, UMEM_NOFAIL);
+	for (i = 0; i < t; i++)
+		kid[i] = make_vdev_mirror(size, r, m);
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, kid, t) == 0);
+
+	for (i = 0; i < t; i++)
+		nvlist_free(kid[i]);
+	umem_free(kid, kidsz);
+
+	return (nv);
+}
+
+/* Give 'object' a random data block size and indirect block shift. */
+static void
+ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	int bs = SPA_MINBLOCKSHIFT +
+	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
+	int ibs = DN_MIN_INDBLKSHIFT +
+	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
+	int error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
+
+	if (error) {
+		char osname[300];
+		dmu_objset_name(os, osname);
+		fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
+		    osname, (u_longlong_t)object, 1 << bs, ibs, error);
+	}
+}
+
+/*
+ * Pick a random checksum function, retrying while the choice has ci_zbt
+ * set, and substituting ZIO_CHECKSUM_ON for ZIO_CHECKSUM_OFF.
+ */
+static uint8_t
+ztest_random_checksum(void)
+{
+	uint8_t cksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
+
+	while (zio_checksum_table[cksum].ci_zbt)
+		cksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
+
+	return (cksum == ZIO_CHECKSUM_OFF ? ZIO_CHECKSUM_ON : cksum);
+}
+
+static uint8_t
+ztest_random_compress(void)		/* random compression function */
+{
+	return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
+}
+
+typedef struct ztest_replay {
+	objset_t	*zr_os;		/* objset being replayed into */
+	uint64_t	zr_assign;	/* passed to dmu_tx_assign() */
+} ztest_replay_t;
+
+static int
+ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zr->zr_os;
+	dmu_tx_t *tx;
+	int error;
+	/* Replay a TX_CREATE record by re-claiming object lr_doid. */
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+	error = dmu_tx_assign(tx, zr->zr_assign);
+	if (error) {
+		dmu_tx_abort(tx);	/* could not assign; hand back error */
+		return (error);
+	}
+	/* lr_mode carries the DMU object type stored by ztest_log_create(). */
+	error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+	dmu_tx_commit(tx);
+
+	if (zopt_verbose >= 5) {
+		char osname[MAXNAMELEN];
+		dmu_objset_name(os, osname);
+		(void) printf("replay create of %s object %llu"
+		    " in txg %llu = %d\n",
+		    osname, (u_longlong_t)lr->lr_doid,
+		    (u_longlong_t)zr->zr_assign, error);
+	}
+
+	return (error);
+}
+
+static int
+ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zr->zr_os;
+	dmu_tx_t *tx;
+	int error;
+	/* Replay a TX_REMOVE record by freeing object lr_doid. */
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, zr->zr_assign);
+	if (error) {
+		dmu_tx_abort(tx);	/* could not assign; hand back error */
+		return (error);
+	}
+
+	error = dmu_object_free(os, lr->lr_doid, tx);
+	dmu_tx_commit(tx);	/* committed whether or not the free worked */
+
+	return (error);
+}
+
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+	NULL,			/* 0 no such transaction type */
+	ztest_replay_create,	/* TX_CREATE: re-claim the object */
+	NULL,			/* TX_MKDIR */
+	NULL,			/* TX_MKXATTR */
+	NULL,			/* TX_SYMLINK */
+	ztest_replay_remove,	/* TX_REMOVE: free the object */
+	NULL,			/* TX_RMDIR */
+	NULL,			/* TX_LINK */
+	NULL,			/* TX_RENAME */
+	NULL,			/* TX_WRITE */
+	NULL,			/* TX_TRUNCATE */
+	NULL,			/* TX_SETATTR */
+	NULL,			/* TX_ACL */
+};
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+void
+ztest_spa_create_destroy(ztest_args_t *za)
+{
+	int error;
+	spa_t *spa;
+	nvlist_t *nvroot;
+
+	/*
+	 * Attempt to create using a bad file.
+	 */
+	nvroot = make_vdev_root(0, 0, 0, 1);	/* size 0 => "/dev/bogus" */
+	error = spa_create("ztest_bad_file", nvroot, NULL);
+	nvlist_free(nvroot);
+	if (error != ENOENT)
+		fatal(0, "spa_create(bad_file) = %d", error);
+
+	/*
+	 * Attempt to create using a bad mirror.
+	 */
+	nvroot = make_vdev_root(0, 0, 2, 1);	/* two bogus mirror halves */
+	error = spa_create("ztest_bad_mirror", nvroot, NULL);
+	nvlist_free(nvroot);
+	if (error != ENOENT)
+		fatal(0, "spa_create(bad_mirror) = %d", error);
+
+	/*
+	 * Attempt to create an existing pool.  It shouldn't matter
+	 * what's in the nvroot; we should fail with EEXIST.
+	 */
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	nvroot = make_vdev_root(0, 0, 0, 1);
+	error = spa_create(za->za_pool, nvroot, NULL);
+	nvlist_free(nvroot);
+	if (error != EEXIST)
+		fatal(0, "spa_create(whatever) = %d", error);
+	/* While we hold an open reference, spa_destroy() must return EBUSY. */
+	error = spa_open(za->za_pool, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open() = %d", error);
+
+	error = spa_destroy(za->za_pool);
+	if (error != EBUSY)
+		fatal(0, "spa_destroy() = %d", error);
+
+	spa_close(spa, FTAG);
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Verify that vdev_add() works as expected.
+ */
+void
+ztest_vdev_add_remove(ztest_args_t *za)
+{
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	nvlist_t *nvroot;
+	int error;
+
+	if (zopt_verbose >= 6)
+		(void) printf("adding vdev\n");
+
+	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+	spa_config_enter(spa, RW_READER);
+	/* Restart file numbering just past the leaves already in the pool. */
+	ztest_shared->zs_vdev_primaries =
+	    spa->spa_root_vdev->vdev_children * leaves;
+
+	spa_config_exit(spa);
+	/* Build one new top-level vdev (creating its files) and add it. */
+	nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
+	error = spa_vdev_add(spa, nvroot);
+	nvlist_free(nvroot);
+
+	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+	/* ENOSPC is tolerated and counted; any other failure is fatal. */
+	if (error == ENOSPC)
+		ztest_record_enospc("spa_vdev_add");
+	else if (error != 0)
+		fatal(0, "spa_vdev_add() = %d", error);
+
+	if (zopt_verbose >= 6)
+		(void) printf("spa_vdev_add = %d, as expected\n", error);
+}
+
+/*
+ * Verify that we can attach and detach devices.
+ */
+void
+ztest_vdev_attach_detach(ztest_args_t *za)
+{
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd0, *vd1, *pvd;
+	nvlist_t *root, *file;
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	uint64_t leaf, top;
+	size_t size0, size1;
+	char path0[MAXPATHLEN], path1[MAXPATHLEN];
+	int replacing;
+	int error, expected_error;
+	int fd;
+
+	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+	spa_config_enter(spa, RW_READER);
+
+	/*
+	 * Decide whether to do an attach or a replace.
+	 */
+	replacing = ztest_random(2);
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	top = ztest_random(rvd->vdev_children);
+
+	/*
+	 * Pick a random leaf within it.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Generate the path to this leaf.  The filename will end with 'a'.
+	 * We'll alternate replacements with a filename that ends with 'b'.
+	 */
+	(void) snprintf(path0, sizeof (path0),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
+
+	bcopy(path0, path1, MAXPATHLEN);
+
+	/*
+	 * If the 'a' file isn't part of the pool, the 'b' file must be.
+	 */
+	if (vdev_lookup_by_path(rvd, path0) == NULL)
+		path0[strlen(path0) - 1] = 'b';
+	else
+		path1[strlen(path1) - 1] = 'b';
+
+	/*
+	 * Now path0 represents something that's already in the pool,
+	 * and path1 is the thing we'll try to attach.
+	 */
+	vd0 = vdev_lookup_by_path(rvd, path0);
+	vd1 = vdev_lookup_by_path(rvd, path1);
+	ASSERT(vd0 != NULL);
+	pvd = vd0->vdev_parent;
+
+
+	/*
+	 * Make size1 10/9, 10/10 or 10/11 of size0 (bigger, equal, smaller).
+	 * If it's smaller, the attach should fail.
+	 * If it's larger, and we're doing a replace,
+	 * we should get dynamic LUN growth when we're done.
+	 */
+	size0 = vd0->vdev_psize;
+	size1 = 10 * size0 / (9 + ztest_random(3));
+
+	/*
+	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+	 * unless it's a replace; in that case any non-replacing parent is OK.
+	 *
+	 * If vd1 is already part of the pool, it should fail with EBUSY.
+	 *
+	 * If vd1 is too small, it should fail with EOVERFLOW.
+	 */
+	if (pvd->vdev_ops != &vdev_mirror_ops &&
+	    pvd->vdev_ops != &vdev_root_ops &&
+	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
+		expected_error = ENOTSUP;
+	else if (vd1 != NULL)
+		expected_error = EBUSY;
+	else if (size1 < size0)
+		expected_error = EOVERFLOW;
+	else
+		expected_error = 0;
+
+	/*
+	 * If vd1 isn't already part of the pool, create it.
+	 */
+	if (vd1 == NULL) {
+		fd = open(path1, O_RDWR | O_CREAT | O_TRUNC, 0666);
+		if (fd == -1)
+			fatal(1, "can't open %s", path1);
+		if (ftruncate(fd, size1) != 0)
+			fatal(1, "can't ftruncate %s", path1);
+		(void) close(fd);
+	}
+
+	spa_config_exit(spa);
+
+	/*
+	 * Build the nvlist describing path1.
+	 */
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path1) == 0);
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    &file, 1) == 0);
+	/* Attach (or replace, when 'replacing' is set) path1 onto path0. */
+	error = spa_vdev_attach(spa, path0, root, replacing);
+
+	nvlist_free(file);
+	nvlist_free(root);
+
+	/*
+	 * If our parent was the replacing vdev, but the replace completed,
+	 * then instead of failing with ENOTSUP we may either succeed,
+	 * fail with ENODEV, or fail with EOVERFLOW.
+	 */
+	if (expected_error == ENOTSUP &&
+	    (error == 0 || error == ENODEV || error == EOVERFLOW))
+		expected_error = error;
+
+	if (error != expected_error) {
+		fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
+		    path0, path1, replacing, error, expected_error);
+	}
+
+	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+}
+
+/*
+ * Verify dynamic LUN growth by enlarging a random leaf vdev's backing file.
+ */
+void
+ztest_vdev_LUN_growth(ztest_args_t *za)
+{
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	char dev_name[MAXPATHLEN];
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	uint64_t vdev;
+	size_t fsize;
+	int fd;
+
+	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+
+	/*
+	 * Pick a random leaf vdev.
+	 */
+	spa_config_enter(spa, RW_READER);
+	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
+	spa_config_exit(spa);
+
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, (u_longlong_t)vdev);
+
+	if ((fd = open(dev_name, O_RDWR)) != -1) {
+		/*
+		 * Determine the size.
+		 */
+		fsize = lseek(fd, 0, SEEK_END);
+
+		/*
+		 * If it's less than 2x the original size, grow by around 3%.
+		 */
+		if (fsize < 2 * zopt_vdev_size) {
+			size_t newsize = fsize + ztest_random(fsize / 32);
+			(void) ftruncate(fd, newsize);	/* best effort */
+			if (zopt_verbose >= 6) {
+				(void) printf("%s grew from %lu to %lu bytes\n",
+				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
+			}
+		}
+		(void) close(fd);
+	}
+
+	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+}
+
+/* ARGSUSED */
+static void
+ztest_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+	/*
+	 * Create the directory object.
+	 */
+	VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
+	    DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
+	    DMU_OT_UINT64_OTHER, sizeof (ztest_block_tag_t), tx) == 0);
+	/* Claim ZTEST_MICROZAP_OBJ as a ZAP object. */
+	VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+	/* Claim ZTEST_FATZAP_OBJ as a ZAP object. */
+	VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+/* ARGSUSED */
+static void
+ztest_destroy_cb(char *name, void *arg)
+{
+	objset_t *os;
+	dmu_object_info_t doi;
+	int error;
+	/* Callback for dmu_objset_find(): sanity-check, then destroy 'name'. */
+	/*
+	 * Verify that the dataset contains a directory object.
+	 */
+	error = dmu_objset_open(name, DMU_OST_OTHER,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	ASSERT3U(error, ==, 0);
+	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+	ASSERT3U(error, ==, 0);
+	ASSERT3U(doi.doi_type, ==, DMU_OT_UINT64_OTHER);
+	ASSERT3S(doi.doi_physical_blks, >=, 0);
+	dmu_objset_close(os);
+
+	/*
+	 * Destroy the dataset.
+	 */
+	error = dmu_objset_destroy(name);
+	ASSERT3U(error, ==, 0);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+static uint64_t
+ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+{
+	itx_t *itx;
+	lr_create_t *lr;
+	size_t namesize;
+	char name[24];
+	/* Log a TX_CREATE record named "ZOBJ_<object>" plus random padding. */
+	(void) snprintf(name, sizeof (name), "ZOBJ_%llu", (u_longlong_t)object);
+	namesize = strlen(name) + 1;
+	/* Pad the record by up to ZIL_MAX_BLKSZ - 1 bytes past the name. */
+	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
+	    ztest_random(ZIL_MAX_BLKSZ));
+	lr = (lr_create_t *)&itx->itx_lr;
+	bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
+	lr->lr_doid = object;
+	lr->lr_foid = 0;
+	lr->lr_mode = mode;	/* replay hands this to dmu_object_claim() */
+	lr->lr_uid = 0;
+	lr->lr_gid = 0;
+	lr->lr_gen = dmu_tx_get_txg(tx);
+	lr->lr_crtime[0] = time(NULL);
+	lr->lr_crtime[1] = 0;
+	lr->lr_rdev = 0;
+	bcopy(name, (char *)(lr + 1), namesize);
+
+	return (zil_itx_assign(zilog, itx, tx));
+}
+
+#ifndef lint
+static uint64_t
+ztest_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t object)
+{
+	itx_t *itx;
+	lr_remove_t *lr;
+	size_t namesize;
+	char name[24];
+	/* Log a TX_REMOVE record named "ZOBJ_<object>" plus random padding. */
+	(void) snprintf(name, sizeof (name), "ZOBJ_%llu", (u_longlong_t)object);
+	namesize = strlen(name) + 1;
+	/* Pad the record by up to 7999 bytes past the name. */
+	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize +
+	    ztest_random(8000));
+	lr = (lr_remove_t *)&itx->itx_lr;
+	lr->lr_doid = object;
+	bcopy(name, (char *)(lr + 1), namesize);
+
+	return (zil_itx_assign(zilog, itx, tx));
+}
+#endif /* lint */
+
+void
+ztest_dmu_objset_create_destroy(ztest_args_t *za)
+{
+	int error;
+	objset_t *os;
+	char name[100];
+	int mode, basemode, expected_error;
+	zilog_t *zilog;
+	uint64_t seq;
+	uint64_t objects;
+	ztest_replay_t zr;
+	/* Each instance works on its own "<pool>/<pool>_temp_<n>" dataset. */
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	(void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
+	    (u_longlong_t)za->za_instance);
+
+	basemode = DS_MODE_LEVEL(za->za_instance);
+	if (basemode == DS_MODE_NONE)
+		basemode++;
+
+	/*
+	 * If this dataset exists from a previous run, process its replay log
+	 * half of the time.  If we don't replay it, then dmu_objset_destroy()
+	 * (invoked from ztest_destroy_cb() below) should just throw it away.
+	 */
+	if (ztest_random(2) == 0 &&
+	    dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_PRIMARY, &os) == 0) {
+		zr.zr_os = os;
+		zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL);
+		dmu_objset_close(os);
+	}
+
+	/*
+	 * There may be an old instance of the dataset we're about to
+	 * create lying around from a previous run.  If so, destroy it
+	 * and all of its snapshots.
+	 */
+	dmu_objset_find(name, ztest_destroy_cb, NULL, DS_FIND_SNAPSHOTS);
+
+	/*
+	 * Verify that the destroyed dataset is no longer in the namespace.
+	 */
+	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
+	if (error != ENOENT)
+		fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
+		    name, os);
+	/* NOTE(review): os printed above may be unset if the open failed. */
+	/*
+	 * Verify that we can create a new dataset.
+	 */
+	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, ztest_create_cb,
+	    NULL);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_create");
+			(void) rw_unlock(&ztest_shared->zs_name_lock);
+			return;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", name, error);
+	}
+
+	error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
+	if (error) {
+		fatal(0, "dmu_objset_open(%s) = %d", name, error);
+	}
+
+	/*
+	 * Open the intent log for it.
+	 */
+	zilog = zil_open(os, NULL);
+
+	/*
+	 * Put a random number of objects in there.
+	 */
+	objects = ztest_random(50);
+	seq = 0;
+	while (objects-- != 0) {
+		uint64_t object;
+		dmu_tx_t *tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);	/* skip this object */
+		} else {
+			object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+			    DMU_OT_NONE, 0, tx);
+			ztest_set_random_blocksize(os, object, tx);
+			seq = ztest_log_create(zilog, tx, object,
+			    DMU_OT_UINT64_OTHER);
+			dmu_write(os, object, 0, sizeof (name), name, tx);
+			dmu_tx_commit(tx);
+		}
+		if (ztest_random(5) == 0) {	/* one time in five */
+			zil_commit(zilog, seq, FSYNC);
+		}
+		if (ztest_random(5) == 0) {	/* one time in five */
+			error = zil_suspend(zilog);
+			if (error == 0) {
+				zil_resume(zilog);
+			}
+		}
+	}
+
+	/*
+	 * Verify that we cannot create an existing dataset.
+	 */
+	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, NULL, NULL);
+	if (error != EEXIST)
+		fatal(0, "created existing dataset, error = %d", error);
+
+	/*
+	 * Verify that multiple dataset opens are allowed, but only when
+	 * the new access mode is compatible with the base mode.
+	 * We use a mixture of typed and typeless opens, and when the
+	 * open succeeds, verify that the discovered type is correct.
+	 */
+	for (mode = DS_MODE_STANDARD; mode < DS_MODE_LEVELS; mode++) {
+		objset_t *os2;
+		error = dmu_objset_open(name, DMU_OST_OTHER, mode, &os2);
+		expected_error = (basemode + mode < DS_MODE_LEVELS) ? 0 : EBUSY;
+		if (error != expected_error)
+			fatal(0, "dmu_objset_open('%s') = %d, expected %d",
+			    name, error, expected_error);
+		if (error == 0)
+			dmu_objset_close(os2);
+	}
+
+	zil_close(zilog);
+	dmu_objset_close(os);
+	/* The dataset is fully closed now, so destroying it must succeed. */
+	error = dmu_objset_destroy(name);
+	if (error)
+		fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Verify snapshot destroy and create on "<osname>@<za_instance>".
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+{
+	int error;
+	objset_t *os = za->za_os;
+	char snapname[MAXNAMELEN + 32];	/* osname + '@' + 20 digits + NUL */
+	char osname[MAXNAMELEN];
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	dmu_objset_name(os, osname);
+	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
+	    (u_longlong_t)za->za_instance);
+
+	error = dmu_objset_destroy(snapname);	/* ENOENT: nothing to remove */
+	if (error != 0 && error != ENOENT)
+		fatal(0, "dmu_objset_destroy() = %d", error);
+	error = dmu_objset_create(snapname, DMU_OST_OTHER, NULL, NULL, NULL);
+	if (error == ENOSPC)
+		ztest_record_enospc("dmu_take_snapshot");
+	else if (error != 0 && error != EEXIST)	/* EEXIST is tolerated */
+		fatal(0, "dmu_take_snapshot() = %d", error);
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+#define	ZTEST_TRAVERSE_BLOCKS	1000
+/* Traverse callback: sanity-check each block visited by ztest_traverse(). */
+static int
+ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	ztest_args_t *za = arg;
+	zbookmark_t *zb = &bc->bc_bookmark;
+	blkptr_t *bp = &bc->bc_blkptr;
+	dnode_phys_t *dnp = bc->bc_dnode;
+	traverse_handle_t *th = za->za_th;
+	uint64_t size = BP_GET_LSIZE(bp);
+
+	ASSERT(dnp != NULL);
+
+	if (bc->bc_errno)
+		return (ERESTART);
+
+	/*
+	 * Once in a while, abort the traverse.  We only do this to odd
+	 * instance numbers to ensure that even ones can run to completion.
+	 */
+	if ((za->za_instance & 1) && ztest_random(10000) == 0)
+		return (EINTR);
+
+	if (bp->blk_birth == 0) {	/* hole: needs ADVANCE_HOLES */
+		ASSERT(th->th_advance & ADVANCE_HOLES);
+		return (0);
+	}
+
+	if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) &&
+	    bc == &th->th_cache[ZB_DN_CACHE][0]) {
+		ASSERT(bc->bc_data == NULL);	/* data was not read */
+		return (0);
+	}
+
+	ASSERT(bc->bc_data != NULL);
+
+	/*
+	 * Costly re-read/compare, gated on (za_random ^ th_callbacks) & 0xff.
+	 */
+	if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) {
+		void *xbuf = umem_alloc(size, UMEM_NOFAIL);
+		if (arc_tryread(spa, bp, xbuf) == 0) {
+			ASSERT(bcmp(bc->bc_data, xbuf, size) == 0);
+		}
+		umem_free(xbuf, size);
+	}
+
+	if (zb->zb_level > 0) {		/* sized by dn_indblkshift */
+		ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift);
+		return (0);
+	}
+
+	if (zb->zb_level == -1) {	/* sized as an objset_phys_t */
+		ASSERT3U(size, ==, sizeof (objset_phys_t));
+		return (0);
+	}
+
+	ASSERT(zb->zb_level == 0);	/* sized by dn_datablkszsec */
+	ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT);
+
+	return (0);
+}
+
+/*
+ * Verify that live pool traversal works, one bounded slice per call.
+ */
+void
+ztest_traverse(ztest_args_t *za)
+{
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	traverse_handle_t *th = za->za_th;
+	int rc, advance;
+	uint64_t cbstart, cblimit;
+
+	if (th == NULL) {	/* no traverse in progress: start a new one */
+		advance = 0;
+
+		if (ztest_random(2) == 0)
+			advance |= ADVANCE_PRE;
+
+		if (ztest_random(2) == 0)
+			advance |= ADVANCE_PRUNE;
+
+		if (ztest_random(2) == 0)
+			advance |= ADVANCE_DATA;
+
+		if (ztest_random(2) == 0)
+			advance |= ADVANCE_HOLES;
+
+		th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance,
+		    ZIO_FLAG_CANFAIL);
+
+		traverse_add_pool(th, 0, -1ULL);
+	}
+	/* Stop after roughly 100 (ADVANCE_DATA set) or 1000 callbacks. */
+	advance = th->th_advance;
+	cbstart = th->th_callbacks;
+	cblimit = cbstart + ((advance & ADVANCE_DATA) ? 100 : 1000);
+
+	while ((rc = traverse_more(th)) == EAGAIN && th->th_callbacks < cblimit)
+		continue;
+
+	if (zopt_verbose >= 5)
+		(void) printf("traverse %s%s%s%s %llu blocks to "
+		    "<%llu, %llu, %d, %llx>%s\n",
+		    (advance & ADVANCE_PRE) ? "pre" : "post",
+		    (advance & ADVANCE_PRUNE) ? "|prune" : "",
+		    (advance & ADVANCE_DATA) ? "|data" : "",
+		    (advance & ADVANCE_HOLES) ? "|holes" : "",
+		    (u_longlong_t)(th->th_callbacks - cbstart),
+		    (u_longlong_t)th->th_lastcb.zb_objset,
+		    (u_longlong_t)th->th_lastcb.zb_object,
+		    th->th_lastcb.zb_level,
+		    (u_longlong_t)th->th_lastcb.zb_blkid,
+		    rc == 0 ? " [done]" :
+		    rc == EINTR ? " [aborted]" :
+		    rc == EAGAIN ? "" :
+		    strerror(rc));
+
+	if (rc != EAGAIN) {	/* done, aborted, or failed: tear down */
+		if (rc != 0 && rc != EINTR)
+			fatal(0, "traverse_more(%p) = %d", th, rc);
+		traverse_fini(th);
+		za->za_th = NULL;	/* next call starts a new traverse */
+	}
+}
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	uint64_t batchobj, object, batchsize, endoff, temp;
+	int b, c, error, bonuslen;
+	dmu_object_info_t doi;
+	char osname[MAXNAMELEN];
+
+	dmu_objset_name(os, osname);
+
+	endoff = -8ULL;		/* the last 8 bytes of a 64-bit offset space */
+	batchsize = 2;		/* number of objects in each batch */
+
+	/*
+	 * Create a batch object if necessary, and record it in the directory.
+	 */
+	dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &batchobj);
+	if (batchobj == 0) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create a batch object");
+			dmu_tx_abort(tx);
+			return;
+		}
+		batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		ztest_set_random_blocksize(os, batchobj, tx);
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t), &batchobj, tx);
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Destroy the previous batch of objects.
+	 */
+	for (b = 0; b < batchsize; b++) {
+		dmu_read(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object);
+		if (object == 0)	/* empty batch slot */
+			continue;
+		/*
+		 * Read and validate contents.
+		 * Byte n of the bonus buffer should be n + bonuslen (mod 256).
+		 */
+		db = dmu_bonus_hold(os, object);
+
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(doi.doi_type == DMU_OT_UINT64_OTHER);
+		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
+		ASSERT3S(doi.doi_physical_blks, >=, 0);
+
+		dmu_buf_read(db);
+
+		bonuslen = db->db_size;
+
+		for (c = 0; c < bonuslen; c++) {
+			if (((uint8_t *)db->db_data)[c] !=
+			    (uint8_t)(c + bonuslen)) {
+				fatal(0,
+				    "bad bonus: %s, obj %llu, off %d: %u != %u",
+				    osname, object, c,
+				    ((uint8_t *)db->db_data)[c],
+				    (uint8_t)(c + bonuslen));
+			}
+		}
+
+		dmu_buf_rele(db);
+
+		/*
+		 * We expect the word at endoff to be our object number.
+		 */
+		dmu_read(os, object, endoff, sizeof (uint64_t), &temp);
+
+		if (temp != object) {
+			fatal(0, "bad data in %s, got %llu, expected %llu",
+			    osname, temp, object);
+		}
+
+		/*
+		 * Destroy old object and clear batch entry.
+		 */
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, batchobj,
+		    b * sizeof (uint64_t), sizeof (uint64_t));
+		dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("free object");
+			dmu_tx_abort(tx);
+			return;
+		}
+		error = dmu_object_free(os, object, tx);
+		if (error) {
+			fatal(0, "dmu_object_free('%s', %llu) = %d",
+			    osname, object, error);
+		}
+		object = 0;	/* value written below to clear the slot */
+
+		dmu_object_set_checksum(os, batchobj,
+		    ztest_random_checksum(), tx);
+		dmu_object_set_compress(os, batchobj,
+		    ztest_random_compress(), tx);
+
+		dmu_write(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object, tx);
+
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Before creating the new batch of objects, generate a bunch of churn.
+	 */
+	for (b = ztest_random(100); b > 0; b--) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("churn objects");
+			dmu_tx_abort(tx);
+			return;
+		}
+		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		ztest_set_random_blocksize(os, object, tx);
+		error = dmu_object_free(os, object, tx);
+		if (error) {
+			fatal(0, "dmu_object_free('%s', %llu) = %d",
+			    osname, object, error);
+		}
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Create a new batch of objects with randomly chosen
+	 * blocksizes and record them in the batch directory.
+	 */
+	for (b = 0; b < batchsize; b++) {
+		uint32_t va_blksize;
+		u_longlong_t va_nblocks;
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
+		    sizeof (uint64_t));
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create batchobj");
+			dmu_tx_abort(tx);
+			return;
+		}
+		bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
+
+		object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_PLAIN_OTHER, bonuslen, tx);
+
+		ztest_set_random_blocksize(os, object, tx);
+
+		dmu_object_set_checksum(os, object,
+		    ztest_random_checksum(), tx);
+		dmu_object_set_compress(os, object,
+		    ztest_random_compress(), tx);
+
+		dmu_write(os, batchobj, b * sizeof (uint64_t),
+		    sizeof (uint64_t), &object, tx);
+
+		/*
+		 * Write to both the bonus buffer and the regular data.
+		 */
+		db = dmu_bonus_hold(os, object);
+		ASSERT3U(bonuslen, ==, db->db_size);
+
+		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
+		ASSERT3S(va_nblocks, >=, 0);
+
+		dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * Fill the bonus buffer with the pattern that the destroy
+		 * loop above validates: byte c holds (uint8_t)(c + bonuslen).
+		 */
+		for (c = 0; c < db->db_size; c++)
+			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
+
+		dmu_buf_rele(db);
+
+		/*
+		 * Write to a large offset to increase indirection.
+		 */
+		dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
+
+		dmu_tx_commit(tx);
+	}
+}
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+typedef struct bufwad {
+	uint64_t	bw_index;	/* index (n + i) of this bufwad */
+	uint64_t	bw_txg;		/* txg in which it was last written */
+	uint64_t	bw_data;	/* random payload; zeroed when freed */
+} bufwad_t;
+
+typedef struct dmu_read_write_dir {
+	uint64_t	dd_packobj;	/* object number of packobj */
+	uint64_t	dd_bigobj;	/* object number of bigobj */
+	uint64_t	dd_chunk;	/* bigobj chunk size, in bytes */
+} dmu_read_write_dir_t;
+
+void
+ztest_dmu_read_write(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_read_write_dir_t dd;
+	dmu_tx_t *tx;
+	int freeit, error;
+	uint64_t i, n, s, txg;	/* i is 64-bit: it is printed with %llx */
+	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+	uint64_t packoff, packsize, bigoff, bigsize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 40;
+	int free_percent = 5;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is arbitrary. It doesn't have to be a power of two,
+	 * and it doesn't have any relation to the object blocksize.
+	 * The only requirement is that it can hold at least two bufwads.
+	 *
+	 * Normally, we write the bufwad to each of these locations.
+	 * However, free_percent of the time we instead write zeroes to
+	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
+	 * bigobj to packobj, we can verify that the DMU is correctly
+	 * tracking which parts of an object are allocated and free,
+	 * and that the contents of the allocated blocks are correct.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd);
+	if (dd.dd_chunk == 0) {
+		ASSERT(dd.dd_packobj == 0);
+		ASSERT(dd.dd_bigobj == 0);
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create r/w directory");
+			dmu_tx_abort(tx);
+			return;
+		}
+
+		dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
+		    DMU_OT_NONE, 0, tx);
+		dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+
+		ztest_set_random_blocksize(os, dd.dd_packobj, tx);
+		ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
+
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
+		    tx);
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Prefetch a random chunk of the big object.
+	 * Our aim here is to get some async reads in flight
+	 * for blocks that we may free below; the DMU should
+	 * handle this race correctly.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(2 * width - 1);
+	dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * dd.dd_chunk;
+	bigsize = s * dd.dd_chunk;
+
+	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+	/*
+	 * free_percent of the time, free a range of bigobj rather than
+	 * overwriting it.
+	 */
+	freeit = (ztest_random(100) < free_percent);
+
+	/*
+	 * Read the current contents of our objects.
+	 */
+	dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
+	dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+
+	/*
+	 * Get a tx for the mods to both packobj and bigobj.
+	 */
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+
+	if (freeit)
+		dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+	else
+		dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+
+	if (error) {
+		ztest_record_enospc("dmu r/w range");
+		dmu_tx_abort(tx);
+		umem_free(packbuf, packsize);
+		umem_free(bigbuf, bigsize);
+		return;
+	}
+
+	txg = dmu_tx_get_txg(tx);
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		if (freeit) {
+			bzero(pack, sizeof (bufwad_t));
+		} else {
+			pack->bw_index = n + i;
+			pack->bw_txg = txg;
+			pack->bw_data = 1 + ztest_random(-2ULL);
+		}
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+
+	/*
+	 * We've verified all the old bufwads, and made new ones.
+	 * Now write them out.
+	 */
+	dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+
+	if (freeit) {
+		if (zopt_verbose >= 6) {
+			(void) printf("freeing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_free_range(os, dd.dd_bigobj, bigoff, bigsize, tx);
+	} else {
+		if (zopt_verbose >= 6) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+	}
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Sanity check the stuff we just wrote.
+	 */
+	{
+		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+		dmu_read(os, dd.dd_packobj, packoff, packsize, packcheck);
+		dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigcheck);
+
+		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+		umem_free(packcheck, packsize);
+		umem_free(bigcheck, bigsize);
+	}
+
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+}
+
+void
+ztest_dmu_write_parallel(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	int i, b, error, do_free, bs;
+	uint64_t off, txg_how, txg;
+	mutex_t *lp;
+	char osname[MAXNAMELEN];
+	char iobuf[SPA_MAXBLOCKSIZE];	/* read-back buffer for dmu_sync() */
+	ztest_block_tag_t rbt, wbt;	/* tag read back / tag written */
+
+	dmu_objset_name(os, osname);
+	bs = ZTEST_DIROBJ_BLOCKSIZE;
+
+	/*
+	 * Have multiple threads write to large offsets in ZTEST_DIROBJ
+	 * to verify that having multiple threads writing to the same object
+	 * in parallel doesn't cause any trouble.
+	 * On occasion, free the block or write the bonus buffer instead.
+	 */
+	for (i = 0; i < 50; i++) {
+		b = ztest_random(ZTEST_SYNC_LOCKS);
+		lp = &ztest_shared->zs_sync_lock[b];
+
+		do_free = (ztest_random(4) == 0);
+
+		off = za->za_diroff_shared + ((uint64_t)b << SPA_MAXBLOCKSHIFT);
+
+		if (ztest_random(4) == 0) {
+			/*
+			 * Do the bonus buffer instead of a regular block.
+			 */
+			do_free = 0;
+			off = -1ULL;	/* marks a bonus-buffer pass */
+		}
+
+		tx = dmu_tx_create(os);
+
+		if (off == -1ULL)
+			dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
+		else if (do_free)
+			dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
+		else
+			dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
+
+		txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
+		error = dmu_tx_assign(tx, txg_how);
+		if (error) {
+			dmu_tx_abort(tx);
+			if (error == ERESTART) {
+				ASSERT(txg_how == TXG_NOWAIT);
+				txg_wait_open(dmu_objset_pool(os), 0);
+				continue;
+			}
+			ztest_record_enospc("dmu write parallel");
+			return;
+		}
+		txg = dmu_tx_get_txg(tx);
+
+		if (do_free) {
+			(void) mutex_lock(lp);
+			dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx);
+			(void) mutex_unlock(lp);
+			dmu_tx_commit(tx);
+			continue;
+		}
+
+		wbt.bt_objset = dmu_objset_id(os);
+		wbt.bt_object = ZTEST_DIROBJ;
+		wbt.bt_offset = off;
+		wbt.bt_txg = txg;
+		wbt.bt_thread = za->za_instance;
+
+		if (off == -1ULL) {	/* bonus buffer: check, then rewrite */
+			wbt.bt_seq = 0;
+			db = dmu_bonus_hold(os, ZTEST_DIROBJ);
+			ASSERT3U(db->db_size, ==, sizeof (wbt));
+			dmu_buf_read(db);
+			bcopy(db->db_data, &rbt, db->db_size);
+			if (rbt.bt_objset != 0) {	/* an earlier tag */
+				ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
+				ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
+				ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
+				ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
+			}
+			dmu_buf_will_dirty(db, tx);
+			bcopy(&wbt, db->db_data, db->db_size);
+			dmu_buf_rele(db);
+			dmu_tx_commit(tx);
+			continue;
+		}
+
+		(void) mutex_lock(lp);
+
+		wbt.bt_seq = ztest_shared->zs_seq[b]++;
+
+		dmu_write(os, ZTEST_DIROBJ, off, sizeof (wbt), &wbt, tx);
+
+		(void) mutex_unlock(lp);
+
+		if (ztest_random(100) == 0)
+			(void) poll(NULL, 0, 1); /* open dn_notxholds window */
+
+		dmu_tx_commit(tx);
+
+		if (ztest_random(1000) == 0)
+			txg_wait_synced(dmu_objset_pool(os), txg);
+
+		if (ztest_random(2) == 0) {	/* half the time: dmu_sync() */
+			blkptr_t blk = { 0 };
+			uint64_t blkoff;
+
+			txg_suspend(dmu_objset_pool(os));
+			(void) mutex_lock(lp);
+			error = dmu_sync(os, ZTEST_DIROBJ, off, &blkoff, &blk,
+			    txg);
+			(void) mutex_unlock(lp);
+			if (error) {
+				txg_resume(dmu_objset_pool(os));
+				dprintf("dmu_sync(%s, %d, %llx) = %d\n",
+				    osname, ZTEST_DIROBJ, off, error);
+				continue;
+			}
+
+			if (blk.blk_birth == 0)	{	/* concurrent free */
+				txg_resume(dmu_objset_pool(os));
+				continue;
+			}
+
+			ASSERT(blk.blk_fill == 1);
+			ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
+			ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
+			ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
+
+			/*
+			 * Read the block that dmu_sync() returned to
+			 * make sure its contents match what we wrote.
+			 * We do this while still txg_suspend()ed to ensure
+			 * that the block can't be reused before we read it.
+			 */
+			error = zio_wait(zio_read(NULL, dmu_objset_spa(os),
+			    &blk, iobuf, bs, NULL, NULL,
+			    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED));
+			ASSERT(error == 0);
+
+			txg_resume(dmu_objset_pool(os));
+
+			bcopy(&iobuf[blkoff], &rbt, sizeof (rbt));
+
+			ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
+			ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
+			ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
+
+			/*
+			 * The semantic of dmu_sync() is that we always
+			 * push the most recent version of the data,
+			 * so in the face of concurrent updates we may
+			 * see a newer version of the block.  That's OK.
+			 */
+			ASSERT3U(rbt.bt_txg, >=, wbt.bt_txg);
+			if (rbt.bt_thread == wbt.bt_thread)
+				ASSERT3U(rbt.bt_seq, ==, wbt.bt_seq);
+			else
+				ASSERT3U(rbt.bt_seq, >, wbt.bt_seq);
+		}
+	}
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */
+#define	ZTEST_ZAP_MIN_INTS	1
+#define	ZTEST_ZAP_MAX_INTS	4
+#define	ZTEST_ZAP_MAX_PROPS	1000
+
+void
+ztest_zap(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	uint64_t object;
+	uint64_t txg, last_txg;
+	uint64_t value[ZTEST_ZAP_MAX_INTS];
+	uint64_t zl_ints, zl_intsize, prop;
+	int i, ints;
+	int iters = 100;
+	dmu_tx_t *tx;
+	char propname[100], txgname[100];
+	int error;
+	char osname[MAXNAMELEN];
+	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
+
+	dmu_objset_name(os, osname);
+
+	/*
+	 * Create a new object if necessary, and record it in the directory.
+	 */
+	dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &object);
+
+	if (object == 0) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t));
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 2);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create zap test obj");
+			dmu_tx_abort(tx);
+			return;
+		}
+		object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
+		if (object == 0) {	/* zap_create() returns no errno */
+			fatal(0, "zap_create('%s', %llu) = %d",
+			    osname, object, error);
+		}
+		ASSERT(object != 0);
+		dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
+		    sizeof (uint64_t), &object, tx);
+		/*
+		 * Generate a known hash collision, and verify that
+		 * we can lookup and remove both entries.
+		 */
+		for (i = 0; i < 2; i++) {
+			value[i] = i;
+			error = zap_add(os, object, hc[i], sizeof (uint64_t),
+			    1, &value[i], tx);
+			ASSERT3U(error, ==, 0);
+		}
+		for (i = 0; i < 2; i++) {
+			error = zap_add(os, object, hc[i], sizeof (uint64_t),
+			    1, &value[i], tx);
+			ASSERT3U(error, ==, EEXIST);
+			error = zap_length(os, object, hc[i],
+			    &zl_intsize, &zl_ints);
+			ASSERT3U(error, ==, 0);
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, 1);
+		}
+		for (i = 0; i < 2; i++) {
+			error = zap_remove(os, object, hc[i], tx);
+			ASSERT3U(error, ==, 0);
+		}
+
+		dmu_tx_commit(tx);
+	}
+
+	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
+
+	while (--iters >= 0) {
+		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+		bzero(value, sizeof (value));
+		last_txg = 0;
+
+		/*
+		 * If these zap entries already exist, validate their contents.
+		 */
+		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+		if (error == 0) {
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, 1);
+
+			error = zap_lookup(os, object, txgname, zl_intsize,
+			    zl_ints, &last_txg);
+
+			ASSERT3U(error, ==, 0);
+
+			error = zap_length(os, object, propname, &zl_intsize,
+			    &zl_ints);
+
+			ASSERT3U(error, ==, 0);
+			ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+			ASSERT3U(zl_ints, ==, ints);
+
+			error = zap_lookup(os, object, propname, zl_intsize,
+			    zl_ints, value);
+
+			ASSERT3U(error, ==, 0);
+
+			for (i = 0; i < ints; i++) {
+				ASSERT3U(value[i], ==, last_txg + object + i);
+			}
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+
+		/*
+		 * Atomically update two entries in our zap object.
+		 * The first is named txg_%llu, and contains the txg
+		 * in which the property was last updated.  The second
+		 * is named prop_%llu, and the nth element of its value
+		 * should be txg + object + n.
+		 */
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, 2);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("create zap entry");
+			dmu_tx_abort(tx);
+			return;
+		}
+		txg = dmu_tx_get_txg(tx);
+
+		if (last_txg > txg)
+			fatal(0, "zap future leak: old %llu new %llu",
+			    last_txg, txg);
+
+		for (i = 0; i < ints; i++)
+			value[i] = txg + object + i;
+
+		error = zap_update(os, object, txgname, sizeof (uint64_t),
+		    1, &txg, tx);
+		if (error)
+			fatal(0, "zap_update('%s', %llu, '%s') = %d",
+			    osname, object, txgname, error);
+
+		error = zap_update(os, object, propname, sizeof (uint64_t),
+		    ints, value, tx);
+		if (error)
+			fatal(0, "zap_update('%s', %llu, '%s') = %d",
+			    osname, object, propname, error);
+
+		dmu_tx_commit(tx);
+
+		/*
+		 * Remove a random pair of entries.
+		 */
+		prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+		(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+		(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+
+		error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+		if (error == ENOENT)	/* nothing to remove this time */
+			continue;
+
+		ASSERT3U(error, ==, 0);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, 2);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			ztest_record_enospc("remove zap entry");
+			dmu_tx_abort(tx);
+			return;
+		}
+		error = zap_remove(os, object, txgname, tx);
+		if (error)
+			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+			    osname, object, txgname, error);
+
+		error = zap_remove(os, object, propname, tx);
+		if (error)
+			fatal(0, "zap_remove('%s', %llu, '%s') = %d",
+			    osname, object, propname, error);
+
+		dmu_tx_commit(tx);
+	}
+
+	/*
+	 * Once in a while, destroy the object.
+	 */
+	if (ztest_random(100) != 0)
+		return;
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		ztest_record_enospc("destroy zap object");
+		dmu_tx_abort(tx);
+		return;
+	}
+	error = zap_destroy(os, object, tx);
+	if (error)
+		fatal(0, "zap_destroy('%s', %llu) = %d",
+		    osname, object, error);
+	object = 0;	/* clear the directory entry so it is re-created */
+	dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
+	    &object, tx);
+	dmu_tx_commit(tx);
+}
+
+void
+ztest_zap_parallel(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+	int iters = 100;
+	dmu_tx_t *tx;
+	int i, namelen, error;
+	char name[20], string_value[20];
+	void *data;
+
+	while (--iters >= 0) {	/* 100 random ops on the two shared zaps */
+		/*
+		 * Generate a random name of the form 'xxx.....' where each
+		 * x is a random printable character and the dots are dots.
+		 * There are 94 such characters, and the name length goes from
+		 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+		 */
+		namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+		for (i = 0; i < 3; i++)
+			name[i] = '!' + ztest_random('~' - '!' + 1);
+		for (; i < namelen - 1; i++)
+			name[i] = '.';
+		name[i] = '\0';		/* namelen counts this terminator */
+
+		if (ztest_random(2) == 0)
+			object = ZTEST_MICROZAP_OBJ;
+		else
+			object = ZTEST_FATZAP_OBJ;
+
+		if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+			wsize = sizeof (txg);	/* value: one uint64_t, txg */
+			wc = 1;
+			data = &txg;
+		} else {
+			wsize = 1;	/* value: the name itself, as bytes */
+			wc = namelen;
+			data = string_value;
+		}
+
+		count = -1ULL;		/* sentinel: must be overwritten */
+		VERIFY(zap_count(os, object, &count) == 0);
+		ASSERT(count != -1ULL);
+
+		/*
+		 * Select an operation: length, lookup, add, update, remove.
+		 */
+		i = ztest_random(5);
+
+		if (i >= 2) {		/* add, update, remove use a tx */
+			tx = dmu_tx_create(os);
+			dmu_tx_hold_zap(tx, object, 1);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error) {
+				ztest_record_enospc("zap parallel");
+				dmu_tx_abort(tx);
+				return;
+			}
+			txg = dmu_tx_get_txg(tx);
+			bcopy(name, string_value, namelen);
+		} else {		/* length and lookup run without one */
+			tx = NULL;
+			txg = 0;
+			bzero(string_value, namelen);
+		}
+
+		switch (i) {
+
+		case 0:		/* length */
+			error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+			if (error == 0) {
+				ASSERT3U(wsize, ==, zl_wsize);
+				ASSERT3U(wc, ==, zl_wc);
+			} else {
+				ASSERT3U(error, ==, ENOENT);
+			}
+			break;
+
+		case 1:		/* lookup */
+			error = zap_lookup(os, object, name, wsize, wc, data);
+			if (error == 0) {
+				if (data == string_value &&
+				    bcmp(name, data, namelen) != 0)
+					fatal(0, "name '%s' != val '%s' len %d",
+					    name, data, namelen);
+			} else {
+				ASSERT3U(error, ==, ENOENT);
+			}
+			break;
+
+		case 2:		/* add */
+			error = zap_add(os, object, name, wsize, wc, data, tx);
+			ASSERT(error == 0 || error == EEXIST);
+			break;
+
+		case 3:		/* update */
+			VERIFY(zap_update(os, object, name, wsize, wc,
+			    data, tx) == 0);
+			break;
+
+		case 4:		/* remove */
+			error = zap_remove(os, object, name, tx);
+			ASSERT(error == 0 || error == ENOENT);
+			break;
+		}
+
+		if (tx != NULL)
+			dmu_tx_commit(tx);
+	}
+}
+
+void
+ztest_dsl_prop_get_set(ztest_args_t *za)
+{
+	objset_t *os = za->za_os;
+	int i, inherit;
+	uint64_t value;
+	const char *prop, *valname;
+	char setpoint[MAXPATHLEN];
+	char osname[MAXNAMELEN];
+
+	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	/* Set a random checksum and compression, reading each one back. */
+	dmu_objset_name(os, osname);
+
+	for (i = 0; i < 2; i++) {	/* 0: checksum, 1: compression */
+		if (i == 0) {
+			prop = "checksum";
+			value = ztest_random_checksum();
+			inherit = (value == ZIO_CHECKSUM_INHERIT);
+		} else {
+			prop = "compression";
+			value = ztest_random_compress();
+			inherit = (value == ZIO_COMPRESS_INHERIT);
+		}
+
+		VERIFY3U(dsl_prop_set(osname, prop, sizeof (value),
+		    !inherit, &value), ==, 0);	/* passes 0 to inherit */
+
+		VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
+		    1, &value, setpoint), ==, 0);
+
+		if (i == 0)	/* map the value read back to its name */
+			valname = zio_checksum_table[value].ci_name;
+		else
+			valname = zio_compress_table[value].ci_name;
+
+		if (zopt_verbose >= 6) {
+			(void) printf("%s %s = %s for '%s'\n",
+			    osname, prop, valname, setpoint);
+		}
+	}
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+/*
+ * Inject random faults into the on-disk data.
+ */
+void
+ztest_fault_inject(ztest_args_t *za)
+{
+	int fd;
+	uint64_t offset;
+	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
+	uint64_t bad = 0x1990c0ffeedecade;
+	uint64_t top, leaf;
+	char path0[MAXPATHLEN];
+	char path1[MAXPATHLEN];
+	char pathrand[MAXPATHLEN];
+	size_t fsize;
+	spa_t *spa = dmu_objset_spa(za->za_os);
+	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int iters = 1000;
+	int ftype;
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	spa_config_enter(spa, RW_READER);
+	top = ztest_random(spa->spa_root_vdev->vdev_children);
+	spa_config_exit(spa);
+
+	/*
+	 * Pick a random leaf.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Generate paths to the first two leaves in this top-level vdev,
+	 * and to the random leaf we selected.  We'll induce transient
+	 * faults on leaves 0 and 1, we'll online/offline leaf 1,
+	 * and we'll write random garbage to the randomly chosen leaf.
+	 */
+	(void) snprintf(path0, sizeof (path0),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 0);
+	(void) snprintf(path1, sizeof (path1),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + 1);
+	(void) snprintf(pathrand, sizeof (pathrand),
+	    ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf);
+
+	if (leaves < 2)			/* there is no second leaf */
+		path1[0] = '\0';
+
+	dprintf("damaging %s, %s, and %s\n", path0, path1, pathrand);
+
+	/*
+	 * If we have exactly one-fault tolerance, just randomly offline
+	 * and online one device.
+	 */
+	if (zopt_maxfaults == 1 && path1[0] != '\0') {
+		if (ztest_random(10) < 6)
+			(void) vdev_offline(spa, path1);
+		else
+			(void) vdev_online(spa, path1);
+		return;
+	}
+
+	/*
+	 * Always inject a little random device failure, regardless of
+	 * the replication level.  The I/Os should be retried successfully.
+	 * If we only have single-fault tolerance, don't inject write
+	 * faults, because then we'll be doing partial writes and won't
+	 * be able to recover when we inject data corruption.
+	 */
+	if (zopt_maxfaults <= 1)
+		ftype = (1U << ZIO_TYPE_READ);
+	else
+		ftype = (1U << ZIO_TYPE_READ) | (1U << ZIO_TYPE_WRITE);
+
+	(void) vdev_error_setup(spa, path0, VDEV_FAULT_COUNT, ftype, 10);
+
+	/*
+	 * If we can tolerate three or more faults, make one of the
+	 * devices fail quite a lot.
+	 */
+	if (zopt_maxfaults >= 3 && path1[0] != '\0')
+		(void) vdev_error_setup(spa, path1, VDEV_FAULT_COUNT,
+		    ftype, 100);
+
+	/*
+	 * If we can tolerate four or more faults, offline one of the devices.
+	 */
+	if (zopt_maxfaults >= 4 && path1[0] != '\0') {
+		if (ztest_random(10) < 6)
+			(void) vdev_offline(spa, path1);
+		else
+			(void) vdev_online(spa, path1);
+	}
+
+	/*
+	 * If we have at least single-fault tolerance, inject data corruption.
+	 */
+	if (zopt_maxfaults < 1)
+		return;
+
+	fd = open(pathrand, O_RDWR);
+
+	if (fd == -1)	/* we hit a gap in the device namespace */
+		return;
+
+	fsize = lseek(fd, 0, SEEK_END);
+
+	while (--iters != 0) {
+		offset = ztest_random(fsize / (leaves << bshift)) *
+		    (leaves << bshift) + (leaf << bshift) +
+		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
+
+		if (offset >= fsize)
+			continue;
+
+		if (zopt_verbose >= 6)
+			(void) printf("injecting bad word into %s,"
+			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+
+		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+			fatal(1, "can't inject bad word at 0x%llx in %s",
+			    offset, pathrand);
+	}
+
+	(void) close(fd);
+}
+
+/* Call vdev_error_setup() on every vdev at or below vd that has a path. */
+static void
+ztest_error_setup(vdev_t *vd, int mode, int mask, uint64_t arg)
+{
+	int child;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		ztest_error_setup(vd->vdev_child[child], mode, mask, arg);
+	if (vd->vdev_path == NULL)
+		return;
+	(void) vdev_error_setup(vd->vdev_spa, vd->vdev_path, mode, mask, arg);
+}
+
+/*
+ * Scrub the pool: start a scrub, then start another one a second later.
+ */
+void
+ztest_scrub(ztest_args_t *za)
+{
+	spa_t *pool = dmu_objset_spa(za->za_os);
+
+	(void) spa_scrub(pool, POOL_SCRUB_EVERYTHING, B_FALSE);
+	(void) poll(NULL, 0, 1000);	/* sleep 1000ms before the restart */
+	(void) spa_scrub(pool, POOL_SCRUB_EVERYTHING, B_FALSE);
+}
+
+/*
+ * Rename the pool to "<pool>_tmp" and then rename it back, checking that
+ * it opens only under its current name and is still the same spa_t.
+ */
+void
+ztest_spa_rename(ztest_args_t *za)
+{
+	char *oldname, *newname;
+	size_t newsize;
+	spa_t *spa;
+	int error;
+
+	(void) rw_wrlock(&ztest_shared->zs_name_lock);
+
+	oldname = za->za_pool;
+	newsize = strlen(oldname) + sizeof ("_tmp");	/* counts the NUL */
+	newname = umem_alloc(newsize, UMEM_NOFAIL);
+	(void) snprintf(newname, newsize, "%s_tmp", oldname);
+
+	/*
+	 * Move the pool to its temporary name.
+	 */
+	error = spa_rename(oldname, newname);
+	if (error != 0)
+		fatal(0, "spa_rename('%s', '%s') = %d", oldname,
+		    newname, error);
+
+	/*
+	 * The old name must no longer resolve to any pool.
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error != ENOENT)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	/*
+	 * The new name must open, and must yield the very same spa_t.
+	 */
+	error = spa_open(newname, &spa, FTAG);
+	if (error != 0)
+		fatal(0, "spa_open('%s') = %d", newname, error);
+	ASSERT(spa == dmu_objset_spa(za->za_os));
+	spa_close(spa, FTAG);
+
+	/*
+	 * Put the original name back.
+	 */
+	error = spa_rename(newname, oldname);
+	if (error != 0)
+		fatal(0, "spa_rename('%s', '%s') = %d", newname,
+		    oldname, error);
+
+	/*
+	 * The pool must open under its original name once more.
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error != 0)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+	ASSERT(spa == dmu_objset_spa(za->za_os));
+	spa_close(spa, FTAG);
+
+	umem_free(newname, newsize);
+
+	(void) rw_unlock(&ztest_shared->zs_name_lock);
+}
+
+
+/*
+ * Obliterate one disk: replace its file with a new one of the same size.
+ */
+static void
+ztest_obliterate_one_disk(uint64_t vdev)
+{
+	int fd;
+	char dev_name[MAXPATHLEN];
+	size_t fsize;
+
+	if (zopt_maxfaults < 2)
+		return;
+
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, vdev);
+
+	fd = open(dev_name, O_RDWR);
+	if (fd == -1)
+		fatal(1, "can't open %s", dev_name);
+
+	/*
+	 * Determine the size.
+	 */
+	fsize = lseek(fd, 0, SEEK_END);
+	(void) close(fd);
+
+	/*
+	 * Remove it.
+	 */
+	VERIFY(remove(dev_name) == 0);
+
+	/*
+	 * Create a new one.
+	 */
+	VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
+	VERIFY(ftruncate(fd, fsize) == 0);
+	(void) close(fd);
+}
+
+/* Kick off an in-place replacement of one disk via spa_vdev_attach(). */
+static void
+ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
+{
+	char dev_name[MAXPATHLEN];
+	nvlist_t *file, *root;
+	int error;
+
+	(void) snprintf(dev_name, sizeof (dev_name), ztest_dev_template,
+	    zopt_dir, zopt_pool, vdev);
+
+	/* Build the nvlist describing dev_name. */
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, dev_name) == 0);
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    &file, 1) == 0);
+
+	error = spa_vdev_attach(spa, dev_name, root, B_TRUE);
+	if (error != 0 && error != EBUSY && error != ENOTSUP && error != ENODEV)
+		fatal(0, "spa_vdev_attach(in-place) = %d", error);
+
+	nvlist_free(file);
+	nvlist_free(root);
+}
+
+/* Run zdb against the pool to verify its blocks; any failure is fatal. */
+static void
+ztest_verify_blocks(char *pool)
+{
+	int status;
+	char zdb[MAXPATHLEN + MAXNAMELEN + 20];
+	char zbuf[1024];
+	char *bin;
+	FILE *fp;
+
+	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
+	if (realpath(getexecname(), zdb) == NULL ||
+	    (bin = strstr(zdb, "/usr/bin/")) == NULL)
+		fatal(0, "can't find zdb: ztest is not running from /usr/bin");
+	/* LINTED */
+	(void) sprintf(bin, "/usr/sbin/zdb -bc%s%s -U -O %s %s",
+	    zopt_verbose >= 3 ? "s" : "",
+	    zopt_verbose >= 4 ? "v" : "",
+	    ztest_random(2) == 0 ? "pre" : "post", pool);
+
+	if (zopt_verbose >= 5)
+		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
+
+	if ((fp = popen(zdb, "r")) == NULL)
+		fatal(1, "can't run '%s'", zdb);
+	while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
+		if (zopt_verbose >= 3)
+			(void) printf("%s", zbuf);
+
+	status = pclose(fp);
+	if (status == 0)
+		return;
+
+	ztest_dump_core = 0;
+	if (WIFEXITED(status))
+		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
+	else
+		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
+}
+
+/* Print header, then walk every pool, naming each (when verbosity >= 6). */
+static void
+ztest_walk_pool_directory(char *header)
+{
+	spa_t *spa;
+
+	if (zopt_verbose >= 6)
+		(void) printf("%s\n", header);
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+		if (zopt_verbose >= 6)
+			(void) printf("\t%s\n", spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+}
+
+static void
+ztest_spa_import_export(char *oldname, char *newname)
+{
+	nvlist_t *config;
+	uint64_t pool_guid;
+	spa_t *spa;
+	int error;
+
+	if (zopt_verbose >= 4) {
+		(void) printf("import/export: old = %s, new = %s\n",
+		    oldname, newname);
+	}
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(newname);
+
+	/*
+	 * Get the pool's configuration and guid.
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	ASSERT(spa->spa_config != NULL);
+
+	VERIFY(nvlist_dup(spa->spa_config, &config, 0) == 0);
+	pool_guid = spa_guid(spa);
+	spa_close(spa, FTAG);
+
+	ztest_walk_pool_directory("pools before export");
+
+	/*
+	 * Export it.
+	 */
+	error = spa_export(oldname);
+	if (error)
+		fatal(0, "spa_export('%s') = %d", oldname, error);
+
+	ztest_walk_pool_directory("pools after export");
+
+	/*
+	 * Import it under the new name.
+	 */
+	error = spa_import(newname, config, NULL);
+	if (error)
+		fatal(0, "spa_import('%s') = %d", newname, error);
+
+	ztest_walk_pool_directory("pools after import");
+
+	/*
+	 * Try to import it again -- should fail with EEXIST.
+	 */
+	error = spa_import(newname, config, NULL);
+	if (error != EEXIST)
+		fatal(0, "spa_import('%s') twice", newname);
+
+	/*
+	 * Try to import it under a different name -- should fail with EEXIST.
+	 */
+	error = spa_import(oldname, config, NULL);
+	if (error != EEXIST)
+		fatal(0, "spa_import('%s') under multiple names", oldname);
+
+	/*
+	 * Verify that the pool is no longer visible under the old name.
+	 */
+	error = spa_open(oldname, &spa, FTAG);
+	if (error != ENOENT)
+		fatal(0, "spa_open('%s') = %d", oldname, error);
+
+	/*
+	 * Verify that we can open and close the pool using the new name.
+	 */
+	error = spa_open(newname, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open('%s') = %d", newname, error);
+	ASSERT(pool_guid == spa_guid(spa));
+	spa_close(spa, FTAG);
+
+	nvlist_free(config);
+}
+
+static void *
+ztest_thread(void *arg)
+{
+	ztest_args_t *za = arg;
+	ztest_shared_t *zs = ztest_shared;
+	hrtime_t now, functime;
+	ztest_info_t *zi;
+	int f;
+
+	while ((now = gethrtime()) < za->za_stop) {
+		/*
+		 * Time to force a crash?  Save space stats, then SIGKILL.
+		 */
+		if (now > za->za_kill) {
+			zs->zs_alloc = spa_get_alloc(dmu_objset_spa(za->za_os));
+			zs->zs_space = spa_get_space(dmu_objset_spa(za->za_os));
+			(void) kill(getpid(), SIGKILL);
+		}
+
+		/*
+		 * Pick a random function.
+		 */
+		f = ztest_random(ZTEST_FUNCS);
+		zi = &zs->zs_info[f];
+
+		/*
+		 * Skip it if it's ahead of its call quota for the time elapsed.
+		 */
+		if (zi->zi_call_target == 0 ||
+		    (double)zi->zi_call_total / zi->zi_call_target >
+		    (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
+			continue;
+
+		atomic_add_64(&zi->zi_calls, 1);
+		atomic_add_64(&zi->zi_call_total, 1);
+
+		za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
+		    ZTEST_DIRSIZE;
+		za->za_diroff_shared = (1ULL << 63);
+
+		ztest_dmu_write_parallel(za);
+
+		zi->zi_func(za);
+
+		functime = gethrtime() - now;
+
+		atomic_add_64(&zi->zi_call_time, functime);
+
+		if (zopt_verbose >= 4) {
+			Dl_info dli;
+			(void) dladdr((void *)zi->zi_func, &dli);
+			(void) printf("%6.2f sec in %s\n",
+			    (double)functime / NANOSEC, dli.dli_sname);
+		}
+
+		/*
+		 * If we're getting ENOSPC with some regularity, stop.
+		 */
+		if (zs->zs_enospc_count > 10)
+			break;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Kick off threads to run tests on all datasets in parallel.
+ */
+static void
+ztest_run(char *pool)
+{
+	int t, d, error;
+	ztest_shared_t *zs = ztest_shared;
+	ztest_args_t *za;
+	spa_t *spa;
+	char name[100];
+
+	(void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
+	(void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
+
+	for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
+		(void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
+
+	/*
+	 * Destroy one disk before we even start.
+	 * It's mirrored, so everything should work just fine.
+	 * This makes us exercise fault handling very early in spa_load().
+	 */
+	ztest_obliterate_one_disk(0);
+
+	/*
+	 * Verify that the sum of the sizes of all blocks in the pool
+	 * equals the SPA's allocated space total.
+	 */
+	ztest_verify_blocks(pool);
+
+	/*
+	 * Kick off a replacement of the disk we just obliterated.
+	 */
+	kernel_init(FREAD | FWRITE);
+	error = spa_open(pool, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open(%s) = %d", pool, error);
+	ztest_replace_one_disk(spa, 0);
+	if (zopt_verbose >= 5)
+		show_pool_stats(spa);
+	spa_close(spa, FTAG);
+	kernel_fini();
+
+	kernel_init(FREAD | FWRITE);
+
+	/*
+	 * Verify that we can export the pool and reimport it under a
+	 * different name.
+	 */
+	(void) snprintf(name, sizeof (name), "%s_import", pool);
+	ztest_spa_import_export(pool, name);
+	ztest_spa_import_export(name, pool);
+
+	/*
+	 * Verify that we can loop over all pools.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
+		if (zopt_verbose > 3)
+			(void) printf("spa_next: found %s\n", spa_name(spa));
+	}
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Open our pool.
+	 */
+	error = spa_open(pool, &spa, FTAG);
+	if (error)
+		fatal(0, "spa_open() = %d", error);
+
+	/*
+	 * Verify that we can safely inquire about any object,
+	 * whether it's allocated or not.  To make it interesting,
+	 * we probe a window of +/- 5 around each power of two.
+	 * This hits all edge cases, including zero and the max.
+	 */
+	for (t = 0; t < 64; t++) {
+		for (d = -5; d <= 5; d++) {
+			error = dmu_object_info(spa->spa_meta_objset,
+			    (1ULL << t) + d, NULL);
+			ASSERT(error == 0 || error == ENOENT);
+		}
+	}
+
+	/*
+	 * Now kick off all the tests that run in parallel.
+	 */
+	zs->zs_enospc_count = 0;
+
+	za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+
+	if (zopt_verbose >= 4)
+		(void) printf("starting main threads...\n");
+
+	za[0].za_start = gethrtime();
+	za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
+	za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
+	za[0].za_kill = za[0].za_stop;
+	if (ztest_random(100) < zopt_killrate)
+		za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
+
+	for (t = 0; t < zopt_threads; t++) {
+		d = t % zopt_dirs;
+		if (t < zopt_dirs) {
+			ztest_replay_t zr;
+			(void) rw_rdlock(&ztest_shared->zs_name_lock);
+			(void) snprintf(name, sizeof (name), "%s/%s_%d",
+			    pool, pool, d);
+			error = dmu_objset_create(name, DMU_OST_OTHER, NULL,
+			    ztest_create_cb, NULL);
+			if (error != 0 && error != EEXIST) {
+				if (error == ENOSPC) {
+					zs->zs_enospc_count++;
+					(void) rw_unlock(
+					    &ztest_shared->zs_name_lock);
+					break;
+				}
+				fatal(0, "dmu_objset_create(%s) = %d",
+				    name, error);
+			}
+			error = dmu_objset_open(name, DMU_OST_OTHER,
+			    DS_MODE_STANDARD, &za[d].za_os);
+			if (error)
+				fatal(0, "dmu_objset_open('%s') = %d",
+				    name, error);
+			(void) rw_unlock(&ztest_shared->zs_name_lock);
+			zr.zr_os = za[d].za_os;
+			zil_replay(zr.zr_os, &zr, &zr.zr_assign,
+			    ztest_replay_vector, NULL);
+			za[d].za_zilog = zil_open(za[d].za_os, NULL);
+		}
+		za[t].za_pool = spa_strdup(pool);
+		za[t].za_os = za[d].za_os;
+		za[t].za_zilog = za[d].za_zilog;
+		za[t].za_instance = t;
+		za[t].za_random = ztest_random(-1ULL);
+		za[t].za_start = za[0].za_start;
+		za[t].za_stop = za[0].za_stop;
+		za[t].za_kill = za[0].za_kill;
+
+		error = thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
+		    &za[t].za_thread);
+		if (error)
+			fatal(0, "can't create thread %d: error %d",
+			    t, error);
+	}
+
+	while (--t >= 0) {
+		error = thr_join(za[t].za_thread, NULL, NULL);
+		if (error)
+			fatal(0, "thr_join(%d) = %d", t, error);
+		if (za[t].za_th)
+			traverse_fini(za[t].za_th);
+		if (t < zopt_dirs) {
+			zil_close(za[t].za_zilog);
+			dmu_objset_close(za[t].za_os);
+		}
+		spa_strfree(za[t].za_pool);
+	}
+
+	umem_free(za, zopt_threads * sizeof (ztest_args_t));
+
+	if (zopt_verbose >= 3)
+		show_pool_stats(spa);
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	zs->zs_alloc = spa_get_alloc(spa);
+	zs->zs_space = spa_get_space(spa);
+
+	/*
+	 * Did we have out-of-space errors?  If so, destroy a random objset.
+	 */
+	if (zs->zs_enospc_count != 0) {
+		(void) rw_rdlock(&ztest_shared->zs_name_lock);
+		(void) snprintf(name, sizeof (name), "%s/%s_%d", pool, pool,
+		    (int)ztest_random(zopt_dirs));
+		if (zopt_verbose >= 3)
+			(void) printf("Destroying %s to free up space\n", name);
+		dmu_objset_find(name, ztest_destroy_cb, NULL,
+		    DS_FIND_SNAPSHOTS);
+		(void) rw_unlock(&ztest_shared->zs_name_lock);
+	}
+
+	/*
+	 * Prepare every leaf device to inject a few random read faults.
+	 */
+	ztest_error_setup(spa->spa_root_vdev, VDEV_FAULT_COUNT,
+	    (1U << ZIO_TYPE_READ), 10);
+
+	/*
+	 * Right before closing the pool, kick off a bunch of async I/O;
+	 * spa_close() should wait for it to complete.
+	 */
+	for (t = 1; t < 50; t++)
+		dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+
+	spa_close(spa, FTAG);
+
+	kernel_fini();
+}
+
+/* Format the duration t (in nanoseconds) into timebuf, e.g. "1h02m03s". */
+void
+print_time(hrtime_t t, char *timebuf)
+{
+	hrtime_t secs = t / NANOSEC;
+	hrtime_t mins = secs / 60;
+	hrtime_t hours = mins / 60;
+	hrtime_t days = hours / 24;
+	/* Hold the printed fields as u_longlong_t so they match %llu. */
+	u_longlong_t s = secs - mins * 60;
+	u_longlong_t m = mins - hours * 60;
+	u_longlong_t h = hours - days * 24;
+	u_longlong_t d = days;
+
+	if (d)
+		(void) sprintf(timebuf,
+		    "%llud%02lluh%02llum%02llus", d, h, m, s);
+	else if (h)
+		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
+	else if (m)
+		(void) sprintf(timebuf, "%llum%02llus", m, s);
+	else
+		(void) sprintf(timebuf, "%llus", s);
+}
+
+/*
+ * Create a storage pool with the given name, destroying any existing pool
+ * of that name first, and verify that the new pool can be opened.
+ */
+static void
+ztest_init(char *pool)
+{
+	nvlist_t *nvroot;
+	spa_t *spa;
+	int error;
+
+	kernel_init(FREAD | FWRITE);
+
+	/* Get rid of any pool of the same name before starting. */
+	(void) spa_destroy(pool);
+	ztest_shared->zs_vdev_primaries = 0;
+
+	/*
+	 * Create the storage pool; nvroot is freed whatever the outcome.
+	 */
+	nvroot = make_vdev_root(zopt_vdev_size, zopt_raidz, zopt_mirrors, 1);
+	error = spa_create(pool, nvroot, NULL);
+	nvlist_free(nvroot);
+	if (error != 0)
+		fatal(0, "spa_create() = %d", error);
+
+	if ((error = spa_open(pool, &spa, FTAG)) != 0)
+		fatal(0, "spa_open() = %d", error);
+
+	if (zopt_verbose >= 3)
+		show_pool_stats(spa);
+
+	spa_close(spa, FTAG);
+	kernel_fini();
+}
+
+int
+main(int argc, char **argv)
+{
+	int kills = 0;
+	int iters = 0;
+	int i, f;
+	ztest_shared_t *zs;
+	ztest_info_t *zi;
+	char timebuf[100];
+	char numbuf[6];
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	/* Override location of zpool.cache */
+	spa_config_dir = "/tmp";
+
+	/*
+	 * Blow away any existing copy of zpool.cache
+	 */
+	(void) remove("/tmp/zpool.cache");
+
+	ztest_random_fd = open("/dev/urandom", O_RDONLY);
+
+	process_options(argc, argv);
+
+	argc -= optind;
+	argv += optind;
+
+	dprintf_setup(&argc, argv);
+
+	zs = ztest_shared = (void *)mmap(0,
+	    P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+
+	if (zopt_verbose >= 1) {
+		(void) printf("%llu vdevs, %d datasets, %d threads,"
+		    " %llu seconds...\n",
+		    (u_longlong_t)zopt_vdevs, zopt_dirs, zopt_threads,
+		    (u_longlong_t)zopt_time);
+	}
+
+	/*
+	 * Create and initialize our storage pool, zopt_init times over.
+	 */
+	for (i = 1; i <= zopt_init; i++) {
+		bzero(zs, sizeof (ztest_shared_t));
+		if (zopt_verbose >= 3 && zopt_init != 1)
+			(void) printf("ztest_init(), pass %d\n", i);
+		ztest_init(zopt_pool);
+	}
+
+	/*
+	 * Initialize the call targets for each function (interval 0: no cap).
+	 */
+	for (f = 0; f < ZTEST_FUNCS; f++) {
+		zi = &zs->zs_info[f];
+
+		*zi = ztest_info[f];
+
+		if (*zi->zi_interval == 0)
+			zi->zi_call_target = UINT64_MAX;
+		else
+			zi->zi_call_target = zopt_time / *zi->zi_interval;
+	}
+
+	zs->zs_start_time = gethrtime();
+	zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
+
+	/*
+	 * Run the tests in a loop.  These tests include fault injection
+	 * to verify that self-healing data works, and forced crashes
+	 * to verify that we never lose on-disk consistency.
+	 */
+	while (gethrtime() < zs->zs_stop_time) {
+		int status;
+		pid_t pid;
+		char *tmp;
+
+		/*
+		 * Initialize the workload counters for each function.
+		 */
+		for (f = 0; f < ZTEST_FUNCS; f++) {
+			zi = &zs->zs_info[f];
+			zi->zi_calls = 0;
+			zi->zi_call_time = 0;
+		}
+
+		pid = fork();
+
+		if (pid == -1)
+			fatal(1, "fork failed");
+
+		if (pid == 0) {	/* child */
+			struct rlimit rl = { 1024, 1024 };
+			(void) setrlimit(RLIMIT_NOFILE, &rl);
+			ztest_run(zopt_pool);
+			exit(0);
+		}
+
+		while (waitpid(pid, &status, WEXITED) != pid)
+			continue;
+
+		if (WIFEXITED(status)) {
+			if (WEXITSTATUS(status) != 0) {
+				(void) fprintf(stderr,
+				    "child exited with code %d\n",
+				    WEXITSTATUS(status));
+				exit(2);
+			}
+		} else {
+			if (WTERMSIG(status) != SIGKILL) {
+				(void) fprintf(stderr,
+				    "child died with signal %d\n",
+				    WTERMSIG(status));
+				exit(3);
+			}
+			kills++;
+		}
+
+		iters++;
+
+		if (zopt_verbose >= 1) {
+			hrtime_t now = gethrtime();
+
+			now = MIN(now, zs->zs_stop_time);
+			print_time(zs->zs_stop_time - now, timebuf);
+			nicenum(zs->zs_space, numbuf);
+
+			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
+			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
+			    iters,
+			    WIFEXITED(status) ? "Complete" : "SIGKILL",
+			    (u_longlong_t)zs->zs_enospc_count,
+			    100.0 * zs->zs_alloc / zs->zs_space,
+			    numbuf,
+			    100.0 * (now - zs->zs_start_time) /
+			    (zopt_time * NANOSEC), timebuf);
+		}
+
+		if (zopt_verbose >= 2) {
+			(void) printf("\nWorkload summary:\n\n");
+			(void) printf("%7s %9s   %s\n",
+			    "Calls", "Time", "Function");
+			(void) printf("%7s %9s   %s\n",
+			    "-----", "----", "--------");
+			for (f = 0; f < ZTEST_FUNCS; f++) {
+				Dl_info dli;
+
+				zi = &zs->zs_info[f];
+				print_time(zi->zi_call_time, timebuf);
+				(void) dladdr((void *)zi->zi_func, &dli);
+				(void) printf("%7llu %9s   %s\n",
+				    (u_longlong_t)zi->zi_calls, timebuf,
+				    dli.dli_sname);
+			}
+			(void) printf("\n");
+		}
+
+		/*
+		 * It's possible that we killed a child during a rename test, in
+		 * which case we'll have a '<pool>_tmp' pool lying around instead
+		 * of '<pool>'.  Do a blind rename in case this happened.
+		 */
+		tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
+		(void) strcpy(tmp, zopt_pool);
+		(void) strcat(tmp, "_tmp");
+		kernel_init(FREAD | FWRITE);
+		(void) spa_rename(tmp, zopt_pool);
+		kernel_fini();
+		umem_free(tmp, strlen(tmp) + 1);
+	}
+
+	ztest_verify_blocks(zopt_pool);
+
+	if (zopt_verbose >= 1) {
+		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
+		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
+	}
+
+	return (0);
+}
diff --git a/usr/src/common/acl/acl_common.c b/usr/src/common/acl/acl_common.c
new file mode 100644
index 000000000000..ebe4f060b51f
--- /dev/null
+++ b/usr/src/common/acl/acl_common.c
@@ -0,0 +1,214 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <assert.h>
+#define	ASSERT	assert
+#endif
+
+/* Skeleton trivial ACL: deny/allow pairs for owner@, group@, everyone@. */
+ace_t trivial_acl[] = {
+	{-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
+	{-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
+	    ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
+	{-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_DENIED_ACE_TYPE},
+	{-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, ACE_ACCESS_ALLOWED_ACE_TYPE},
+	{-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
+	    ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
+	{-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
+};
+
+/*
+ * Add each r/w/x right to pair[1] if its S_I?OTH bit is set, else pair[0].
+ */
+void
+adjust_ace_pair(ace_t *pair, mode_t mode)
+{
+	ace_t *when_clear = &pair[0];
+	ace_t *when_set = &pair[1];
+	ace_t *ace;
+
+	ace = (mode & S_IROTH) ? when_set : when_clear;
+	ace->a_access_mask |= ACE_READ_DATA;
+
+	ace = (mode & S_IWOTH) ? when_set : when_clear;
+	ace->a_access_mask |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+
+	ace = (mode & S_IXOTH) ? when_set : when_clear;
+	ace->a_access_mask |= ACE_EXECUTE;
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * Triviality implies that the acl is composed of only
+ * owner, group, everyone entries.  ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * can only be owner@ entry.  Returns 0 if the acl is trivial, else 1.
+ */
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+	ace_t *ace;
+	int idx;
+	int owner_seen = 0;
+	int group_seen = 0;
+	int everyone_seen = 0;
+
+	for (idx = 0; idx != aclcnt; idx++) {
+		ace = &acep[idx];
+
+		/* Only owner@, group@, everyone@ entries, in that order. */
+		switch (ace->a_flags & 0xf040) {
+		case ACE_OWNER:
+			if (group_seen || everyone_seen)
+				return (1);
+			owner_seen++;
+			break;
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+			if (everyone_seen || owner_seen == 0)
+				return (1);
+			group_seen++;
+			break;
+		case ACE_EVERYONE:
+			if (owner_seen == 0 || group_seen == 0)
+				return (1);
+			everyone_seen++;
+			break;
+		default:
+			return (1);
+		}
+
+		/*
+		 * Inheritance flags of any kind make the acl non-trivial.
+		 */
+		if (ace->a_flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE))
+			return (1);
+
+		/*
+		 * Don't allow anybody to deny reading an ACL
+		 */
+		if (ace->a_type == ACE_ACCESS_DENIED_ACE_TYPE &&
+		    (ace->a_access_mask & ACE_READ_ACL))
+			return (1);
+
+		/*
+		 * Allow only owner@ to allow
+		 * write_acl/write_owner/write_attributes
+		 */
+		if (ace->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    !(ace->a_flags & ACE_OWNER) && (ace->a_access_mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES)))
+			return (1);
+	}
+
+	return (owner_seen == 0 || group_seen == 0 || everyone_seen == 0);
+}
+
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
+ * v = Ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be multiples of a word size)
+ * f = ptr to function to compare two objs
+ *	returns (-1 = less than, 0 = equal, 1 = greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)())
+{
+	int gap, i, j, word;
+	unsigned int *wlo, *whi, swap;
+
+	/* No work to do */
+	if (v == NULL || n <= 1)
+		return;
+
+	/* Sanity check on arguments */
+	ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+	ASSERT(s > 0);
+	for (gap = n / 2; gap > 0; gap /= 2) {
+		for (i = gap; i < n; i++) {
+			for (j = i - gap; j >= 0; j -= gap) {
+				wlo = (void *)(v + j * s);
+				whi = (void *)(v + (j + gap) * s);
+				if ((*f)((caddr_t)wlo, (caddr_t)whi) != 1)
+					break;
+				/* Out of order: swap a word at a time */
+				for (word = 0; word < s / 4; word++) {
+					swap = *wlo;
+					*wlo++ = *whi;
+					*whi++ = swap;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Compare two acls, all fields.  Entries are ordered by a_type first,
+ * then by a_id, and finally by a_perm.
+ *
+ * Returns:
+ * -1 (less than)
+ *  0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+	aclent_t *lhs = a;
+	aclent_t *rhs = b;
+
+	/* Compare types */
+	if (lhs->a_type != rhs->a_type)
+		return (lhs->a_type < rhs->a_type ? -1 : 1);
+
+	/* Equal types; compare id's */
+	if (lhs->a_id != rhs->a_id)
+		return (lhs->a_id < rhs->a_id ? -1 : 1);
+
+	/* Equal ids; compare perms */
+	if (lhs->a_perm != rhs->a_perm)
+		return (lhs->a_perm < rhs->a_perm ? -1 : 1);
+
+	/* Totally equal */
+	return (0);
+}
diff --git a/usr/src/common/acl/acl_common.h b/usr/src/common/acl/acl_common.h
new file mode 100644
index 000000000000..2227ad77ea93
--- /dev/null
+++ b/usr/src/common/acl/acl_common.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ACL_ACL_UTILS_H
+#define	_ACL_ACL_UTILS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+extern ace_t trivial_acl[6];	/* defined in acl_common.c */
+
+/* NOTE(review): acltrivial() is not defined in acl_common.c; confirm where. */
+extern int acltrivial(const char *);
+extern void adjust_ace_pair(ace_t *pair, mode_t mode);
+extern int ace_trivial(ace_t *acep, int aclcnt);
+
+/* Generic sort and the aclent_t comparison routine, from acl_common.c. */
+void ksort(caddr_t v, int n, int s, int (*f)());
+int cmp2acls(void *a, void *b);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* _ACL_ACL_UTILS_H */
diff --git a/usr/src/common/avl/avl.c b/usr/src/common/avl/avl.c
index 267fe2ede917..579e7408a9af 100644
--- a/usr/src/common/avl/avl.c
+++ b/usr/src/common/avl/avl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -54,7 +54,7 @@
  *	- Since the AVL data is always embedded in other structures, there is
  *	  no locking or memory allocation in the AVL routines. This must be
  *	  provided for by the enclosing data structure's semantics. Typically,
- *	  avl_insert()/_remove()/avl_insert_here() require some kind of
+ *	  avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
  *	  exclusive write lock. Other operations require a read lock.
  *
  *      - The implementation uses iteration instead of explicit recursion,
@@ -95,6 +95,7 @@
 #include <sys/param.h>
 #include <sys/debug.h>
 #include <sys/avl.h>
+#include <sys/cmn_err.h>
 
 /*
  * Small arrays to translate between balance (or diff) values and child indeces.
@@ -599,6 +600,29 @@ avl_insert_here(
 	avl_insert(tree, new_data, AVL_MKINDEX(node, child));
 }
 
+/*
+ * Add a new node to an AVL tree; avl_find() must not find it there already.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+	avl_index_t where;
+
+	/*
+	 * This is unfortunate.  We want to call panic() here, even for
+	 * non-DEBUG kernels.  In userland, however, we can't depend on anything
+	 * in libc or else the rtld build process gets confused.  So, all we can
+	 * do in userland is resort to a normal ASSERT().
+	 */
+	if (avl_find(tree, new_node, &where) != NULL)
+#ifdef _KERNEL
+		panic("avl_find() succeeded inside avl_add()");
+#else
+		ASSERT(0);
+#endif
+	avl_insert(tree, new_node, where);
+}
+
 /*
  * Delete a node from the AVL tree.  Deletion is similar to insertion, but
  * with 2 complications.
diff --git a/usr/src/common/zfs/zfs_namecheck.c b/usr/src/common/zfs/zfs_namecheck.c
new file mode 100644
index 000000000000..8ccf684b519e
--- /dev/null
+++ b/usr/src/common/zfs/zfs_namecheck.c
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Common name validation routines for ZFS.  These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid.  In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ *
+ * Each function returns 0 on success, -1 on error.
+ */
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "zfs_namecheck.h"
+
+static int
+valid_char(char c)
+{
+	return ((c >= 'a' && c <= 'z') ||
+	    (c >= 'A' && c <= 'Z') ||
+	    (c >= '0' && c <= '9') ||
+	    c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Dataset names must be of the following form:
+ *
+ * 	[component][/]*[component][@component]
+ *
+ * Where each component is made up of alphanumeric characters plus [-_.:].
+ * The snapshot ('@') component, if present, must be the last one.  Returns
+ * 0 if 'path' is valid, else -1 with the reason in '*why' (if non-NULL)
+ * and, for NAME_ERR_INVALCHAR, the bad character in '*what' (if non-NULL).
+ */
+int
+dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+	const char *loc, *end;
+	int found_snapshot;
+
+	/* Explicitly check for a leading slash.  */
+	if (path[0] == '/') {
+		if (why)
+			*why = NAME_ERR_LEADING_SLASH;
+		return (-1);
+	}
+
+	if (path[0] == '\0') {
+		if (why)
+			*why = NAME_ERR_EMPTY_COMPONENT;
+		return (-1);
+	}
+
+	loc = path;
+	found_snapshot = 0;
+	for (;;) {
+		/* Find the end of this component */
+		end = loc;
+		while (*end != '/' && *end != '@' && *end != '\0')
+			end++;
+
+		if (*end == '\0' && end[-1] == '/') {
+			/* trailing slashes are not allowed */
+			if (why)
+				*why = NAME_ERR_TRAILING_SLASH;
+			return (-1);
+		}
+
+		/* Zero-length components are not allowed */
+		if (loc == end) {
+			if (why)
+				*why = NAME_ERR_EMPTY_COMPONENT;
+			return (-1);
+		}
+
+		/* Validate the contents of this component */
+		while (loc != end) {
+			if (!valid_char(*loc)) {
+				if (why)
+					*why = NAME_ERR_INVALCHAR;
+				if (what)
+					*what = *loc;
+				return (-1);
+			}
+			loc++;
+		}
+
+		/* If we've reached the end of the string, we're OK */
+		if (*end == '\0')
+			return (0);
+
+		/*
+		 * Only '/' or '@' can end a component here.  The snapshot
+		 * component must be the last one, so once an '@' has been
+		 * seen, a second '@' or a following '/' is an error.
+		 */
+		if (found_snapshot) {
+			if (why)
+				*why = (*end == '@') ? NAME_ERR_MULTIPLE_AT :
+				    NAME_ERR_TRAILING_SLASH;
+			return (-1);
+		}
+
+		if (*end == '@')
+			found_snapshot = 1;
+
+		/* Update to the next component */
+		loc = end + 1;
+	}
+}
+
+/*
+ * Pool names use the same valid characters as dataset components and must
+ * begin with a letter.  'mirror' and 'raidz' are reserved, as are disk-like
+ * names (a 'c' followed by a digit).  Returns 0 if valid, else -1 with the
+ * reason in '*why' and any invalid character in '*what' (each if non-NULL).
+ */
+int
+pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
+{
+	const char *c;
+
+	c = pool;
+	while (*c != '\0') {
+		if (!valid_char(*c)) {
+			if (why)
+				*why = NAME_ERR_INVALCHAR;
+			if (what)
+				*what = *c;
+			return (-1);
+		}
+		c++;
+	}
+
+	if (!(*pool >= 'a' && *pool <= 'z') &&
+	    !(*pool >= 'A' && *pool <= 'Z')) {
+		if (why)
+			*why = NAME_ERR_NOLETTER;
+		return (-1);
+	}
+
+	if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
+		if (why)
+			*why = NAME_ERR_RESERVED;
+		return (-1);
+	}
+
+	if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
+		if (why)
+			*why = NAME_ERR_DISKLIKE;
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Check if the dataset name is private for internal usage.
+ * '$' is reserved for internal dataset names. e.g. "$MOS"
+ *
+ * Return 1 if the given name is used internally.
+ * Return 0 if it is not.
+ */
+int
+dataset_name_hidden(const char *name)
+{
+	if (strchr(name, '$') != NULL)
+		return (1);
+
+	return (0);
+}
diff --git a/usr/src/common/zfs/zfs_namecheck.h b/usr/src/common/zfs/zfs_namecheck.h
new file mode 100644
index 000000000000..f590cee6035d
--- /dev/null
+++ b/usr/src/common/zfs/zfs_namecheck.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZFS_NAMECHECK_H
+#define	_ZFS_NAMECHECK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	NAME_ERR_LEADING_SLASH,		/* name begins with leading slash */
+	NAME_ERR_EMPTY_COMPONENT,	/* name contains an empty component */
+	NAME_ERR_TRAILING_SLASH,	/* name ends with a slash */
+	NAME_ERR_INVALCHAR,		/* invalid character found */
+	NAME_ERR_MULTIPLE_AT,		/* multiple '@' characters found */
+	NAME_ERR_NOLETTER,		/* pool doesn't begin with a letter */
+	NAME_ERR_RESERVED,		/* entire name is reserved */
+	NAME_ERR_DISKLIKE,		/* reserved disk name (c[0-9].*) */
+} namecheck_err_t;
+
+int pool_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_name_hidden(const char *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_NAMECHECK_H */
diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c
new file mode 100644
index 000000000000..dbd783a975ea
--- /dev/null
+++ b/usr/src/common/zfs/zfs_prop.c
@@ -0,0 +1,334 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Master property table.
+ *
+ * This table keeps track of all the properties supported by ZFS, and their
+ * various attributes.  Not all of these are needed by the kernel, and several
+ * are only used by a single libzfs client.  But having them here centralizes
+ * all property information in one location.
+ *
+ * 	name		The human-readable string representing this property
+ * 	proptype	Basic type (string, boolean, number)
+ * 	default		Default value for the property.  Sadly, C only allows
+ * 			you to initialize the first member of a union, so we
+ * 			have two default members for each property.
+ * 	attr		Attributes (readonly, inheritable) for the property
+ * 	types		Valid dataset types to which this applies
+ * 	values		String describing acceptable values for the property
+ * 	colname		The column header for 'zfs list'
+ *	colfmt		The column formatting for 'zfs list'
+ *
+ * This table must match the order of property types in libzfs.h.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+
+#include "zfs_prop.h"
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+typedef enum {
+	prop_default,
+	prop_readonly,
+	prop_inherit
+} prop_attr_t;
+
+typedef struct {
+	const char	*pd_name;
+	zfs_proptype_t	pd_proptype;
+	uint64_t	pd_numdefault;
+	const char	*pd_strdefault;
+	prop_attr_t	pd_attr;
+	int		pd_types;
+	const char	*pd_values;
+	const char	*pd_colname;
+	const char	*pd_colfmt;
+} prop_desc_t;
+
+static prop_desc_t zfs_prop_table[ZFS_NPROP_ALL] = {
+	{ "type",	prop_type_string,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY, "filesystem | volume | snapshot", "TYPE", "%10s" },
+	{ "creation",	prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY, "<date>", "CREATION", "%-20s" },
+	{ "used",	prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY, "<size>",	"USED", "%5s" },
+	{ "available",	prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL", "%5s" },
+	{ "referenced",	prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_SNAPSHOT | ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<size>", "REFER", "%5s" },
+	{ "compressratio", prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY, "<1.00x or higher if compressed>", "RATIO", "%5s" },
+	{ "mounted",	prop_type_boolean,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_FILESYSTEM, "yes | no | -", "MOUNTED", "%7s" },
+	{ "origin",	prop_type_string,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_FILESYSTEM, "<snapshot>", "ORIGIN", "%-20s" },
+	{ "quota",	prop_type_number,	0,	NULL,	prop_default,
+	    ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA", "%5s" },
+	{ "reservation", prop_type_number,	0,	NULL,	prop_default,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<size> | none", "RESERV", "%6s" },
+	{ "volsize",	prop_type_number,	0,	NULL,	prop_default,
+	    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE", "%7s" },
+	{ "volblocksize", prop_type_number,	8192,	NULL,	prop_default,
+	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK", "%8s" },
+	{ "recordsize",	prop_type_number,	SPA_MAXBLOCKSIZE,	NULL,
+	    prop_inherit,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+	    "512 to 128k, power of 2", "RECSIZE", "%7s" },
+	{ "mountpoint",	prop_type_string,	0,	"/",	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "<path> | legacy | none", "MOUNTPOINT", "%-20s" },
+	{ "sharenfs",	prop_type_string,	0,	"off",	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "on | off | share(1M) options", "SHARENFS", "%-15s" },
+	{ "checksum",	prop_type_index,	ZIO_CHECKSUM_DEFAULT,	NULL,
+	    prop_inherit,	ZFS_TYPE_ANY,
+	    "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", "%10s" },
+	{ "compression", prop_type_index,	ZIO_COMPRESS_DEFAULT,	NULL,
+	    prop_inherit,	ZFS_TYPE_ANY,
+	    "on | off | lzjb", "COMPRESS", "%8s" },
+	{ "atime",	prop_type_boolean,	1,	NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "on | off", "ATIME", "%5s" },
+	{ "devices",	prop_type_boolean,	1,	NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "on | off", "DEVICES", "%7s" },
+	{ "exec",	prop_type_boolean,	1,	NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "on | off", "EXEC", "%4s" },
+	{ "setuid",	prop_type_boolean,	1,	NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM, "on | off", "SETUID", "%6s" },
+	{ "readonly",	prop_type_boolean,	0,	NULL,	prop_inherit,
+	    ZFS_TYPE_ANY, "on | off", "RDONLY", "%6s" },
+	{ "zoned",	prop_type_boolean,	0,	NULL,	prop_inherit,
+	    ZFS_TYPE_ANY,
+	    "on | off", "ZONED", "%5s" },
+	{ "snapdir",	prop_type_index,	VISIBLE, NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "hidden | visible", "SNAPDIR", "%7s" },
+	{ "aclmode", prop_type_index,	GROUPMASK,	 NULL,	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "discard | groupmask | passthrough", "ACLMODE", "%11s" },
+	{ "aclinherit", prop_type_index,	SECURE,	NULL, 	prop_inherit,
+	    ZFS_TYPE_FILESYSTEM,
+	    "discard | noallow | secure | passthrough", "ACLINHERIT", "%11s" },
+	{ "createtxg",	prop_type_number,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY, NULL, NULL, NULL},
+	{ "name",	prop_type_string,	0,	NULL,	prop_readonly,
+	    ZFS_TYPE_ANY,
+	    NULL, "NAME", "%-20s" },
+};
+
+zfs_proptype_t
+zfs_prop_get_type(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_proptype);
+}
+
+/*
+ * Given a property name, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop(const char *propname)
+{
+	int i;
+
+	for (i = 0; i < ZFS_NPROP_ALL; i++) {
+		if (strcmp(zfs_prop_table[i].pd_name, propname) == 0)
+			return (i);
+#ifndef _KERNEL
+		if (zfs_prop_table[i].pd_colname != NULL &&
+		    strcasecmp(zfs_prop_table[i].pd_colname, propname) == 0)
+			return (i);
+#endif
+	}
+
+	return (ZFS_PROP_INVAL);
+}
+
+/*
+ * Return the default string value for the given property in 'buf'.
+ */
+void
+zfs_prop_default_string(zfs_prop_t prop, char *buf, size_t buflen)
+{
+	const char *def = zfs_prop_table[prop].pd_strdefault;
+
+	/*
+	 * Index types (compression and checksum) are numeric in the kernel,
+	 * which uses zfs_prop_default_numeric(), but strings in userland,
+	 * where zfs_prop_is_string() is TRUE for them; report "on" here.
+	 * A NULL table default yields "".  strlcpy() always terminates buf.
+	 */
+	if (zfs_prop_table[prop].pd_proptype == prop_type_index)
+		def = "on";
+	(void) strlcpy(buf, def != NULL ? def : "", buflen);
+}
+
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Returns TRUE if the property is readonly.
+ */
+int
+zfs_prop_readonly(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_attr == prop_readonly);
+}
+
+#ifndef _KERNEL
+/*
+ * Given a property ID, returns the corresponding name.
+ */
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+int
+zfs_prop_inheritable(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_attr == prop_inherit);
+}
+
+/*
+ * Returns TRUE if the property applies to the given dataset types.
+ */
+int
+zfs_prop_valid_for_type(zfs_prop_t prop, int types)
+{
+	return ((zfs_prop_table[prop].pd_types & types) != 0);
+}
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * property, or NULL if it cannot be set.
+ */
+const char *
+zfs_prop_values(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if this property is a string type.  Note that index types
+ * (compression, checksum) are treated as strings in userland, even though they
+ * are stored numerically on disk.
+ */
+int
+zfs_prop_is_string(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_proptype == prop_type_string ||
+	    zfs_prop_table[prop].pd_proptype == prop_type_index);
+}
+
+/*
+ * Returns the column header for the given property.  Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_name(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_colname);
+}
+
+/*
+ * Returns the column formatting for the given property.  Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_format(zfs_prop_t prop)
+{
+	return (zfs_prop_table[prop].pd_colfmt);
+}
+
+/*
+ * Returns a malloc()ed, NULL-terminated array of property names suitable for
+ * passing to getsubopt(), or NULL if the allocation fails.
+ */
+char **
+zfs_prop_column_subopts(void)
+{
+	char **ret = malloc((ZFS_NPROP_ALL + 1) * sizeof (char *));
+	int i;
+
+	if (ret == NULL)
+		return (NULL);
+	for (i = 0; i < ZFS_NPROP_ALL; i++)
+		ret[i] = (char *)zfs_prop_table[i].pd_name;
+	ret[i] = NULL;
+	return (ret);
+}
+
+/*
+ * Same as above, but using the short (abbreviated) column names as indices.
+ */
+char **
+zfs_prop_column_short_subopts(void)
+{
+	char **ret = malloc((ZFS_NPROP_ALL + 1) * sizeof (char *) * 2);
+	char *cur;
+	int i;
+
+	for (i = 0; i < ZFS_NPROP_ALL; i++) {
+		if (zfs_prop_table[i].pd_colname == NULL) {
+			ret[i] = "";
+		} else {
+			ret[i] = strdup(zfs_prop_table[i].pd_colname);
+			for (cur = ret[i]; *cur != '\0'; cur++)
+				*cur = tolower(*cur);
+		}
+	}
+
+
+	ret[i] = NULL;
+
+	return (ret);
+}
+
+#endif
diff --git a/usr/src/common/zfs/zfs_prop.h b/usr/src/common/zfs/zfs_prop.h
new file mode 100644
index 000000000000..f7d78c64f5e8
--- /dev/null
+++ b/usr/src/common/zfs/zfs_prop.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZFS_PROP_H
+#define	_ZFS_PROP_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	prop_type_number,	/* numeric value */
+	prop_type_string,	/* string value */
+	prop_type_boolean,	/* boolean value */
+	prop_type_index		/* numeric value indexed by string */
+} zfs_proptype_t;
+
+zfs_proptype_t zfs_prop_get_type(zfs_prop_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_PROP_H */
diff --git a/usr/src/head/libzonecfg.h b/usr/src/head/libzonecfg.h
index a82931928add..d45408d2dd83 100644
--- a/usr/src/head/libzonecfg.h
+++ b/usr/src/head/libzonecfg.h
@@ -158,6 +158,10 @@ struct zone_attrtab {
 	char	zone_attr_value[2 * BUFSIZ];
 };
 
+struct zone_dstab {
+	char	zone_dataset_name[MAXNAMELEN];
+};
+
 /*
  * Basic configuration management routines.
  */
@@ -261,6 +265,15 @@ extern	int	zonecfg_get_attr_string(const struct zone_attrtab *, char *,
     size_t);
 extern	int	zonecfg_get_attr_uint(const struct zone_attrtab *, uint64_t *);
 
+/*
+ * ZFS configuration.
+ */
+extern	int	zonecfg_add_ds(zone_dochandle_t, struct zone_dstab *);
+extern	int	zonecfg_delete_ds(zone_dochandle_t, struct zone_dstab *);
+extern	int	zonecfg_modify_ds(zone_dochandle_t, struct zone_dstab *,
+    struct zone_dstab *);
+extern	int	zonecfg_lookup_ds(zone_dochandle_t, struct zone_dstab *);
+
 /*
  * '*ent' iterator routines.
  */
@@ -282,6 +295,9 @@ extern	int	zonecfg_endattrent(zone_dochandle_t);
 extern	int	zonecfg_setrctlent(zone_dochandle_t);
 extern	int	zonecfg_getrctlent(zone_dochandle_t, struct zone_rctltab *);
 extern	int	zonecfg_endrctlent(zone_dochandle_t);
+extern	int	zonecfg_setdsent(zone_dochandle_t);
+extern	int	zonecfg_getdsent(zone_dochandle_t, struct zone_dstab *);
+extern	int	zonecfg_enddsent(zone_dochandle_t);
 
 /*
  * Privilege-related functions.
diff --git a/usr/src/head/zone.h b/usr/src/head/zone.h
index 90c1a9c9406c..6f2e81daa507 100644
--- a/usr/src/head/zone.h
+++ b/usr/src/head/zone.h
@@ -57,7 +57,7 @@ extern int zone_get_id(const char *, zoneid_t *);
 
 /* System call API */
 extern zoneid_t	zone_create(const char *, const char *,
-    const struct priv_set *, const char *, size_t, int *);
+    const struct priv_set *, const char *, size_t, const char *, size_t, int *);
 extern int	zone_boot(zoneid_t, const char *);
 extern int	zone_destroy(zoneid_t);
 extern ssize_t	zone_getattr(zoneid_t, int, void *, size_t);
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index 8e93cef77c72..4ae89dfeaae2 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -190,6 +190,9 @@ SUBDIRS= \
 	../cmd/sendmail/libmilter	\
 	sasl_plugins	\
 	udapl		\
+	libzpool	\
+	libzfs		\
+	libzfs_jni	\
 	$($(MACH)_SUBDIRS)
 
 sparc_SUBDIRS= .WAIT	\
@@ -314,6 +317,7 @@ HDRSUBDIRS= libaio	\
 	librpcsvc	\
 	librsm		\
 	libsasl		\
+	libsec		\
 	libslp		\
 	libsmedia	\
 	libsysevent	\
@@ -326,6 +330,7 @@ HDRSUBDIRS= libaio	\
 	libuutil	\
 	libwrap		\
 	libxcurses2	\
+	libzfs		\
 	libzoneinfo	\
 	lvm		\
 	openssl		\
@@ -391,7 +396,7 @@ libc:		libc_i18n
 libcmdutils:	libavl
 libcontract:	libnvpair
 libdevid:	libdevinfo
-libdevinfo:	libnvpair
+libdevinfo:	libnvpair libsec
 libdhcpagent:	libdhcputil libnsl libsocket
 libdhcpsvc:	libinetutil
 libdhcputil:	libinetutil
@@ -427,6 +432,9 @@ librestart:	libuutil libscf
 pkcs11:		libcryptoutil
 print:		libldap5
 udapl/udapl_tavor:	udapl/libdat
+libzfs:		libdevinfo libdevid libgen libnvpair libuutil
+libzfs_jni:	libdiskmgt libnvpair libzfs
+libzpool:	libavl libumem libnvpair
 
 #
 # The reason this rule checks for the existence of the
diff --git a/usr/src/lib/libavl/spec/avl.spec b/usr/src/lib/libavl/spec/avl.spec
index b981b60711ca..67a4782034ff 100644
--- a/usr/src/lib/libavl/spec/avl.spec
+++ b/usr/src/lib/libavl/spec/avl.spec
@@ -92,6 +92,12 @@ declaration	ulong_t avl_numnodes(avl_tree_t *tree)
 version		SUNWprivate_1.1
 end
 
+function	avl_add
+include		<sys/avl.h>
+declaration	void avl_add(avl_tree_t *tree, void *new_node)
+version		SUNWprivate_1.1
+end
+
 function	avl_remove
 include		<sys/avl.h>
 declaration	void avl_remove(avl_tree_t *tree, void *data)
diff --git a/usr/src/lib/libc/port/sys/zone.c b/usr/src/lib/libc/port/sys/zone.c
index 416313941fe0..c3ef0fab84e3 100644
--- a/usr/src/lib/libc/port/sys/zone.c
+++ b/usr/src/lib/libc/port/sys/zone.c
@@ -40,9 +40,10 @@
 #include <stdlib.h>
 #include <errno.h>
 
-zoneid_t
-zone_create(const char *name, const char *root, const priv_set_t *privs,
-    const char *rctls, size_t rctlsz, int *extended_error)
+extern zoneid_t
+zone_create(const char *name, const char *root, const struct priv_set *privs,
+    const char *rctls, size_t rctlsz, const char *zfs, size_t zfssz,
+    int *extended_error)
 {
 	zone_def  zd;
 
@@ -51,10 +52,11 @@ zone_create(const char *name, const char *root, const priv_set_t *privs,
 	zd.zone_privs = privs;
 	zd.rctlbuf = rctls;
 	zd.rctlbufsz = rctlsz;
+	zd.zfsbuf = zfs;
+	zd.zfsbufsz = zfssz;
 	zd.extended_error = extended_error;
 
-	return ((zoneid_t)syscall(SYS_zone,
-	    ZONE_CREATE, &zd));
+	return ((zoneid_t)syscall(SYS_zone, ZONE_CREATE, &zd));
 }
 
 int
diff --git a/usr/src/lib/libdevinfo/Makefile.com b/usr/src/lib/libdevinfo/Makefile.com
index 96ebbf9a7c8f..cc05e1bad694 100644
--- a/usr/src/lib/libdevinfo/Makefile.com
+++ b/usr/src/lib/libdevinfo/Makefile.com
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -36,7 +36,7 @@ include ../../Makefile.lib
 include ../../Makefile.rootfs
 
 LIBS =		$(DYNLIB) $(LINTLIB)
-LDLIBS +=	-lnvpair -lc
+LDLIBS +=	-lnvpair -lsec -lc
 $(LINTLIB) :=	SRCS = $(SRCDIR)/$(LINTSRC)
 
 SRCDIR =	..
diff --git a/usr/src/lib/libdevinfo/devinfo_devperm.c b/usr/src/lib/libdevinfo/devinfo_devperm.c
index df1f5dff3cb3..ef5ca55c48bd 100644
--- a/usr/src/lib/libdevinfo/devinfo_devperm.c
+++ b/usr/src/lib/libdevinfo/devinfo_devperm.c
@@ -84,7 +84,6 @@ setdevaccess(char *dev, uid_t uid, gid_t gid, mode_t mode,
     void (*errmsg)(char *))
 {
 	int err = 0, local_errno;
-	aclent_t acls[4];
 	char errstring[MAX_LINELEN];
 
 	if (chown(dev, uid, gid) == -1) {
@@ -106,24 +105,12 @@ setdevaccess(char *dev, uid_t uid, gid_t gid, mode_t mode,
 		(*errmsg)(errstring);
 	}
 
-	acls[0].a_type = USER_OBJ;
-	acls[0].a_id = uid;
-	acls[0].a_perm = ((mode & 0700) >> 6);
-
-	acls[1].a_type = GROUP_OBJ;
-	acls[1].a_id = gid;
-	acls[1].a_perm = ((mode & 0070) >> 3);
-
-	acls[2].a_type = CLASS_OBJ;
-	acls[2].a_id = (uid_t)-1;
-	acls[2].a_perm = ((mode & 0070) >> 3);
-
-	acls[3].a_type = OTHER_OBJ;
-	acls[3].a_id = (uid_t)-1;
-	acls[3].a_perm = (mode & 0007);
+	/*
+	 * acl_strip() sets an ACL and changes the file's owner/group
+	 */
+	err = acl_strip(dev, uid, gid, mode);
 
-	/* Remove ACLs */
-	if (acl(dev, SETACL, 4, acls) < 0) {
+	if (err != 0) {
 		/*
 		 * If the file system returned ENOSYS, we know that it
 		 * doesn't support ACLs, therefore, we must assume that
@@ -139,15 +126,14 @@ setdevaccess(char *dev, uid_t uid, gid_t gid, mode_t mode,
 				(*errmsg)(errstring);
 			}
 		}
-	}
-
-	if (chmod(dev, mode) == -1) {
-		err = -1;
-		if (errmsg) {
-			(void) snprintf(errstring, MAX_LINELEN,
-			    "failed to chmod device %s: %s\n",
-			    dev, strerror(errno));
-			(*errmsg)(errstring);
+		if (chmod(dev, mode) == -1) {
+			err = -1;
+			if (errmsg) {
+				(void) snprintf(errstring, MAX_LINELEN,
+				    "failed to chmod device %s: %s\n",
+				    dev, strerror(errno));
+				(*errmsg)(errstring);
+			}
 		}
 	}
 
diff --git a/usr/src/lib/libproc/common/Pcontrol.h b/usr/src/lib/libproc/common/Pcontrol.h
index 5d3797d16ff3..44d9e439045e 100644
--- a/usr/src/lib/libproc/common/Pcontrol.h
+++ b/usr/src/lib/libproc/common/Pcontrol.h
@@ -68,7 +68,7 @@ typedef struct {		/* symbol table */
 } sym_tbl_t;
 
 typedef struct file_info {	/* symbol information for a mapped file */
-	list_t	file_list;	/* linked list */
+	plist_t	file_list;	/* linked list */
 	char	file_pname[PRMAPSZ];	/* name from prmap_t */
 	struct map_info *file_map;	/* primary (text) mapping */
 	int	file_ref;	/* references from map_info_t structures */
@@ -102,7 +102,7 @@ typedef struct map_info {	/* description of an address space mapping */
 } map_info_t;
 
 typedef struct lwp_info {	/* per-lwp information from core file */
-	list_t	lwp_list;	/* linked list */
+	plist_t	lwp_list;	/* linked list */
 	lwpid_t	lwp_id;		/* lwp identifier */
 	lwpsinfo_t lwp_psinfo;	/* /proc/<pid>/lwp/<lwpid>/lwpsinfo data */
 	lwpstatus_t lwp_status;	/* /proc/<pid>/lwp/<lwpid>/lwpstatus data */
@@ -116,7 +116,7 @@ typedef struct lwp_info {	/* per-lwp information from core file */
 typedef struct core_info {	/* information specific to core files */
 	char core_dmodel;	/* data model for core file */
 	int core_errno;		/* error during initialization if != 0 */
-	list_t core_lwp_head;	/* head of list of lwp info */
+	plist_t core_lwp_head;	/* head of list of lwp info */
 	lwp_info_t *core_lwp;	/* current lwp information */
 	uint_t core_nlwp;	/* number of lwp's in list */
 	off64_t core_size;	/* size of core file in bytes */
@@ -171,7 +171,7 @@ struct ps_prochandle {
 	size_t	map_count;	/* number of mappings */
 	size_t	map_alloc;	/* number of mappings allocated */
 	uint_t	num_files;	/* number of file elements in file_info */
-	list_t	file_head;	/* head of mapped files w/ symbol table info */
+	plist_t	file_head;	/* head of mapped files w/ symbol table info */
 	char	*execname;	/* name of the executable file */
 	auxv_t	*auxv;		/* the process's aux vector */
 	int	nauxv;		/* number of aux vector entries */
@@ -228,6 +228,7 @@ extern	int	Padd_mapping(struct ps_prochandle *, off64_t, file_info_t *,
     prmap_t *);
 extern	void	Psort_mappings(struct ps_prochandle *);
 
+
 /*
  * Architecture-dependent definition of the breakpoint instruction.
  */
diff --git a/usr/src/lib/libproc/common/Putil.c b/usr/src/lib/libproc/common/Putil.c
index 7e06c14f67d6..791ec668cbd1 100644
--- a/usr/src/lib/libproc/common/Putil.c
+++ b/usr/src/lib/libproc/common/Putil.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1998-2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -41,8 +41,8 @@
 void
 list_link(void *new, void *existing)
 {
-	list_t *p = new;
-	list_t *q = existing;
+	plist_t *p = new;
+	plist_t *q = existing;
 
 	if (q) {
 		p->list_forw = q;
@@ -60,7 +60,7 @@ list_link(void *new, void *existing)
 void
 list_unlink(void *old)
 {
-	list_t *p = old;
+	plist_t *p = old;
 
 	if (p->list_forw != p) {
 		p->list_back->list_forw = p->list_forw;
diff --git a/usr/src/lib/libproc/common/Putil.h b/usr/src/lib/libproc/common/Putil.h
index 328440fc8127..55ea45dba2aa 100644
--- a/usr/src/lib/libproc/common/Putil.h
+++ b/usr/src/lib/libproc/common/Putil.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,7 +39,7 @@ extern "C" {
 typedef struct P_list {
 	struct P_list	*list_forw;
 	struct P_list	*list_back;
-} list_t;
+} plist_t;
 
 /*
  * Routines to manipulate linked lists:
@@ -47,8 +47,8 @@ typedef struct P_list {
 extern void list_link(void *, void *);
 extern void list_unlink(void *);
 
-#define	list_next(elem)	(void *)(((list_t *)(elem))->list_forw)
-#define	list_prev(elem)	(void *)(((list_t *)(elem))->list_back)
+#define	list_next(elem)	(void *)(((plist_t *)(elem))->list_forw)
+#define	list_prev(elem)	(void *)(((plist_t *)(elem))->list_back)
 
 /*
  * Routines to manipulate sigset_t, fltset_t, or sysset_t.
diff --git a/usr/src/lib/libsec/Makefile b/usr/src/lib/libsec/Makefile
index a698fe02514b..2a61f4775660 100644
--- a/usr/src/lib/libsec/Makefile
+++ b/usr/src/lib/libsec/Makefile
@@ -20,70 +20,49 @@
 # CDDL HEADER END
 #
 #
-# Copyright (c) 1993-1999 by Sun Microsystems, Inc.
-# All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
 #
 # lib/libsec/Makefile
 
-include		../../Makefile.master
 include		../Makefile.lib
 
-SUBDIRS=	spec .WAIT $(MACH) $(BUILD64) $(MACH64)
+HDRDIR=		common
+HDRS=		aclutils.h
+SUBDIRS=	$(MACH) $(BUILD64) $(MACH64)
 
 all :=		TARGET= all
 clean :=	TARGET= clean
 clobber :=	TARGET= clobber
-delete :=	TARGET= delete
 install :=	TARGET= install
 lint :=		TARGET= lint
-_msg :=		TARGET= _msg
-package :=	TARGET= package
 
-LIBRARY= 	libsec.a
-TEXT_DOMAIN=	SUNW_OST_OSLIB
-XGETFLAGS=	-a
-POFILE=		$(LIBRARY:.a=.po)
-POFILES=	generic.po
-
-SED=	sed
-GREP=	grep
-CP=	cp
+MSGFILES=	common/acltext.c common/aclutils.c common/aclmode.c \
+		common/aclsort.c common/aclcheck.c
+POFILE=		libsec.po
 
 .KEEP_STATE:
 
-all clean clobber delete install lint package: $(SUBDIRS)
+all clean clobber install: spec .WAIT $(SUBDIRS) 
+
+$(POFILE):	pofile_MSGFILES
 
-# definitions for install_h target
-HDRS=
-ROOTHDRDIR=	$(ROOT)/usr/include
-ROOTHDRS=	$(HDRS:%=$(ROOTHDRDIR)/%)
-CHECKHDRS=	$(HDRS:%.h=%.check)
+lint: $(SUBDIRS)
 
 # install rule for install_h target
-$(ROOTHDRDIR)/%: common/%
-	$(INS.file)
 
 install_h: $(ROOTHDRS)
 
 check: $(CHECKHDRS)
 
-$(MACH) $(MACH64) spec:	FRC
-	@cd $@; pwd; $(MAKE) $(TARGET)
-
-_msg:	$(MSGDOMAIN) $(POFILE)
-	$(RM) $(MSGDOMAIN)/$(POFILE)
-	$(CP) $(POFILE) $(MSGDOMAIN)
+_msg: $(MSGDOMAINPOFILE)
 
-$(POFILE):	$(POFILES)
-	$(RM) $@
-	$(CAT) $(POFILES) > $@
-
-$(POFILES):
-	$(RM) messages.po
-	$(XGETTEXT) $(XGETFLAGS) *.[ch]* */*.[ch]*
-	$(SED) -e '/^# msg/d' -e '/^domain/d' messages.po > $@
-	$(RM) messages.po
+$(SUBDIRS) spec: FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
 
 FRC:
+
+include ../Makefile.targ
+include ../../Makefile.msg.targ
diff --git a/usr/src/lib/libsec/Makefile.com b/usr/src/lib/libsec/Makefile.com
index b619caa0b7d7..0cf7541cd02b 100644
--- a/usr/src/lib/libsec/Makefile.com
+++ b/usr/src/lib/libsec/Makefile.com
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -31,48 +31,40 @@
 LIBRARY= libsec.a
 VERS= .1
 
-OBJECTS=	\
-	aclcheck.o	\
-	aclmode.o	\
-	aclsort.o	\
-	acltext.o
+OBJS_SHARED= acl_common.o
+OBJS_COMMON= aclcheck.o aclmode.o aclsort.o acltext.o aclutils.o
+OBJECTS= $(OBJS_COMMON) $(OBJS_SHARED)
 
 # include library definitions
 include ../../Makefile.lib
 
-# install this library in the root filesystem
-include ../../Makefile.rootfs
-
-MAPFILE=	$(MAPDIR)/mapfile
-MAPOPTS=	$(MAPFILE:%=-M %)
-SRCS=		$(OBJECTS:%.o=../common/%.c)
-
 LIBS =		$(DYNLIB) $(LINTLIB)
 
-$(LINTLIB):= SRCS=../common/llib-lsec
-
-LINTSRC=	$(LINTLIB:%.ln=%)
-
 CFLAGS +=	$(CCVERBOSE)
+CPPFLAGS +=	-I$(SRCDIR) -I../../../common/acl
 DYNFLAGS +=	$(MAPOPTS)
-LDLIBS += -lc
+LDLIBS += -lc 
 
-.KEEP_STATE:
+# install this library in the root filesystem
+include ../../Makefile.rootfs
 
-lint: lintcheck
+SRCS=		$(OBJS_COMMON:%.o=$(SRCDIR)/%.c) \
+		 $(OBJS_SHARED:%.o=$(SRC)/common/acl/%.c)
 
-$(DYNLIB):	$(MAPFILE)
+$(LINTLIB):= SRCS=	$(SRCDIR)/$(LINTSRC)
 
-$(MAPFILE):
-	@cd $(MAPDIR); $(MAKE) mapfile
+SRCDIR=		../common
+MAPDIR=		../spec/$(TRANSMACH)
+SPECMAPFILE=	$(MAPDIR)/mapfile
 
-# include library targets
-include ../../Makefile.targ
+.KEEP_STATE:
 
-pics/%.o: ../common/%.c
+all: $(LIBS)
+
+lint: lintcheck
+
+pics/%.o: ../../../common/acl/%.c
 	$(COMPILE.c) -o $@ $<
 	$(POST_PROCESS_O)
 
-# install rule for lint library target
-$(ROOTLINTDIR)/%:	../common/%
-	$(INS.file)
+include ../../Makefile.targ
diff --git a/usr/src/lib/libsec/common/aclcheck.c b/usr/src/lib/libsec/common/aclcheck.c
index 75c1a6cf56a9..6b1a12d6d922 100644
--- a/usr/src/lib/libsec/common/aclcheck.c
+++ b/usr/src/lib/libsec/common/aclcheck.c
@@ -20,7 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1993-1997 by Sun Microsystems, Inc.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -42,6 +43,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/acl.h>
+#include <aclutils.h>
 
 struct entry {
 	int	count;
@@ -66,8 +68,8 @@ struct entry_stat {
 static void free_mem(struct entry_stat *);
 static int check_dup(int, uid_t *, uid_t, struct entry_stat *);
 
-int
-aclcheck(aclent_t *aclbufp, int nentries, int *which)
+static int
+aclent_aclcheck(aclent_t *aclbufp, int nentries,  int *which, int isdir)
 {
 	struct entry_stat	tally;
 	aclent_t		*aclentp;
@@ -82,10 +84,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case USER_OBJ:
 			/* check uniqueness */
 			if (tally.user_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (USER_ERROR);
+				return (EACL_USER_ERROR);
 			}
 			tally.user_obj.count = 1;
 			break;
@@ -93,10 +95,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case GROUP_OBJ:
 			/* check uniqueness */
 			if (tally.group_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (GRP_ERROR);
+				return (EACL_GRP_ERROR);
 			}
 			tally.group_obj.count = 1;
 			break;
@@ -104,10 +106,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case OTHER_OBJ:
 			/* check uniqueness */
 			if (tally.other_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (OTHER_ERROR);
+				return (EACL_OTHER_ERROR);
 			}
 			tally.other_obj.count = 1;
 			break;
@@ -115,10 +117,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case CLASS_OBJ:
 			/* check uniqueness */
 			if (tally.class_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (CLASS_ERROR);
+				return (EACL_CLASS_ERROR);
 			}
 			tally.class_obj.count = 1;
 			break;
@@ -145,12 +147,12 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 			if (cnt == 0) {
 				*idp = calloc(nentries, sizeof (uid_t));
 				if (*idp == NULL)
-					return (MEM_ERROR);
+					return (EACL_MEM_ERROR);
 			} else {
 				if (check_dup(cnt, *idp, aclentp->a_id,
 				    &tally) == -1) {
-					*which = (int) (aclentp - aclbufp);
-					return (DUPLICATE_ERROR);
+					*which = (int)(aclentp - aclbufp);
+					return (EACL_DUPLICATE_ERROR);
 				}
 			}
 			(*idp)[cnt] = aclentp->a_id;
@@ -159,10 +161,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case DEF_USER_OBJ:
 			/* check uniqueness */
 			if (tally.def_user_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (USER_ERROR);
+				return (EACL_USER_ERROR);
 			}
 			tally.def_user_obj.count = 1;
 			break;
@@ -170,10 +172,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case DEF_GROUP_OBJ:
 			/* check uniqueness */
 			if (tally.def_group_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (GRP_ERROR);
+				return (EACL_GRP_ERROR);
 			}
 			tally.def_group_obj.count = 1;
 			break;
@@ -181,10 +183,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case DEF_OTHER_OBJ:
 			/* check uniqueness */
 			if (tally.def_other_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (OTHER_ERROR);
+				return (EACL_OTHER_ERROR);
 			}
 			tally.def_other_obj.count = 1;
 			break;
@@ -192,10 +194,10 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		case DEF_CLASS_OBJ:
 			/* check uniqueness */
 			if (tally.def_class_obj.count > 0) {
-				*which = (int) (aclentp - aclbufp);
+				*which = (int)(aclentp - aclbufp);
 				(void) free_mem(&tally);
 				errno = EINVAL;
-				return (CLASS_ERROR);
+				return (EACL_CLASS_ERROR);
 			}
 			tally.def_class_obj.count = 1;
 			break;
@@ -203,8 +205,8 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		default:
 			(void) free_mem(&tally);
 			errno = EINVAL;
-			*which = (int) (aclentp - aclbufp);
-			return (ENTRY_ERROR);
+			*which = (int)(aclentp - aclbufp);
+			return (EACL_ENTRY_ERROR);
 		}
 	}
 	/* If there are group or user entries, there must be one class entry */
@@ -212,14 +214,14 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		if (tally.class_obj.count != 1) {
 			(void) free_mem(&tally);
 			errno = EINVAL;
-			return (MISS_ERROR);
+			return (EACL_MISS_ERROR);
 		}
 	/* same is true for default entries */
 	if (tally.def_user.count > 0 || tally.def_group.count > 0)
 		if (tally.def_class_obj.count != 1) {
 			(void) free_mem(&tally);
 			errno = EINVAL;
-			return (MISS_ERROR);
+			return (EACL_MISS_ERROR);
 		}
 
 	/* there must be exactly one user_obj, group_obj, and other_obj entry */
@@ -228,27 +230,43 @@ aclcheck(aclent_t *aclbufp, int nentries, int *which)
 		tally.other_obj.count != 1) {
 		(void) free_mem(&tally);
 		errno = EINVAL;
-		return (MISS_ERROR);
+		return (EACL_MISS_ERROR);
 	}
 
 	/* has default? same rules apply to default entries */
-	if (tally.def_user.count > 0 ||
-	    tally.def_user_obj.count > 0 ||
-	    tally.def_group.count > 0 ||
-	    tally.def_group_obj.count > 0 ||
-	    tally.def_class_obj.count > 0 ||
-	    tally.def_other_obj.count > 0)
+	if (tally.def_user.count > 0 || tally.def_user_obj.count > 0 ||
+	    tally.def_group.count > 0 || tally.def_group_obj.count > 0 ||
+	    tally.def_class_obj.count > 0 || tally.def_other_obj.count > 0) {
+
+		/*
+		 * Can't have default ACL's on non-directories
+		 */
+		if (isdir == 0) {
+			(void) free_mem(&tally);
+			errno = EINVAL;
+			return (EACL_INHERIT_NOTDIR);
+		}
+
 		if (tally.def_user_obj.count != 1 ||
 		    tally.def_group_obj.count != 1 ||
 		    tally.def_other_obj.count != 1) {
 			(void) free_mem(&tally);
 			errno = EINVAL;
-			return (MISS_ERROR);
+			return (EACL_MISS_ERROR);
 		}
+	}
+
 	(void) free_mem(&tally);
 	return (0);
 }
 
+/* Legacy interface: validate as for a directory (default entries allowed). */
+int
+aclcheck(aclent_t *aclbufp, int nentries, int *which)
+{
+	return (aclent_aclcheck(aclbufp, nentries, which, 1));
+}
+
 static void
 free_mem(struct entry_stat *tallyp)
 {
@@ -276,3 +294,99 @@ check_dup(int count, uid_t *ids, uid_t newid, struct entry_stat *tallyp)
 	}
 	return (0);
 }
+
+#define	IFLAGS	(ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE| \
+    ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+/* Validate each ace_t in aclp; returns 0 or the first EACL_* error hit. */
+static int
+ace_aclcheck(acl_t *aclp, int isdir)
+{
+	ace_t	*acep;
+	int	i;
+
+	/*
+	 * step through all valid flags.
+	 */
+
+	if (aclp->acl_cnt <= 0 || aclp->acl_cnt > MAX_ACL_ENTRIES)
+		return (EACL_COUNT_ERROR);
+
+	for (i = 0, acep = aclp->acl_aclp;
+	    i != aclp->acl_cnt; i++, acep++) {
+		switch (acep->a_flags & 0xf040) {
+		case 0:
+		case ACE_OWNER:
+		case ACE_EVERYONE:
+		case ACE_IDENTIFIER_GROUP:
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+			break;
+		default:
+			errno = EINVAL;
+			return (EACL_FLAGS_ERROR);
+		}
+
+		/*
+		 * Can't have inheritance on files.
+		 */
+		if ((acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) &&
+		    isdir == 0) {
+			errno = EINVAL;
+			return (EACL_INHERIT_NOTDIR);
+		}
+
+		/*
+		 * INHERIT_ONLY/NO_PROPAGATE also need INHERIT_FILE or
+		 * INHERIT_DIR.  A valid combination must not leave the
+		 * loop: the type and mask checks below still apply.
+		 */
+		if (acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+			    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+				errno = EINVAL;
+				return (EACL_INHERIT_ERROR);
+			}
+		}
+
+		switch (acep->a_type) {
+		case ACE_ACCESS_ALLOWED_ACE_TYPE:
+		case ACE_ACCESS_DENIED_ACE_TYPE:
+		case ACE_SYSTEM_AUDIT_ACE_TYPE:
+		case ACE_SYSTEM_ALARM_ACE_TYPE:
+			break;
+		default:
+			errno = EINVAL;
+			return (EACL_ENTRY_ERROR);
+		}
+		if (acep->a_access_mask > ACE_ALL_PERMS) {
+			errno = EINVAL;
+			return (EACL_PERM_MASK_ERROR);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Validate an acl_t of either flavor; "flag" says whether the target
+ * is a directory.  Returns 0 or an EACL_* error code.
+ */
+int
+acl_check(acl_t *aclp, int flag)
+{
+	int which;
+
+	if (aclp->acl_type == ACLENT_T) {
+		return (aclent_aclcheck(aclp->acl_aclp, aclp->acl_cnt,
+		    &which, flag));
+	}
+	if (aclp->acl_type == ACE_T)
+		return (ace_aclcheck(aclp, flag));
+
+	/* unknown flavor */
+	errno = EINVAL;
+	return (EACL_ENTRY_ERROR);
+}
diff --git a/usr/src/lib/libsec/common/acltext.c b/usr/src/lib/libsec/common/acltext.c
index da3195379c05..75b0dc785700 100644
--- a/usr/src/lib/libsec/common/acltext.c
+++ b/usr/src/lib/libsec/common/acltext.c
@@ -32,9 +32,15 @@
 #include <string.h>
 #include <limits.h>
 #include <stdlib.h>
+#include <errno.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/acl.h>
+#include <aclutils.h>
+#include <libintl.h>
+
+
+extern acl_t *acl_alloc(enum acl_type);
 
 /*
  * acltotext() converts each ACL entry to look like this:
@@ -64,8 +70,21 @@ static char *strappend(char *, char *);
 static char *convert_perm(char *, o_mode_t);
 static int increase_length(struct dynaclstr *, size_t);
 
-#define	FREE	free(aclp);\
-		free(allocp)
+/* Parse a decimal uid/gid string into *id; returns 0 or an EACL_* code. */
+static int
+acl_str_to_id(char *str, int *id)
+{
+	char *end;
+	long value;
+
+	errno = 0;	/* strtol() only sets errno on failure */
+	value = strtol(str, &end, 10);
+	if (errno != 0 || end == str || *end != '\0')
+		return (EACL_INVALID_USER_GROUP);
+
+	*id = (int)value;
+	return (0);
+}
 
 /*
  * Convert internal acl representation to external representation.
@@ -213,8 +232,8 @@ acltotext(aclent_t *aclp, int aclcnt)
  * The comma at the end is not prescribed by the man pages.
  * But it is needed not to break the old programs.
  */
-aclent_t *
-aclfromtext(char *aclstr, int *aclcnt)
+static int
+aclent_aclfromtext(char *aclstr, acl_t **ret_aclp)
 {
 	char		*fieldp;
 	char		*tp;
@@ -224,23 +243,29 @@ aclfromtext(char *aclstr, int *aclcnt)
 	int		entry_type;
 	int		id;
 	int		len;
+	int		error;
 	o_mode_t	perm;
 	aclent_t	*tmpaclp;
-	aclent_t	*aclp;
+	acl_t		*aclp;
 	struct group	*groupp;
 	struct passwd	*passwdp;
 
-	*aclcnt = 0;
 	aclp = NULL;
 
 	if (! aclstr)
 		return (NULL);
 
+	aclp = acl_alloc(ACLENT_T);
+	if (aclp == NULL) {
+		return (EACL_MEM_ERROR);
+	}
+
+	*ret_aclp = NULL;
+
 	len = strlen(aclstr);
 
 	if ((aclimport = allocp = strdup(aclstr)) == NULL) {
-		fprintf(stderr, "malloc() failed\n");
-		return (NULL);
+		return (EACL_MEM_ERROR);
 	}
 
 	if (aclimport[len - 1] == ',')
@@ -256,32 +281,33 @@ aclfromtext(char *aclstr, int *aclcnt)
 			nextp = tp + 1;
 		}
 
-		*aclcnt += 1;
+		aclp->acl_cnt += 1;
 
 		/*
 		 * get additional memory:
 		 * can be more efficient by allocating a bigger block
 		 * each time.
 		 */
-		if (*aclcnt > 1)
-			tmpaclp = (aclent_t *)realloc(aclp,
-			    sizeof (aclent_t) * (*aclcnt));
+		if (aclp->acl_cnt > 1)
+			tmpaclp = (aclent_t *)realloc(aclp->acl_aclp,
+			    sizeof (aclent_t) * (aclp->acl_cnt));
 		else
 			tmpaclp = (aclent_t *)malloc(sizeof (aclent_t));
 		if (tmpaclp == NULL) {
 			free(allocp);
-			if (aclp)
-				free(aclp);
-			return (NULL);
+			acl_free(aclp);
+			return (EACL_MEM_ERROR);
 		}
-		aclp = tmpaclp;
-		tmpaclp = aclp + (*aclcnt - 1);
+		aclp->acl_aclp = tmpaclp;
+		tmpaclp = (aclent_t *)aclp->acl_aclp + (aclp->acl_cnt - 1);
 
 		/* look for entry type field */
 		tp = strchr(aclimport, ':');
 		if (tp == NULL) {
-			FREE;
-			return (NULL);
+			free(allocp);
+			if (aclp)
+				acl_free(aclp);
+			return (EACL_ENTRY_ERROR);
 		} else
 			*tp = '\0';
 		if (strcmp(aclimport, "user") == 0) {
@@ -313,8 +339,9 @@ aclfromtext(char *aclstr, int *aclcnt)
 		else if (strcmp(aclimport, "defaultother") == 0)
 			entry_type = DEF_OTHER_OBJ;
 		else {
-			FREE;
-			return (NULL);
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_ENTRY_ERROR);
 		}
 
 		/* look for user/group name */
@@ -324,8 +351,9 @@ aclfromtext(char *aclstr, int *aclcnt)
 			fieldp = tp + 1;
 			tp = strchr(fieldp, ':');
 			if (tp == NULL) {
-				FREE;
-				return (NULL);
+				free(allocp);
+				acl_free(aclp);
+				return (EACL_INVALID_USER_GROUP);
 			} else
 				*tp = '\0';
 			if (fieldp != tp) {
@@ -341,32 +369,37 @@ aclfromtext(char *aclstr, int *aclcnt)
 					 * change. Use the friendlier interface
 					 * getpwnam().
 					 */
+					error = 0;
 					passwdp = getpwnam(fieldp);
 					if (passwdp == NULL) {
-						(void) fprintf(stderr,
-						"user %s not found\n", fieldp);
-						id = UID_NOBODY; /* nobody */
-					}
-					else
+						error = acl_str_to_id(fieldp,
+						    &id);
+					} else {
 						id = passwdp->pw_uid;
+					}
+
+					if (error) {
+						free(allocp);
+						acl_free(aclp);
+						return (error);
+					}
+
 				} else {
+					error = 0;
 					if (entry_type == GROUP ||
 					    entry_type == DEF_GROUP) {
 						groupp = getgrnam(fieldp);
 						if (groupp == NULL) {
-							(void) fprintf(stderr,
-							"group %s not found\n",
-							fieldp);
-							/* no group? */
-							id = GID_NOBODY;
+							error = acl_str_to_id(
+							    fieldp, &id);
 						}
 						else
 							id = groupp->gr_gid;
-					} else {
-						(void) fprintf(stderr,
-						"acl import errors\n");
-						FREE;
-						return (NULL);
+					}
+					if (error) {
+						free(allocp);
+						acl_free(aclp);
+						return (error);
 					}
 				}
 			} else {
@@ -390,8 +423,9 @@ aclfromtext(char *aclstr, int *aclcnt)
 		fieldp = tp + 1;
 		if (strlen(fieldp) != 3) {
 			/*  not "rwx" format */
-			FREE;
-			return (NULL);
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_PERM_MASK_ERROR);
 		} else {
 			char	s[] = "rwx";
 			int	mask = 0x04;
@@ -402,8 +436,9 @@ aclfromtext(char *aclstr, int *aclcnt)
 				if (fieldp[i] == s[i])
 					perm |= mask;
 				else if (fieldp[i] != '-') {
-					FREE;
-					return (NULL);
+					free(allocp);
+					acl_free(aclp);
+					return (EACL_PERM_MASK_ERROR);
 				}
 			}
 		}
@@ -414,9 +449,30 @@ aclfromtext(char *aclstr, int *aclcnt)
 		aclimport = nextp;
 	}
 	free(allocp);
-	return (aclp);
+	*ret_aclp = aclp;
+	return (0);
+}
+
+aclent_t *
+aclfromtext(char *aclstr, int *aclcnt)
+{
+	acl_t *aclp = NULL;
+	aclent_t *aclentp;
+	int error;
+
+	error = aclent_aclfromtext(aclstr, &aclp);
+	if (error || aclp == NULL)
+		return (NULL);
+
+	/* Detach the entry array and read the count before freeing aclp. */
+	aclentp = aclp->acl_aclp;
+	aclp->acl_aclp = NULL;
+	*aclcnt = aclp->acl_cnt;
+	acl_free(aclp);
+	return (aclentp);
+}
 
+
 static char *
 strappend(char *where, char *newstr)
 {
@@ -443,6 +499,129 @@ convert_perm(char *where, o_mode_t perm)
 	return (where);
 }
 
+/*
+ * Append the '/'-separated names of the bits set in perm at where and
+ * return the updated append position.
+ */
+static char *
+ace_convert_perm(char *where, mode_t perm, int isdir, int iflags)
+{
+	char *start = where;
+
+	/*
+	 * The following mnemonics share values; the first name is for
+	 * files and the second for directories:
+	 * ACE_READ_DATA/ACE_LIST_DIRECTORY
+	 * ACE_WRITE_DATA/ACE_ADD_FILE
+	 * ACE_APPEND_DATA/ACE_ADD_SUBDIRECTORY
+	 *
+	 * If the ACE is on a directory but iflags is exactly
+	 * ACE_FILE_INHERIT_ACE, print the file names only.
+	 */
+	if (isdir) {
+		if (perm & ACE_LIST_DIRECTORY) {
+			if (iflags == ACE_FILE_INHERIT_ACE)
+				where = strappend(where, "read_data/");
+			else
+				where = strappend(where,
+				    "list_directory/read_data/");
+		}
+		if (perm & ACE_ADD_FILE) {
+			if (iflags == ACE_FILE_INHERIT_ACE)
+				where = strappend(where, "write_data/");
+			else
+				where = strappend(where,
+				    "add_file/write_data/");
+		}
+		if (perm & ACE_ADD_SUBDIRECTORY) {
+			if (iflags == ACE_FILE_INHERIT_ACE)
+				where = strappend(where, "append_data/");
+			else
+				where = strappend(where,
+				    "add_subdirectory/append_data/");
+		}
+	} else {
+		if (perm & ACE_READ_DATA)
+			where = strappend(where, "read_data/");
+		if (perm & ACE_WRITE_DATA)
+			where = strappend(where, "write_data/");
+		if (perm & ACE_APPEND_DATA)
+			where = strappend(where, "append_data/");
+	}
+	if (perm & ACE_READ_NAMED_ATTRS)
+		where = strappend(where, "read_xattr/");
+	if (perm & ACE_WRITE_NAMED_ATTRS)
+		where = strappend(where, "write_xattr/");
+	if (perm & ACE_EXECUTE)
+		where = strappend(where, "execute/");
+	if (perm & ACE_DELETE_CHILD)
+		where = strappend(where, "delete_child/");
+	if (perm & ACE_READ_ATTRIBUTES)
+		where = strappend(where, "read_attributes/");
+	if (perm & ACE_WRITE_ATTRIBUTES)
+		where = strappend(where, "write_attributes/");
+	if (perm & ACE_DELETE)
+		where = strappend(where, "delete/");
+	if (perm & ACE_READ_ACL)
+		where = strappend(where, "read_acl/");
+	if (perm & ACE_WRITE_ACL)
+		where = strappend(where, "write_acl/");
+	if (perm & ACE_WRITE_OWNER)
+		where = strappend(where, "write_owner/");
+	if (perm & ACE_SYNCHRONIZE)
+		where = strappend(where, "synchronize");
+
+	if (*start != '\0' && start[strlen(start) - 1] == '/') {
+		start[strlen(start) - 1] = '\0';
+		where = start + strlen(start);
+	}
+	return (where);
+}
+
+/*
+ * Map one permission keyword to its ACE_* bit and OR that bit into
+ * *perm.  Returns 0 on success and 1 for an unrecognized keyword.
+ */
+int
+ace_permask(char *perm_tok, int *perm)
+{
+	static const struct {
+		const char	*keyword;
+		int		bit;
+	} permtab[] = {
+		{ "read_data",		ACE_READ_DATA },
+		{ "list_directory",	ACE_LIST_DIRECTORY },
+		{ "write_data",		ACE_WRITE_DATA },
+		{ "add_file",		ACE_ADD_FILE },
+		{ "append_data",	ACE_APPEND_DATA },
+		{ "add_subdirectory",	ACE_ADD_SUBDIRECTORY },
+		{ "read_xattr",		ACE_READ_NAMED_ATTRS },
+		{ "write_xattr",	ACE_WRITE_NAMED_ATTRS },
+		{ "execute",		ACE_EXECUTE },
+		{ "delete_child",	ACE_DELETE_CHILD },
+		{ "read_attributes",	ACE_READ_ATTRIBUTES },
+		{ "write_attributes",	ACE_WRITE_ATTRIBUTES },
+		{ "delete",		ACE_DELETE },
+		{ "read_acl",		ACE_READ_ACL },
+		{ "write_acl",		ACE_WRITE_ACL },
+		{ "write_owner",	ACE_WRITE_OWNER },
+		{ "synchronize",	ACE_SYNCHRONIZE }
+	};
+	const int	nperms = sizeof (permtab) / sizeof (permtab[0]);
+	int		idx;
+
+	/* keywords are unique, so the first match is the only match */
+	for (idx = 0; idx < nperms; idx++) {
+		if (strcmp(perm_tok, permtab[idx].keyword) == 0) {
+			*perm |= permtab[idx].bit;
+			return (0);
+		}
+	}
+
+	/* no keyword matched */
+	return (1);
+}
+
 /*
  * Callers should check the return code as this routine may change the string
  * pointer in dynaclstr.
@@ -462,3 +641,537 @@ increase_length(struct dynaclstr *dacl, size_t increase)
 	} else
 		return (0);
 }
+
+/*
+ * ace_acltotext() converts each ace formatted acl to look like this:
+ *
+ * entry_type:uid^gid^name:perms:allow^deny[:flags][,]
+ *
+ * The maximum length of entry_type is 5 ("group")
+ *
+ * The max length of a uid^gid^name entry (in theory) is 8, hence we use
+ * LOGNAME_MAX.
+ *
+ * The perms field (read_data/write_data...) of each acl entry is
+ * budgeted PERMS_LEN bytes.
+ *
+ * iflags: file_inherit/dir_inherit/inherit_only/no_propagate
+ *
+ */
+
+#define	ACE_ENTRYTYPLEN		6
+#define	IFLAGS_SIZE		51
+#define	ACCESS_TYPE_SIZE	5
+#define	COLON_CNT		3
+#define	PERMS_LEN		216
+#define	ACE_ENTRY_SIZE	(ACE_ENTRYTYPLEN + LOGNAME_MAX + PERMS_LEN +\
+    ACCESS_TYPE_SIZE + IFLAGS_SIZE + COLON_CNT)
+
+/*
+ * Append a user or group name to the export buffer.  ACE_ENTRY_SIZE only
+ * budgets LOGNAME_MAX bytes for a name, so the buffer is grown first when
+ * the name is longer than that.  Returns the new end of the string, or
+ * NULL if the buffer could not be grown.
+ */
+static char *
+ace_append_name(struct dynaclstr *dstr, char *where, char *name)
+{
+	size_t	namelen = strlen(name);
+
+	if (namelen > LOGNAME_MAX) {
+		if (increase_length(dstr, namelen - LOGNAME_MAX) != 1)
+			return (NULL);
+		/* the buffer may have moved, so locate its end again */
+		where = dstr->aclexport + strlen(dstr->aclexport);
+	}
+	return (strappend(where, name));
+}
+
+/*
+ * Render an ace_t style ACL as text.  Returns a malloc()ed string that
+ * the caller must free, or NULL on allocation failure or when an entry
+ * carries an unrecognized who-type.
+ */
+static char *
+ace_acltotext(acl_t *aceaclp)
+{
+	ace_t		*aclp = aceaclp->acl_aclp;
+	int		aclcnt = aceaclp->acl_cnt;
+	char		*aclexport;
+	char		*where;
+	char		*start;
+	struct group	*groupp;
+	struct passwd	*passwdp;
+	struct dynaclstr *dstr;
+	int		i;
+	int		isdir = (aceaclp->acl_flags & ACL_IS_DIR);
+
+	if (aclp == NULL)
+		return (NULL);
+	if ((dstr = malloc(sizeof (struct dynaclstr))) == NULL)
+		return (NULL);
+	dstr->bufsize = aclcnt * ACE_ENTRY_SIZE;
+	if ((dstr->aclexport = malloc(dstr->bufsize)) == NULL) {
+		free(dstr);
+		return (NULL);
+	}
+	*dstr->aclexport = '\0';
+	where = dstr->aclexport;
+
+	for (i = 0; i < aclcnt; i++, aclp++) {
+		switch (aclp->a_flags & 0xf040) {
+		case ACE_OWNER:
+		case 0:
+			if ((aclp->a_flags & 0xf040) == ACE_OWNER)
+				where = strappend(where, "owner@");
+			else
+				where = strappend(where, "user:");
+			if ((aclp->a_flags & 0xf040) == 0) {
+				passwdp = getpwuid(aclp->a_who);
+				if (passwdp == NULL) {
+					/* put in uid instead */
+					where += sprintf(where, "%d",
+					    aclp->a_who);
+				} else {
+					where = ace_append_name(dstr, where,
+					    passwdp->pw_name);
+					if (where == NULL)
+						goto fail;
+				}
+			}
+			where = strappend(where, ":");
+			break;
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		case ACE_IDENTIFIER_GROUP:
+			if ((aclp->a_flags & 0xf040) ==
+			    (ACE_GROUP | ACE_IDENTIFIER_GROUP))
+				where = strappend(where, "group@");
+			else
+				where = strappend(where, "group:");
+			if (!(aclp->a_flags & ACE_GROUP)) {
+				groupp = getgrgid(aclp->a_who);
+				if (groupp == NULL) {
+					/* put in gid instead */
+					where += sprintf(where, "%d",
+					    aclp->a_who);
+				} else {
+					where = ace_append_name(dstr, where,
+					    groupp->gr_name);
+					if (where == NULL)
+						goto fail;
+				}
+			}
+			where = strappend(where, ":");
+			break;
+		case ACE_EVERYONE:
+			where = strappend(where, "everyone@:");
+			break;
+		default:
+			goto fail;
+		}
+		where = ace_convert_perm(where, aclp->a_access_mask,
+		    isdir, (aclp->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)));
+		where = strappend(where,
+		    (aclp->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) ?
+		    ":allow" : ":deny");
+
+		/*
+		 * slap on inheritance flags if we have any
+		 */
+
+		if (aclp->a_flags & 0xf) {
+			where = strappend(where, ":");
+			start = where;
+			if (aclp->a_flags & ACE_FILE_INHERIT_ACE)
+				where = strappend(where, "file_inherit/");
+			if (aclp->a_flags & ACE_DIRECTORY_INHERIT_ACE)
+				where = strappend(where, "dir_inherit/");
+			if (aclp->a_flags & ACE_NO_PROPAGATE_INHERIT_ACE)
+				where = strappend(where, "no_propagate/");
+			if (aclp->a_flags & ACE_INHERIT_ONLY_ACE)
+				where = strappend(where, "inherit_only");
+
+			/*
+			 * chop off trailing slash, if present
+			 */
+			if (start[strlen(start) - 1] == '/') {
+				start[strlen(start) - 1] = '\0';
+				where = start + strlen(start);
+			}
+		}
+		if (i < aclcnt - 1)
+			where = strappend(where, ",");
+	}
+	aclexport = dstr->aclexport;
+	free(dstr);
+	return (aclexport);
+
+fail:
+	free(dstr->aclexport);
+	free(dstr);
+	return (NULL);
+}
+
+/*
+ * Parse '/'-separated inheritance keywords into *iflags; 0 if all valid.
+ */
+static int
+build_iflags(char *str, int *iflags)
+{
+	char *word = strtok(str, "/");
+
+	*iflags = 0;
+	if (word == NULL)
+		return (1);
+
+	for (; word != NULL; word = strtok(NULL, "/")) {
+		if (strcmp(word, "file_inherit") == 0)
+			*iflags |= ACE_FILE_INHERIT_ACE;
+		else if (strcmp(word, "dir_inherit") == 0)
+			*iflags |= ACE_DIRECTORY_INHERIT_ACE;
+		else if (strcmp(word, "inherit_only") == 0)
+			*iflags |= ACE_INHERIT_ONLY_ACE;
+		else if (strcmp(word, "no_propagate") == 0)
+			*iflags |= ACE_NO_PROPAGATE_INHERIT_ACE;
+		else
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Convert external acl representation to internal representation.
+ * The accepted syntax is: <acl_entry>[,<acl_entry>]*[,]
+ * The comma at the end is not prescribed by the man pages.
+ * But it is needed not to break the old programs.
+ */
+
+int
+ace_aclfromtext(char *aclstr, acl_t **ret_aclp)
+{
+	char		*fieldp;
+	char		*tp;
+	char		*nextp;
+	char		*allocp;
+	char		*aclimport;
+	char 		*str;
+	char		*perm_tok;
+	int		entry_type;
+	int		id;
+	int		type;
+	int		iflags;
+	int		len;
+	int		error;
+	int32_t		perm;
+	ace_t		*tmpaclp;
+	acl_t		*aclp;
+	struct group	*groupp;
+	struct passwd	*passwdp;
+
+	if (! aclstr)
+		return (EACL_INVALID_STR);
+
+	*ret_aclp = NULL;
+
+	len = strlen(aclstr);
+
+	aclp = acl_alloc(ACE_T);
+	if (aclp == NULL) {
+		return (EACL_MEM_ERROR);
+	}
+
+	if ((aclimport = allocp = strdup(aclstr)) == NULL) {
+		acl_free(aclp);
+		return (EACL_MEM_ERROR);
+	}
+
+	if (len > 0 && aclimport[len - 1] == ',')
+		aclimport[len - 1] = '\0';
+
+	for (; aclimport; ) {
+		/* look for an ACL entry */
+		tp = strchr(aclimport, ',');
+		if (tp == NULL) {
+			nextp = NULL;
+		} else {
+			*tp = '\0';
+			nextp = tp + 1;
+		}
+
+		aclp->acl_cnt += 1;
+
+		/*
+		 * get additional memory:
+		 * can be more efficient by allocating a bigger block
+		 * each time.
+		 */
+		if (aclp->acl_cnt > 1)
+			tmpaclp = (ace_t *)realloc(aclp->acl_aclp,
+			    sizeof (ace_t) * (aclp->acl_cnt));
+		else
+			tmpaclp = (ace_t *)malloc(sizeof (ace_t));
+		if (tmpaclp == NULL) {
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_MEM_ERROR);
+		}
+		aclp->acl_aclp = tmpaclp;
+		tmpaclp = (ace_t *)aclp->acl_aclp + (aclp->acl_cnt - 1);
+
+		/* look for entry type field */
+		tp = strchr(aclimport, ':');
+		if (tp == NULL) {
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_ENTRY_ERROR);
+		} else
+			*tp = '\0';
+		if (strcmp(aclimport, "owner@") == 0) {
+			entry_type = ACE_OWNER;
+		} else if (strcmp(aclimport, "group@") == 0) {
+			entry_type = ACE_GROUP | ACE_IDENTIFIER_GROUP;
+		} else if (strcmp(aclimport, "everyone@") == 0) {
+			entry_type = ACE_EVERYONE;
+		} else if (strcmp(aclimport, "group") == 0) {
+			entry_type = ACE_IDENTIFIER_GROUP;
+		} else if (strcmp(aclimport, "user") == 0) {
+			entry_type = 0;
+		} else {
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_ENTRY_ERROR);
+		}
+
+		/*
+		 * If not an abstraction owner@, group@ or everyone@
+		 * then we must have a user/group name next.  Entries
+		 * without a name get an id of -1.
+		 */
+
+		id = -1;
+		if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) {
+			fieldp = tp + 1;
+			tp = strchr(fieldp, ':');
+			if (tp == NULL) {
+				free(allocp);
+				acl_free(aclp);
+				return (EACL_INVALID_USER_GROUP);
+			} else
+				*tp = '\0';
+			if (fieldp != tp) {
+				/*
+				 * The second field could be empty. We only care
+				 * when the field has user/group name.
+				 */
+				if (entry_type == 0) {
+					/*
+					 * The reentrant interface getpwnam_r()
+					 * is uncommitted and subject to
+					 * change. Use the friendlier interface
+					 * getpwnam().
+					 */
+					error = 0;
+					passwdp = getpwnam(fieldp);
+					if (passwdp == NULL) {
+						error = acl_str_to_id(
+						    fieldp, &id);
+					} else {
+						id = passwdp->pw_uid;
+					}
+
+					if (error) {
+						free(allocp);
+						acl_free(aclp);
+						return (error);
+					}
+				} else {
+					error = 0;
+					if (entry_type ==
+					    ACE_IDENTIFIER_GROUP) {
+						groupp = getgrnam(fieldp);
+						if (groupp == NULL) {
+							/* no group? */
+							error = acl_str_to_id(
+							    fieldp, &id);
+						} else
+							id = groupp->gr_gid;
+
+					} else if ((entry_type == ACE_OWNER) ||
+					    (entry_type ==
+					    (ACE_IDENTIFIER_GROUP|ACE_GROUP)) ||
+					    (entry_type != ACE_EVERYONE)) {
+						error = EACL_FIELD_NOT_BLANK;
+					} else {
+						error = EACL_ENTRY_ERROR;
+					}
+
+					if (error) {
+						free(allocp);
+						acl_free(aclp);
+						return (error);
+					}
+				}
+			}
+		}
+
+		/* next field: permission */
+		fieldp = tp + 1;
+		tp = strchr(fieldp, ':');
+		if (tp == NULL) {
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_PERM_MASK_ERROR);
+		} else
+			*tp = '\0';
+
+		perm = 0;
+
+		perm_tok = strtok(fieldp, "/");
+		if (perm_tok == NULL) {
+			perm = 0;
+		} else {
+			do {
+				if (ace_permask(perm_tok, &perm) != 0) {
+					free(allocp);
+					acl_free(aclp);
+					return (EACL_PERM_MASK_ERROR);
+				}
+			} while (perm_tok = strtok(NULL, "/"));
+		}
+
+		/* grab allow/deny */
+		fieldp = tp + 1;
+		tp = strchr(fieldp, ':');
+		if (tp != NULL)
+			*tp = '\0';
+
+		if (strcmp(fieldp, "allow") == 0)
+			type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+		else if (strcmp(fieldp, "deny") == 0)
+			type = ACE_ACCESS_DENIED_ACE_TYPE;
+		else {
+			free(allocp);
+			acl_free(aclp);
+			return (EACL_INVALID_ACCESS_TYPE);
+		}
+
+		/* grab option inherit flags */
+
+		iflags = 0;
+		if (tp != NULL) {
+			fieldp = tp + 1;
+			if (fieldp != NULL) {
+				*tp = '\0';
+				str = fieldp;
+				if (build_iflags(str, &iflags) != 0) {
+					free(allocp);
+					acl_free(aclp);
+					return (EACL_INHERIT_ERROR);
+				}
+			} else {
+				free(allocp);
+				acl_free(aclp);
+				return (EACL_UNKNOWN_DATA);
+			}
+		}
+		/* slap fields into ace_t structure */
+
+		tmpaclp->a_flags = entry_type;
+		tmpaclp->a_flags |= iflags;
+		tmpaclp->a_who = id;
+		tmpaclp->a_access_mask = perm;
+		tmpaclp->a_type = type;
+		aclimport = nextp;
+	}
+	free(allocp);
+	*ret_aclp = aclp;
+	return (0);
+}
+
+/* Render either ACL flavor as text; NULL for a NULL or unknown-type ACL. */
+char *
+acl_totext(acl_t *aclp)
+{
+	char *text = NULL;
+
+	if (aclp == NULL)
+		return (NULL);
+	if (aclp->acl_type == ACE_T)
+		text = ace_acltotext(aclp);
+	else if (aclp->acl_type == ACLENT_T)
+		text = acltotext(aclp->acl_aclp, aclp->acl_cnt);
+	return (text);
+}
+
+/* Parse ACL text of either flavor into *ret_aclp; 0 or an error code. */
+int
+acl_fromtext(const char *acltextp, acl_t **ret_aclp)
+{
+	acl_t *aclp = NULL;
+	char *token;
+	char *ptr;
+	char *textp;
+	enum acl_type flavor;
+	int colon_cnt = 0;
+	int error;
+
+	if (acltextp == NULL)
+		return (EACL_INVALID_STR);
+
+	/*
+	 * Detect the flavor from the number of colons in the first entry.
+	 * aclent_t can have 1, 2 or 3 colons; if 3 then it must start
+	 * with the word "default".
+	 * ace_t can have 2, 3 or 4; with 1 or 2 colons the text is only
+	 * taken as ace_t when it starts with owner@, group@ or everyone@.
+	 */
+	textp = strdup(acltextp);
+	if (textp == NULL)
+		return (-1);
+
+	token = strtok(textp, ",");
+	if (token == NULL) {
+		free(textp);
+		return (-1);
+	}
+
+	for (ptr = token; *ptr; ptr++) {
+		if (*ptr == ':')
+			colon_cnt++;
+	}
+
+	if (colon_cnt == 1 || colon_cnt == 2) {
+		if ((strncmp(acltextp, "owner@", 6) == 0) ||
+		    (strncmp(acltextp, "group@", 6) == 0) ||
+		    (strncmp(acltextp, "everyone@", 9) == 0))
+			flavor = ACE_T;
+		else
+			flavor = ACLENT_T;
+	} else if (colon_cnt == 3) {
+		ptr = strtok(token, ":");
+		if (ptr == NULL) {
+			free(textp);
+			return (EACL_MISSING_FIELDS);
+		} else if (strcmp(ptr, "default") == 0) {
+			flavor = ACLENT_T;
+		} else {
+			flavor = ACE_T;
+		}
+	} else if (colon_cnt == 4) {
+		flavor = ACE_T;
+	} else {
+		free(textp);
+		return (EACL_MISSING_FIELDS);
+	}
+
+	free(textp);
+
+	if (flavor == ACLENT_T)
+		error = aclent_aclfromtext((char *)acltextp, &aclp);
+	else
+		error = ace_aclfromtext((char *)acltextp, &aclp);
+
+	*ret_aclp = aclp;
+	return (error);
+}
diff --git a/usr/src/lib/libsec/common/aclutils.c b/usr/src/lib/libsec/common/aclutils.c
new file mode 100644
index 000000000000..f3c8856054f2
--- /dev/null
+++ b/usr/src/lib/libsec/common/aclutils.c
@@ -0,0 +1,1436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <grp.h>
+#include <pwd.h>
+#include <sys/types.h>
+#include <sys/acl.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <locale.h>
+#include <aclutils.h>
+#include <acl_common.h>
+
+/* Tells cacl_get()/cacl_set() whether they were handed a path or an fd. */
+#define	ACL_PATH	0
+#define	ACL_FD		1
+
+/* Permission bits that ace_make_deny() flips when deriving a deny ACE. */
+#define	ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+    ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+    ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+/*
+ * Policy flags for access_mask_set().  Each flag names one permission
+ * and whether it would be placed in an ALLOW or a DENY entry;
+ * access_mask_set() builds a mask of the flags it honours and tests the
+ * flag selected for the current entry against that mask.
+ * Every flag is defined exactly once.
+ */
+#define	ACL_SYNCHRONIZE_SET_ALLOW		0x0000002
+#define	ACL_SYNCHRONIZE_SET_DENY		0x0000001
+
+#define	ACL_WRITE_OWNER_SET_ALLOW		0x0000020
+#define	ACL_WRITE_OWNER_SET_DENY		0x0000010
+#define	ACL_WRITE_OWNER_ERR_DENY		0x0000040
+
+#define	ACL_WRITE_ATTRS_OWNER_SET_ALLOW		0x0002000
+#define	ACL_WRITE_ATTRS_OWNER_SET_DENY		0x0001000
+
+#define	ACL_WRITE_ATTRS_WRITER_SET_ALLOW	0x0020000
+#define	ACL_WRITE_ATTRS_WRITER_SET_DENY		0x0010000
+
+#define	ACL_DELETE_SET_ALLOW			0x0000200
+#define	ACL_DELETE_SET_DENY			0x0000100
+
+#define	ACL_READ_NAMED_READER_SET_ALLOW		0x2000000
+#define	ACL_READ_NAMED_READER_SET_DENY		0x1000000
+
+#define	ACL_WRITE_NAMED_WRITER_SET_ALLOW	0x0200000
+#define	ACL_WRITE_NAMED_WRITER_SET_DENY		0x0100000
+
+/*
+ * Target of an ACL get/set request: cacl_get() and cacl_set() read the
+ * file member for ACL_PATH requests and the fd member otherwise.
+ */
+typedef union {
+	const char	*file;
+	int		fd;
+} acl_inp;
+
+/*
+ * Allocate an empty acl_t of the requested style.
+ *
+ * The new ACL has no entries (acl_aclp NULL, acl_cnt 0), no flags, and
+ * an acl_entry_size matching the style.  Returns NULL if memory cannot be
+ * allocated or type is neither ACE_T nor ACLENT_T.  Release the result
+ * with acl_free().
+ */
+acl_t *
+acl_alloc(enum acl_type type)
+{
+	acl_t *aclp;
+
+	aclp = malloc(sizeof (acl_t));
+
+	if (aclp == NULL)
+		return (NULL);
+
+	aclp->acl_aclp = NULL;
+	aclp->acl_cnt = 0;
+	/* start with no flags so acl_flags() never returns garbage */
+	aclp->acl_flags = 0;
+
+	switch (type) {
+	case ACE_T:
+		aclp->acl_type = ACE_T;
+		aclp->acl_entry_size = sizeof (ace_t);
+		break;
+	case ACLENT_T:
+		aclp->acl_type = ACLENT_T;
+		aclp->acl_entry_size = sizeof (aclent_t);
+		break;
+	default:
+		/* unknown style: hand nothing back */
+		acl_free(aclp);
+		aclp = NULL;
+	}
+	return (aclp);
+}
+
+/*
+ * Release an acl_t together with the entry array it points to.
+ * A NULL aclp is ignored.
+ */
+void
+acl_free(acl_t *aclp)
+{
+	if (aclp != NULL) {
+		/* free(NULL) is a no-op, so an empty ACL needs no test */
+		free(aclp->acl_aclp);
+		free(aclp);
+	}
+}
+
+/*
+ * Report whether the ACL on a file is trivial.
+ * returns:	0 = trivial (also when the entry count query does not
+ *		    yield a value greater than zero)
+ *		1 = nontrivial
+ *		-1 = pathconf(), malloc() or the ACE_GETACL request failed
+ */
+int
+acl_trivial(const char *filename)
+{
+	int flavor;
+	int is_ace;
+	int nentries;
+	int result;
+	ace_t *aces;
+
+	flavor = pathconf(filename, _PC_ACL_ENABLED);
+	if (flavor == -1)
+		return (-1);
+
+	is_ace = (flavor == _ACL_ACE_ENABLED);
+
+	nentries = acl(filename,
+	    is_ace ? ACE_GETACLCNT : GETACLCNT, 0, NULL);
+	if (nentries <= 0)
+		return (0);
+
+	/* aclent_t style: only the entry count matters */
+	if (!is_ace)
+		return ((nentries > MIN_ACL_ENTRIES) ? 1 : 0);
+
+	/* ace_t style: anything other than six entries is nontrivial */
+	if (nentries != 6)
+		return (1);
+
+	aces = malloc(sizeof (ace_t) * nentries);
+	if (aces == NULL)
+		return (-1);
+
+	if (acl(filename, ACE_GETACL, nentries, aces) < 0) {
+		free(aces);
+		return (-1);
+	}
+
+	result = ace_trivial(aces, nentries);
+	free(aces);
+	return (result);
+}
+
+/*
+ * Compute the ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_READ_NAMED_ATTRS and ACE_WRITE_NAMED_ATTRS
+ * bits for one ACE.
+ *
+ * For each permission an ACL_*_SET_* policy flag is chosen from the kind
+ * of entry (isallow) and the subject (isowner, haswriteperm,
+ * hasreadperm); the permission bit is emitted only if that flag is part
+ * of the policy mask built below.
+ */
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+	const int policy = ACL_SYNCHRONIZE_SET_ALLOW |
+	    ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+	    ACL_WRITE_ATTRS_WRITER_SET_DENY;
+	uint32_t bits = 0;
+	int sync_flag, owner_flag, delete_flag;
+	int attrs_flag = 0;
+	int rnamed_flag = 0;
+	int wnamed_flag = 0;
+
+	if (isallow) {
+		sync_flag = ACL_SYNCHRONIZE_SET_ALLOW;
+		owner_flag = ACL_WRITE_OWNER_SET_ALLOW;
+		delete_flag = ACL_DELETE_SET_ALLOW;
+		if (hasreadperm)
+			rnamed_flag = ACL_READ_NAMED_READER_SET_ALLOW;
+		if (haswriteperm)
+			wnamed_flag = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+		if (isowner)
+			attrs_flag = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+		else if (haswriteperm)
+			attrs_flag = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+	} else {
+		sync_flag = ACL_SYNCHRONIZE_SET_DENY;
+		owner_flag = ACL_WRITE_OWNER_SET_DENY;
+		delete_flag = ACL_DELETE_SET_DENY;
+		if (hasreadperm)
+			rnamed_flag = ACL_READ_NAMED_READER_SET_DENY;
+		if (haswriteperm)
+			wnamed_flag = ACL_WRITE_NAMED_WRITER_SET_DENY;
+		if (isowner) {
+			attrs_flag = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+		} else if (haswriteperm) {
+			attrs_flag = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+		} else {
+			/*
+			 * Neither the owner nor a writer: the DENY ACE
+			 * always carries ACE_WRITE_ATTRIBUTES.
+			 */
+			bits |= ACE_WRITE_ATTRIBUTES;
+		}
+	}
+
+	if (policy & sync_flag)
+		bits |= ACE_SYNCHRONIZE;
+	if (policy & owner_flag)
+		bits |= ACE_WRITE_OWNER;
+	if (policy & delete_flag)
+		bits |= ACE_DELETE;
+	if (policy & attrs_flag)
+		bits |= ACE_WRITE_ATTRIBUTES;
+	if (policy & rnamed_flag)
+		bits |= ACE_READ_NAMED_ATTRS;
+	if (policy & wnamed_flag)
+		bits |= ACE_WRITE_NAMED_ATTRS;
+
+	return (bits);
+}
+
+/*
+ * Translate the low three permission bits of a mode_t (read 04,
+ * write 02, execute 01) into an ACE access mask.
+ *
+ * isdir adds ACE_DELETE_CHILD to write permission, isowner marks the
+ * owner's entry, and isallow selects between the ALLOW and DENY form.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow)
+{
+	int can_read = ((mode & 04) != 0);
+	int can_write = ((mode & 02) != 0);
+	int rd, wr;
+	uint32_t mask;
+
+	/* a DENY entry is driven by the permissions that are absent */
+	if (isallow) {
+		rd = can_read;
+		wr = can_write;
+	} else {
+		rd = !can_read;
+		wr = !can_write;
+	}
+
+	/*
+	 * access_mask_set() supplies ACE_SYNCHRONIZE, ACE_WRITE_OWNER,
+	 * ACE_DELETE, ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS and
+	 * ACE_READ_NAMED_ATTRS.
+	 */
+	mask = access_mask_set(wr, rd, isowner, isallow);
+
+	if (isallow)
+		mask |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+
+	/* ACE_WRITE_ACL: allowed to the owner, denied to everybody else */
+	if ((isallow && isowner) || (!isallow && !isowner))
+		mask |= ACE_WRITE_ACL;
+
+	if (can_read)
+		mask |= ACE_READ_DATA;
+
+	if (can_write) {
+		mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
+		if (isdir)
+			mask |= ACE_DELETE_CHILD;
+	}
+
+	if (mode & 01)
+		mask |= ACE_EXECUTE;
+
+	return (mask);
+}
+
+/*
+ * Build the DENY counterpart of an ace (presumably an ALLOW entry) in
+ * the slot at "deny".
+ *
+ * The deny entry starts as a copy of the allow entry, has the POSIX
+ * supported bits (plus ACE_DELETE_CHILD for directories) flipped, and
+ * gets its remaining special bits recomputed by access_mask_set().
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+	const uint32_t special = ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+	    ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+	    ACE_WRITE_NAMED_ATTRS;
+	int could_write = (allow->a_access_mask & ACE_WRITE_DATA);
+	int could_read = (allow->a_access_mask & ACE_READ_DATA);
+
+	/* a_who, a_flags and the mask all carry over from the allow ace */
+	(void) memcpy(deny, allow, sizeof (ace_t));
+	deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+
+	deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+	if (isdir)
+		deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+	/* drop the special bits, then add back the ones a deny needs */
+	deny->a_access_mask &= ~special;
+	deny->a_access_mask |= access_mask_set(could_write, could_read,
+	    isowner, B_FALSE);
+}
+/*
+ * First pass over an array of aclent_t's.  Reports whether a mask
+ * (CLASS_OBJ) entry exists and its permissions, how many user and group
+ * entries there are (GROUP_OBJ counts as a group), and whether the
+ * entries are out of a_type order and need sorting.
+ *
+ * Returns EINVAL if there is more than one mask entry, or no mask but
+ * more than one user/group entry; 0 otherwise.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+    int *hasmask, mode_t *mask,
+    int *numuser, int *numgroup, int *needsort)
+{
+	int highest = 0;	/* largest a_type seen so far */
+	int idx;
+	int type;
+
+	*hasmask = 0;
+	*mask = 07;
+	*needsort = 0;
+	*numuser = 0;
+	*numgroup = 0;
+
+	for (idx = 0; idx < n; idx++) {
+		type = aclent[idx].a_type;
+
+		if (type < highest)
+			*needsort = 1;
+		else if (type > highest)
+			highest = type;
+
+		if (type & USER)
+			(*numuser)++;
+		if (type & (GROUP | GROUP_OBJ))
+			(*numgroup)++;
+
+		if (type & CLASS_OBJ) {
+			/* a second mask entry is not acceptable */
+			if (*hasmask)
+				return (EINVAL);
+			*hasmask = 1;
+			*mask = aclent[idx].a_perm;
+		}
+	}
+
+	if (!*hasmask && (*numuser + *numgroup > 1))
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ *
+ * On success returns 0, stores the newly allocated array in *acepp and
+ * its entry count in *rescount; the caller frees the array.  On failure
+ * returns EINVAL (bad entry list) or ENOMEM (allocation failed), frees
+ * anything allocated here, and leaves *acepp and *rescount untouched.
+ * The input array is sorted in place when its entries are out of order.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+	int error = 0;
+	mode_t mask;
+	int numuser, numgroup, needsort;
+	int resultsize = 0;
+	int i, groupi = 0, skip;
+	ace_t *acep, *result = NULL;
+	int hasmask;
+
+	error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+	    &numuser, &numgroup, &needsort);
+	if (error != 0)
+		goto out;
+
+	/* allow + deny for each aclent */
+	resultsize = n * 2;
+	if (hasmask) {
+		/*
+		 * stick extra deny on the group_obj and on each
+		 * user|group for the mask (the group_obj was added
+		 * into the count for numgroup)
+		 */
+		resultsize += numuser + numgroup;
+		/* ... and don't count the mask itself */
+		resultsize -= 2;
+	}
+
+	/* sort the source if necessary */
+	if (needsort)
+		ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+	/*
+	 * The array is zero filled: the allow entries below only OR bits
+	 * into a_flags and never assign a_type, so they rely on that.
+	 */
+	result = acep = calloc(1, resultsize * sizeof (ace_t));
+	if (result == NULL) {
+		/* without this the caller would see success and no array */
+		error = ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < n; i++) {
+		/*
+		 * don't process CLASS_OBJ (mask); mask was grabbed in
+		 * ln_aent_preprocess()
+		 */
+		if (aclent[i].a_type & CLASS_OBJ)
+			continue;
+
+		/* If we need an ACL_MASK emulator, prepend it now */
+		if ((hasmask) &&
+		    (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+			acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+			acep->a_flags = 0;
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = -1;
+				acep->a_flags |=
+				    (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+			} else if (aclent[i].a_type & USER) {
+				acep->a_who = aclent[i].a_id;
+			} else {
+				acep->a_who = aclent[i].a_id;
+				acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			}
+			if (aclent[i].a_type & ACL_DEFAULT) {
+				acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+				    ACE_FILE_INHERIT_ACE |
+				    ACE_DIRECTORY_INHERIT_ACE;
+			}
+			/*
+			 * Set the access mask for the prepended deny
+			 * ace.  To do this, we invert the mask (found
+			 * in ln_aent_preprocess()) then convert it to an
+			 * DENY ace access_mask.
+			 */
+			acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+			    isdir, 0, 0);
+			acep += 1;
+		}
+
+		/* handle a_perm -> access_mask */
+		acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+		    isdir, aclent[i].a_type & USER_OBJ, 1);
+
+		/* emulate a default aclent */
+		if (aclent[i].a_type & ACL_DEFAULT) {
+			acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+			    ACE_FILE_INHERIT_ACE |
+			    ACE_DIRECTORY_INHERIT_ACE;
+		}
+
+		/*
+		 * handle a_perm and a_id
+		 *
+		 * this must be done last, since it involves the
+		 * corresponding deny aces, which are handled
+		 * differently for each different a_type.
+		 */
+		if (aclent[i].a_type & USER_OBJ) {
+			acep->a_who = -1;
+			acep->a_flags |= ACE_OWNER;
+			ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+			acep += 2;
+		} else if (aclent[i].a_type & USER) {
+			acep->a_who = aclent[i].a_id;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+			if (aclent[i].a_type & GROUP_OBJ) {
+				acep->a_who = -1;
+				acep->a_flags |= ACE_GROUP;
+			} else {
+				acep->a_who = aclent[i].a_id;
+			}
+			acep->a_flags |= ACE_IDENTIFIER_GROUP;
+			/*
+			 * Set the corresponding deny for the group ace.
+			 *
+			 * The deny aces go after all of the groups, unlike
+			 * everything else, where they immediately follow
+			 * the allow ace.
+			 *
+			 * We calculate "skip", the number of slots to
+			 * skip ahead for the deny ace, here.
+			 *
+			 * The pattern is:
+			 * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+			 * thus, skip is
+			 * (2 * numgroup) - 1 - groupi
+			 * (2 * numgroup) to account for MD + A
+			 * - 1 to account for the fact that we're on the
+			 * access (A), not the mask (MD)
+			 * - groupi to account for the fact that we have
+			 * passed up groupi number of MD's.
+			 */
+			skip = (2 * numgroup) - 1 - groupi;
+			ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+			/*
+			 * If we just did the last group, skip acep past
+			 * all of the denies; else, just move ahead one.
+			 */
+			if (++groupi >= numgroup)
+				acep += numgroup + 1;
+			else
+				acep += 1;
+		} else if (aclent[i].a_type & OTHER_OBJ) {
+			acep->a_who = -1;
+			acep->a_flags |= ACE_EVERYONE;
+			ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+			acep += 2;
+		} else {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	*acepp = result;
+	*rescount = resultsize;
+
+out:
+	if (error != 0) {
+		if ((result != NULL) && (resultsize > 0)) {
+			free(result);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Convert a complete aclent_t list into one ace_t array.
+ *
+ * The list is sorted in place, then split at the first entry carrying
+ * ACL_DEFAULT: the entries before it and the entries from it onward are
+ * converted separately by ln_aent_to_ace() and the two results are
+ * concatenated.  Default entries are only accepted when isdir is set.
+ *
+ * Returns 0 and stores a malloc'ed array in *retacep (count in
+ * *retacecnt) on success; returns -1 on any failure, with nothing left
+ * allocated.
+ */
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir,
+    ace_t **retacep, int *retacecnt)
+{
+	ace_t *acep = NULL;
+	ace_t *dfacep = NULL;
+	ace_t *newacep;
+	int acecnt = 0;
+	int dfacecnt = 0;
+	int dfaclstart = 0;
+	int dfaclcnt = 0;
+	aclent_t *aclp;
+	int i;
+	int error;
+
+	ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+	/* find the first default entry; i ends up as the access count */
+	for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+		if (aclp->a_type & ACL_DEFAULT)
+			break;
+	}
+
+	if (i < aclcnt) {
+		/* defaults occupy slots i .. aclcnt - 1 */
+		dfaclstart = i;
+		dfaclcnt = aclcnt - i;
+	}
+
+	if (dfaclcnt && isdir == 0) {
+		return (-1);
+	}
+
+	error = ln_aent_to_ace(aclentp, i,  &acep, &acecnt, isdir);
+	if (error)
+		return (-1);
+
+	if (dfaclcnt) {
+		error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+		    &dfacep, &dfacecnt, isdir);
+		if (error) {
+			free(acep);
+			return (-1);
+		}
+	}
+
+	newacep = malloc(sizeof (ace_t) * (acecnt + dfacecnt));
+	if (newacep == NULL) {
+		/* don't leak the two partial conversions */
+		free(acep);
+		free(dfacep);
+		return (-1);
+	}
+
+	(void) memcpy(newacep, acep, sizeof (ace_t) * acecnt);
+	if (dfaclcnt) {
+		(void) memcpy(newacep + acecnt, dfacep,
+		    sizeof (ace_t) * dfacecnt);
+	}
+	free(acep);
+	free(dfacep);
+
+	*retacecnt = acecnt + dfacecnt;
+	*retacep = newacep;
+	return (0);
+}
+
+
+/*
+ * Common worker for acl_get() and facl_get(): fetch the ACL of a file.
+ *
+ * inp		path (type == ACL_PATH) or file descriptor (otherwise)
+ * get_flag	ACL_NO_TRIVIAL suppresses the result for a trivial ACL
+ * type		ACL_PATH or ACL_FD, selects which member of inp is used
+ * aclp		receives the new acl_t, or NULL when nothing is returned
+ *
+ * Returns -1 on failure and 0 otherwise.  A return of 0 with *aclp NULL
+ * means the file reported no entries, or the ACL was trivial and
+ * ACL_NO_TRIVIAL was requested.
+ */
+static int
+cacl_get(acl_inp inp, int get_flag, int type, acl_t **aclp)
+{
+	const char *fname;
+	int fd;
+	int ace_acl = 0;
+	int error;
+	int getcmd, cntcmd;
+	acl_t *acl_info;
+	int	save_errno;
+	int	stat_error;
+	struct stat64 statbuf;
+
+	*aclp = NULL;
+	if (type == ACL_PATH) {
+		fname = inp.file;
+		ace_acl = pathconf(fname, _PC_ACL_ENABLED);
+	} else {
+		fd = inp.fd;
+		ace_acl = fpathconf(fd, _PC_ACL_ENABLED);
+	}
+
+	if (ace_acl == -1)
+		return (-1);
+
+	/*
+	 * if acl's aren't supported then
+	 * send it through the old GETACL interface
+	 */
+	if (ace_acl == 0) {
+		ace_acl = _ACL_ACLENT_ENABLED;
+	}
+
+	/* choose the request codes and container style for this flavor */
+	if (ace_acl & _ACL_ACE_ENABLED) {
+		cntcmd = ACE_GETACLCNT;
+		getcmd = ACE_GETACL;
+		acl_info = acl_alloc(ACE_T);
+	} else {
+		cntcmd = GETACLCNT;
+		getcmd = GETACL;
+		acl_info = acl_alloc(ACLENT_T);
+	}
+
+	if (acl_info == NULL)
+		return (-1);
+
+	/* first ask how many entries there are */
+	if (type == ACL_PATH) {
+		acl_info->acl_cnt = acl(fname, cntcmd, 0, NULL);
+	} else {
+		acl_info->acl_cnt = facl(fd, cntcmd, 0, NULL);
+	}
+
+	/* errno is saved around acl_free() so the caller sees the real cause */
+	save_errno = errno;
+	if (acl_info->acl_cnt < 0) {
+		acl_free(acl_info);
+		errno = save_errno;
+		return (-1);
+	}
+
+	/* no entries: success, but *aclp stays NULL */
+	if (acl_info->acl_cnt == 0) {
+		acl_free(acl_info);
+		errno = save_errno;
+		return (0);
+	}
+
+	acl_info->acl_aclp =
+	    malloc(acl_info->acl_cnt * acl_info->acl_entry_size);
+	save_errno = errno;
+
+	if (acl_info->acl_aclp == NULL) {
+		acl_free(acl_info);
+		errno = save_errno;
+		return (-1);
+	}
+
+	/* stat the object (to learn whether it is a directory), then fetch */
+	if (type == ACL_PATH) {
+		stat_error = stat64(fname, &statbuf);
+		error = acl(fname, getcmd, acl_info->acl_cnt,
+		    acl_info->acl_aclp);
+	} else {
+		stat_error = fstat64(fd, &statbuf);
+		error = facl(fd, getcmd, acl_info->acl_cnt,
+		    acl_info->acl_aclp);
+	}
+
+	save_errno = errno;
+	if (error == -1) {
+		acl_free(acl_info);
+		errno = save_errno;
+		return (-1);
+	}
+
+
+	/* a failed stat is not fatal; the directory hint is just left off */
+	if (stat_error == 0) {
+		acl_info->acl_flags =
+		    (S_ISDIR(statbuf.st_mode) ? ACL_IS_DIR : 0);
+	} else
+		acl_info->acl_flags = 0;
+
+	/* record whether the ACL is trivial */
+	switch (acl_info->acl_type) {
+	case ACLENT_T:
+		if (acl_info->acl_cnt <= MIN_ACL_ENTRIES)
+			acl_info->acl_flags |= ACL_IS_TRIVIAL;
+		break;
+	case ACE_T:
+		if (ace_trivial(acl_info->acl_aclp, acl_info->acl_cnt) == 0)
+			acl_info->acl_flags |= ACL_IS_TRIVIAL;
+		break;
+	default:
+		errno = EINVAL;
+		acl_free(acl_info);
+		return (-1);
+	}
+
+	/* caller asked not to be handed trivial ACLs */
+	if ((acl_info->acl_flags & ACL_IS_TRIVIAL) &&
+	    (get_flag & ACL_NO_TRIVIAL)) {
+		acl_free(acl_info);
+		errno = 0;
+		return (0);
+	}
+
+	*aclp = acl_info;
+	return (0);
+}
+
+/*
+ * Fetch the ACL of the file named by path.
+ *
+ * Returns whatever cacl_get() returns: -1 on failure, 0 otherwise.  On a
+ * 0 return *aclp is either the retrieved ACL or NULL (no entries, or a
+ * trivial ACL with ACL_NO_TRIVIAL set in get_flag).
+ */
+int
+acl_get(const char *path, int get_flag, acl_t **aclp)
+{
+	acl_inp target;
+
+	target.file = path;
+	return (cacl_get(target, get_flag, ACL_PATH, aclp));
+}
+
+/*
+ * Fetch the ACL of the open file fd.  Same return convention as
+ * cacl_get(): -1 on failure, 0 otherwise, with *aclp possibly NULL.
+ */
+int
+facl_get(int fd, int get_flag, acl_t **aclp)
+{
+	acl_inp target;
+
+	target.fd = fd;
+	return (cacl_get(target, get_flag, ACL_FD, aclp));
+}
+
+/*
+ * Set an ACL, translates acl to ace_t when appropriate.
+ *
+ * acl_inp	path (type == ACL_PATH) or file descriptor (otherwise)
+ * aclp		ACL to apply
+ * type		ACL_PATH or ACL_FD
+ *
+ * If the target only takes ace_t ACLs and aclp holds aclent_t entries,
+ * aclp itself is rewritten: its entry array, count, style and entry
+ * size are replaced by the converted ACL.
+ *
+ * Returns -1 if the target cannot be examined or the conversion fails
+ * (errno is ENOTSUP in the latter case), otherwise the result of the
+ * acl()/facl() request.
+ */
+static int
+cacl_set(acl_inp *acl_inp, acl_t *aclp, int type)
+{
+	int error = 0;
+	int acl_flavor_target;
+	ace_t *acep = NULL;
+	int acecnt;
+	struct stat64 statbuf;
+	int stat_error;
+	int isdir;
+
+
+	if (type == ACL_PATH) {
+		stat_error = stat64(acl_inp->file, &statbuf);
+		if (stat_error)
+			return (-1);
+		acl_flavor_target = pathconf(acl_inp->file, _PC_ACL_ENABLED);
+	} else {
+		stat_error = fstat64(acl_inp->fd, &statbuf);
+		if (stat_error)
+			return (-1);
+		acl_flavor_target = fpathconf(acl_inp->fd, _PC_ACL_ENABLED);
+	}
+
+	isdir = S_ISDIR(statbuf.st_mode);
+
+	if (acl_flavor_target == -1)
+		return (-1);
+
+	/*
+	 * Translate aclent_t ACL's to ACE ACL's.
+	 */
+	if (acl_flavor_target ==  _ACL_ACE_ENABLED &&
+	    aclp->acl_type == ACLENT_T) {
+		error = convert_aent_to_ace(aclp->acl_aclp,
+		    aclp->acl_cnt, isdir, &acep, &acecnt);
+		if (error) {
+			errno = ENOTSUP;
+			return (-1);
+		}
+		/*
+		 * replace old acl with newly translated acl
+		 */
+		free(aclp->acl_aclp);
+		aclp->acl_aclp = acep;
+		aclp->acl_cnt = acecnt;
+		aclp->acl_type = ACE_T;
+		/* keep the entry size in step with the new style */
+		aclp->acl_entry_size = sizeof (ace_t);
+	}
+
+	if (type == ACL_PATH) {
+		error = acl(acl_inp->file,
+		    (aclp->acl_type == ACE_T) ? ACE_SETACL : SETACL,
+		    aclp->acl_cnt, aclp->acl_aclp);
+	} else {
+		error = facl(acl_inp->fd,
+		    (aclp->acl_type == ACE_T) ? ACE_SETACL : SETACL,
+		    aclp->acl_cnt, aclp->acl_aclp);
+	}
+
+	return (error);
+}
+
+/*
+ * Apply aclp to the file named by path; see cacl_set() for the return
+ * value and for how aclp may be rewritten.
+ */
+int
+acl_set(const char *path, acl_t *aclp)
+{
+	acl_inp target;
+
+	target.file = path;
+	return (cacl_set(&target, aclp, ACL_PATH));
+}
+
+/*
+ * Apply aclp to the open file fd; see cacl_set() for the return value
+ * and for how aclp may be rewritten.
+ */
+int
+facl_set(int fd, acl_t *aclp)
+{
+	acl_inp target;
+
+	target.fd = fd;
+	return (cacl_set(&target, aclp, ACL_FD));
+}
+
+/* Number of entries held by aclp (aclp must not be NULL). */
+int
+acl_cnt(acl_t *aclp)
+{
+	int nentries = aclp->acl_cnt;
+
+	return (nentries);
+}
+
+/* Style of aclp (ACLENT_T or ACE_T) as an int; aclp must not be NULL. */
+int
+acl_type(acl_t *aclp)
+{
+	int style = aclp->acl_type;
+
+	return (style);
+}
+
+/*
+ * Make an independent copy of aclp (which must not be NULL).
+ *
+ * The copy has the same style, entry count, flags and a private copy of
+ * the entry array.  Returns NULL if memory cannot be allocated.  Release
+ * the copy with acl_free().
+ */
+acl_t *
+acl_dup(acl_t *aclp)
+{
+	acl_t *newaclp;
+
+	newaclp = acl_alloc(aclp->acl_type);
+	if (newaclp == NULL)
+		return (NULL);
+
+	newaclp->acl_aclp = malloc(aclp->acl_entry_size * aclp->acl_cnt);
+	if (newaclp->acl_aclp == NULL) {
+		acl_free(newaclp);
+		return (NULL);
+	}
+
+	(void) memcpy(newaclp->acl_aclp,
+	    aclp->acl_aclp, aclp->acl_entry_size * aclp->acl_cnt);
+	newaclp->acl_cnt = aclp->acl_cnt;
+	/* carry the flags over so acl_flags() is defined on the copy */
+	newaclp->acl_flags = aclp->acl_flags;
+
+	return (newaclp);
+}
+
+/* Flag word of aclp (aclp must not be NULL). */
+int
+acl_flags(acl_t *aclp)
+{
+	int flags = aclp->acl_flags;
+
+	return (flags);
+}
+
+/* Entry array of aclp, still owned by aclp (aclp must not be NULL). */
+void *
+acl_data(acl_t *aclp)
+{
+	void *entries = aclp->acl_aclp;
+
+	return (entries);
+}
+
+/*
+ * Replace the ACL on a file with a trivial one derived from mode, then
+ * chown the file to owner/group.
+ *
+ * Returns -1 if pathconf() fails, 1 (errno EINVAL) if the reported ACL
+ * flavor is neither aclent nor ace, the non-zero result of the ACL
+ * request if that fails, otherwise the result of chown().
+ */
+int
+acl_strip(const char *file, uid_t owner, gid_t group, mode_t mode)
+{
+	aclent_t min_acl[MIN_ACL_ENTRIES];
+	ace_t	min_ace_acl[6];	/* owner, group, everyone + complement denies */
+	mode_t	user_bits = (mode & 0700) >> 6;
+	mode_t	group_bits = (mode & 0070) >> 3;
+	mode_t	other_bits = mode & 0007;
+	int	flavor;
+	int	rc;
+
+	flavor = pathconf(file, _PC_ACL_ENABLED);
+	if (flavor == -1)
+		return (-1);
+
+	/*
+	 * A file system that doesn't understand the question is driven
+	 * through the aclent flavor.
+	 */
+	if (flavor == 0)
+		flavor = _ACL_ACLENT_ENABLED;
+
+	if (flavor & _ACL_ACLENT_ENABLED) {
+		min_acl[0].a_type = USER_OBJ;
+		min_acl[0].a_id   = owner;
+		min_acl[0].a_perm = user_bits;
+
+		min_acl[1].a_type = GROUP_OBJ;
+		min_acl[1].a_id   = group;
+		min_acl[1].a_perm = group_bits;
+
+		/* the mask mirrors the group permissions */
+		min_acl[2].a_type = CLASS_OBJ;
+		min_acl[2].a_id   = (uid_t)-1;
+		min_acl[2].a_perm = group_bits;
+
+		min_acl[3].a_type = OTHER_OBJ;
+		min_acl[3].a_id   = (uid_t)-1;
+		min_acl[3].a_perm = other_bits;
+
+		rc = acl(file, SETACL, 4, min_acl);
+	} else if (flavor & _ACL_ACE_ENABLED) {
+		/* start from the template, then fit each pair to the mode */
+		(void) memcpy(min_ace_acl, trivial_acl, sizeof (ace_t) * 6);
+		adjust_ace_pair(&min_ace_acl[0], user_bits);
+		adjust_ace_pair(&min_ace_acl[2], group_bits);
+		adjust_ace_pair(&min_ace_acl[4], other_bits);
+
+		rc = acl(file, ACE_SETACL, 6, min_ace_acl);
+	} else {
+		errno = EINVAL;
+		rc = 1;
+	}
+
+	if (rc != 0)
+		return (rc);
+
+	return (chown(file, owner, group));
+}
+
+/*
+ * Compare two ace_t entries; returns 0 when they match.
+ *
+ * The comparison is done on private copies in which a_who is forced
+ * to -1 for owner/group/everyone abstractions, since that field is
+ * undefined for them.
+ */
+static int
+ace_match(void *entry1, void *entry2)
+{
+	const int abstract = ACE_OWNER | ACE_GROUP | ACE_EVERYONE;
+	ace_t lhs = *(ace_t *)entry1;
+	ace_t rhs = *(ace_t *)entry2;
+
+	if (lhs.a_flags & abstract)
+		lhs.a_who = -1;
+	if (rhs.a_flags & abstract)
+		rhs.a_who = -1;
+
+	return (memcmp(&lhs, &rhs, sizeof (ace_t)));
+}
+
+/*
+ * Compare two aclent_t entries byte for byte; returns 0 when they match.
+ */
+static int
+aclent_match(void *entry1, void *entry2)
+{
+	return (memcmp((aclent_t *)entry1, (aclent_t *)entry2,
+	    sizeof (aclent_t)));
+}
+
+/*
+ * Remove from acl the entries that match entries of removeacl.
+ *
+ * The search for each entry of removeacl covers the slots of acl from
+ * start_slot onward.  flag selects whether every match is removed
+ * (ACL_REMOVE_ALL) or only the first match of each entry
+ * (ACL_REMOVE_FIRST, also used for any unrecognized flag value).
+ * Removed slots are closed up and acl's entry count is reduced; the
+ * entry array is not reallocated.
+ *
+ * Returns 0 if at least one entry was removed, EACL_NO_ACL_ENTRY if
+ * either ACL is NULL or nothing matched, EACL_DIFF_TYPE if the two ACLs
+ * are of different styles, and EACL_INVALID_SLOT if start_slot is
+ * negative.
+ */
+int
+acl_removeentries(acl_t *acl, acl_t *removeacl, int start_slot, int flag)
+{
+	int i, j;
+	int tail;
+	int (*acl_match)(void *acl1, void *acl2);
+	char *acl_entry, *remove_entry;
+	int found = 0;
+
+	if (flag != ACL_REMOVE_ALL && flag != ACL_REMOVE_FIRST)
+		flag = ACL_REMOVE_FIRST;
+
+	if (acl == NULL || removeacl == NULL)
+		return (EACL_NO_ACL_ENTRY);
+
+	if (acl->acl_type != removeacl->acl_type)
+		return (EACL_DIFF_TYPE);
+
+	if (start_slot < 0)
+		return (EACL_INVALID_SLOT);
+
+	if (acl->acl_type == ACLENT_T)
+		acl_match = aclent_match;
+	else
+		acl_match = ace_match;
+
+	for (i = 0; i < removeacl->acl_cnt; i++) {
+		/* step to the i'th entry that is to be removed */
+		remove_entry = (char *)removeacl->acl_aclp +
+		    (removeacl->acl_entry_size * i);
+
+		/* the bound is tested first, so an empty acl is never read */
+		j = start_slot;
+		while (j < acl->acl_cnt) {
+			acl_entry = (char *)acl->acl_aclp +
+			    (acl->acl_entry_size * j);
+
+			if (acl_match(acl_entry, remove_entry) != 0) {
+				j++;
+				continue;
+			}
+
+			found++;
+
+			/* close the gap with the entries that follow */
+			tail = acl->acl_cnt - (j + 1);
+			if (tail > 0) {
+				(void) memmove(acl_entry,
+				    acl_entry + acl->acl_entry_size,
+				    acl->acl_entry_size * tail);
+			}
+			acl->acl_cnt--;
+
+			if (flag == ACL_REMOVE_FIRST)
+				break;
+			/*
+			 * Slot j now holds the entry that followed the
+			 * removed one, so it is examined again without
+			 * advancing j.
+			 */
+		}
+	}
+
+	return ((found == 0) ? EACL_NO_ACL_ENTRY : 0);
+}
+
+/*
+ * Replace entries in acl1 with the entries of newentries, starting at
+ * slot "where".  where must name an existing slot of acl1
+ * (0 <= where < entry count), otherwise EACL_INVALID_SLOT is returned.
+ * If the new entries extend past the end of acl1, its entry array is
+ * grown and its entry count updated.
+ *
+ * Returns 0 on success, EACL_NO_ACL_ENTRY if either ACL is NULL,
+ * EACL_DIFF_TYPE if the styles differ, and -1 if memory cannot be
+ * allocated (acl1 is left unchanged in that case).
+ */
+int
+acl_modifyentries(acl_t *acl1, acl_t *newentries, int where)
+{
+
+	int slot;
+	int slots_needed;
+	int slots_left;
+	int newsize;
+	void *grown;
+
+	if (acl1 == NULL || newentries == NULL)
+		return (EACL_NO_ACL_ENTRY);
+
+	if (where < 0 || where >= acl1->acl_cnt)
+		return (EACL_INVALID_SLOT);
+
+	if (acl1->acl_type != newentries->acl_type)
+		return (EACL_DIFF_TYPE);
+
+	slot = where;
+
+	/* slots available from "slot" through the end of the array */
+	slots_left = acl1->acl_cnt - slot;
+	if (slots_left < newentries->acl_cnt) {
+		slots_needed = newentries->acl_cnt - slots_left;
+		newsize = (acl1->acl_entry_size * acl1->acl_cnt) +
+		    (acl1->acl_entry_size * slots_needed);
+		/* keep the old array reachable if realloc() fails */
+		grown = realloc(acl1->acl_aclp, newsize);
+		if (grown == NULL)
+			return (-1);
+		acl1->acl_aclp = grown;
+	}
+	(void) memcpy((char *)acl1->acl_aclp + (acl1->acl_entry_size * slot),
+	    newentries->acl_aclp,
+	    newentries->acl_entry_size * newentries->acl_cnt);
+
+	/*
+	 * Did ACL grow?
+	 */
+
+	if ((slot + newentries->acl_cnt) > acl1->acl_cnt) {
+		acl1->acl_cnt = slot + newentries->acl_cnt;
+	}
+
+	return (0);
+}
+
+/*
+ * Insert the entries of acl2 into acl1 in front of slot "where".
+ * where may equal acl1's entry count, which appends.
+ *
+ * Returns 0 on success, EACL_NO_ACL_ENTRY if either ACL is NULL,
+ * EACL_DIFF_TYPE if the styles differ, EACL_INVALID_SLOT if where is
+ * out of range, and -1 if memory cannot be allocated (acl1 is left
+ * unchanged in that case).
+ */
+int
+acl_addentries(acl_t *acl1, acl_t *acl2, int where)
+{
+
+	int newsize;
+	int len;
+	void *start;
+	void *to;
+	void *grown;
+
+	if (acl1 == NULL || acl2 == NULL)
+		return (EACL_NO_ACL_ENTRY);
+
+	if (acl1->acl_type != acl2->acl_type)
+		return (EACL_DIFF_TYPE);
+
+	/*
+	 * allow where to specify 1 past last slot for an append operation
+	 * but anything greater is an error.
+	 */
+	if (where < 0 || where > acl1->acl_cnt)
+		return (EACL_INVALID_SLOT);
+
+	/* nothing to insert; also avoids a zero sized realloc() */
+	if (acl2->acl_cnt == 0)
+		return (0);
+
+	newsize = (acl2->acl_entry_size * acl2->acl_cnt) +
+	    (acl1->acl_entry_size * acl1->acl_cnt);
+	/* keep the old array reachable if realloc() fails */
+	grown = realloc(acl1->acl_aclp, newsize);
+	if (grown == NULL)
+		return (-1);
+	acl1->acl_aclp = grown;
+
+	/*
+	 * first push down entries where new ones will be inserted
+	 */
+
+	to = (void *)((char *)acl1->acl_aclp +
+	    ((where + acl2->acl_cnt) * acl1->acl_entry_size));
+
+	start = (void *)((char *)acl1->acl_aclp +
+	    where * acl1->acl_entry_size);
+
+	if (where < acl1->acl_cnt) {
+		len = (acl1->acl_cnt - where) * acl1->acl_entry_size;
+		(void) memmove(to, start, len);
+	}
+
+	/*
+	 * now stick in new entries.
+	 */
+
+	(void) memmove(start, acl2->acl_aclp,
+	    acl2->acl_cnt * acl2->acl_entry_size);
+
+	acl1->acl_cnt += acl2->acl_cnt;
+	return (0);
+}
+
+/*
+ * Render the S_IROTH/S_IWOTH/S_IXOTH bits of perm as a three character
+ * "rwx" style string ('-' for a cleared bit).  txt_perms must have room
+ * for four bytes, including the terminating NUL.
+ */
+static void
+aclent_perms(int perm, char *txt_perms)
+{
+	txt_perms[0] = (perm & S_IROTH) ? 'r' : '-';
+	txt_perms[1] = (perm & S_IWOTH) ? 'w' : '-';
+	txt_perms[2] = (perm & S_IXOTH) ? 'x' : '-';
+	txt_perms[3] = '\0';
+}
+
+/*
+ * Return a printable name for uid: the passwd entry's user name, or the
+ * uid in decimal when no passwd entry can be found.
+ *
+ * The result points either into the getpwuid() result or into a static
+ * buffer, so it is overwritten by later calls and must not be freed.
+ */
+static char *
+pruname(uid_t uid)
+{
+	struct passwd	*passwdp;
+	/* room for any long in decimal, plus sign and NUL */
+	static char	uidp[3 * sizeof (long) + 2];
+
+	passwdp = getpwuid(uid);
+	if (passwdp == (struct passwd *)NULL) {
+		/* could not get passwd information: display uid instead */
+		(void) snprintf(uidp, sizeof (uidp), "%ld", (long)uid);
+		return (uidp);
+	} else
+		return (passwdp->pw_name);
+}
+
+/*
+ * Return a printable name for gid: the group entry's name, or the gid
+ * in decimal when no group entry can be found.
+ *
+ * The result points either into the getgrgid() result or into a static
+ * buffer, so it is overwritten by later calls and must not be freed.
+ */
+static char *
+prgname(gid_t gid)
+{
+	struct group	*groupp;
+	/* room for any long in decimal, plus sign and NUL */
+	static char	gidp[3 * sizeof (long) + 2];
+
+	groupp = getgrgid(gid);
+	if (groupp == (struct group *)NULL) {
+		/* could not get group information: display gid instead */
+		(void) snprintf(gidp, sizeof (gidp), "%ld", (long)gid);
+		return (gidp);
+	} else
+		return (groupp->gr_name);
+}
+/*
+ * Print an aclent_t style ACL to stdout, one entry per line, each
+ * prefixed with its slot number.  Named user, named group and owning
+ * group entries also show their effective permissions, i.e. the entry's
+ * permissions ANDed with the mask (CLASS_OBJ) entry.  If the ACL has no
+ * mask entry, all three permission bits are treated as permitted.
+ */
+static void
+aclent_printacl(acl_t *aclp)
+{
+	aclent_t *tp;
+	int aclcnt;
+	int mask = 07;	/* used when the ACL carries no CLASS_OBJ entry */
+	int slot = 0;
+	char perm[4];
+
+	/* display ACL: assume it is sorted. */
+	aclcnt = aclp->acl_cnt;
+	for (tp = aclp->acl_aclp; aclcnt--; tp++) {
+		if (tp->a_type == CLASS_OBJ)
+			mask = tp->a_perm;
+	}
+	aclcnt = aclp->acl_cnt;
+	for (tp = aclp->acl_aclp; aclcnt--; tp++) {
+		(void) printf("     %d:", slot++);
+		switch (tp->a_type) {
+		case USER:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("user:%s:%s\t\t",
+			    pruname(tp->a_id), perm);
+			aclent_perms((tp->a_perm & mask), perm);
+			(void) printf("#effective:%s\n", perm);
+			break;
+		case USER_OBJ:
+			/* no need to display uid */
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("user::%s\n", perm);
+			break;
+		case GROUP:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("group:%s:%s\t\t",
+			    prgname(tp->a_id), perm);
+			aclent_perms(tp->a_perm & mask, perm);
+			(void) printf("#effective:%s\n", perm);
+			break;
+		case GROUP_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("group::%s\t\t", perm);
+			aclent_perms(tp->a_perm & mask, perm);
+			(void) printf("#effective:%s\n", perm);
+			break;
+		case CLASS_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("mask:%s\n", perm);
+			break;
+		case OTHER_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("other:%s\n", perm);
+			break;
+		case DEF_USER:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:user:%s:%s\n",
+			    pruname(tp->a_id), perm);
+			break;
+		case DEF_USER_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:user::%s\n", perm);
+			break;
+		case DEF_GROUP:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:group:%s:%s\n",
+			    prgname(tp->a_id), perm);
+			break;
+		case DEF_GROUP_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:group::%s\n", perm);
+			break;
+		case DEF_CLASS_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:mask:%s\n", perm);
+			break;
+		case DEF_OTHER_OBJ:
+			aclent_perms(tp->a_perm, perm);
+			(void) printf("default:other:%s\n", perm);
+			break;
+		default:
+			(void) fprintf(stderr,
+			    gettext("unrecognized entry\n"));
+			break;
+		}
+	}
+}
+
+/*
+ * Print str to stdout, wrapping it so that lines stay within roughly
+ * cols columns.  A line is only broken in front of a '/' or ':'
+ * character; continuation lines are indented.  When no such break point
+ * has been seen before the width limit is reached, the rest of the text
+ * is printed unbroken on the current line.
+ */
+static void
+split_line(char *str, int cols)
+{
+	char *ptr;
+	int len;
+	int i;
+	int last_split;
+	char pad[11];
+	int pad_len;
+
+	len = strlen(str);
+	ptr = str;
+	(void) strcpy(pad, "");
+	pad_len = 0;
+	last_split = 0;
+
+	for (i = 0; i != len; i++) {
+		if ((i + pad_len + 4) >= cols) {
+			/*
+			 * No break point in this stretch: breaking at
+			 * offset 0 would make no progress and loop for
+			 * ever, so give up wrapping.
+			 */
+			if (last_split == 0)
+				break;
+			(void) printf("%s%.*s\n", pad, last_split, ptr);
+			ptr = &ptr[last_split];
+			len = strlen(ptr);
+			i = 0;
+			/* offsets into the old line mean nothing here */
+			last_split = 0;
+			pad_len = 4;
+			(void) strcpy(pad, "         ");
+		} else {
+			if (ptr[i] == '/' || ptr[i] == ':') {
+				last_split = i;
+			}
+		}
+	}
+
+	/* whatever is left goes out as the final line */
+	(void) printf("%s%s\n", pad, ptr);
+}
+
+/*
+ * Print an ace_t style ACL to stdout.  The ACL is converted to text
+ * with acl_totext(); each comma separated entry is printed with its
+ * slot number and wrapped by split_line().  Nothing is printed if the
+ * conversion yields no text.
+ */
+static void
+ace_printacl(acl_t *aclp, int cols)
+{
+	int  slot = 0;
+	char *token;
+	char *acltext;
+	char *lasts;	/* strtok_r() context */
+
+	acltext = acl_totext(aclp);
+
+	if (acltext == NULL)
+		return;
+
+	/*
+	 * strtok_r() rather than strtok(): library code must not disturb
+	 * a tokenizing loop the caller may have in progress.
+	 */
+	for (token = strtok_r(acltext, ",", &lasts); token != NULL;
+	    token = strtok_r(NULL, ",", &lasts)) {
+		(void) printf("     %d:", slot++);
+		/* the slot prefix already used up five columns */
+		split_line(token, cols - 5);
+	}
+	free(acltext);
+}
+
+/*
+ * Pretty print an ACL to stdout.
+ *
+ * aclent_t ACLs are printed in a layout similar to the old getfacl
+ * format, with a "slot" number added in front of each entry.
+ *
+ * ace_t ACLs are also printed with a "slot" number per entry; cols is
+ * the width used to break long entries across several lines.
+ *
+ * An ACL of any other style prints nothing.
+ */
+void
+acl_printacl(acl_t *aclp, int cols)
+{
+	if (aclp->acl_type == ACLENT_T)
+		aclent_printacl(aclp);
+	else if (aclp->acl_type == ACE_T)
+		ace_printacl(aclp, cols);
+}
+
+
+/*
+ * return text for an ACL error.
+ *
+ * EACL_* codes are mapped to a message looked up with dgettext() in
+ * TEXT_DOMAIN.  An errnum of -1 returns strerror(errno).  Any other
+ * value sets errno to EINVAL and returns the "Unknown error" message.
+ * The caller does not free the returned string.
+ */
+char *
+acl_strerror(int errnum)
+{
+	switch (errnum) {
+	case EACL_GRP_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "There is more than one user group owner entry"));
+	case EACL_USER_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "There is more than one user owner entry"));
+	case EACL_OTHER_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "There is more than one other entry"));
+	case EACL_CLASS_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "There is more than one mask entry"));
+	case EACL_DUPLICATE_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Duplicate user or group entries"));
+	case EACL_MISS_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Missing user/group owner, other, mask entry"));
+	case EACL_MEM_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Memory error"));
+	case EACL_ENTRY_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Unrecognized entry type"));
+	case EACL_INHERIT_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Invalid inheritance flags"));
+	case EACL_FLAGS_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Unrecognized entry flags"));
+	case EACL_PERM_MASK_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Invalid ACL permissions"));
+	case EACL_COUNT_ERROR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Invalid ACL count"));
+	case EACL_INVALID_SLOT:
+		return (dgettext(TEXT_DOMAIN,
+		    "Invalid ACL entry number specified"));
+	case EACL_NO_ACL_ENTRY:
+		return (dgettext(TEXT_DOMAIN,
+		    "ACL entry doesn't exist"));
+	case EACL_DIFF_TYPE:
+		return (dgettext(TEXT_DOMAIN,
+		    "ACL type's are different"));
+	case EACL_INVALID_USER_GROUP:
+		return (dgettext(TEXT_DOMAIN, "Invalid user or group"));
+	case EACL_INVALID_STR:
+		return (dgettext(TEXT_DOMAIN, "ACL string is invalid"));
+	case EACL_FIELD_NOT_BLANK:
+		return (dgettext(TEXT_DOMAIN, "Field expected to be blank"));
+	case EACL_INVALID_ACCESS_TYPE:
+		return (dgettext(TEXT_DOMAIN, "Invalid access type"));
+	case EACL_UNKNOWN_DATA:
+		return (dgettext(TEXT_DOMAIN, "Unrecognized entry"));
+	case EACL_MISSING_FIELDS:
+		return (dgettext(TEXT_DOMAIN,
+		    "ACL specification missing required fields"));
+	case EACL_INHERIT_NOTDIR:
+		return (dgettext(TEXT_DOMAIN,
+		    "Inheritance flags are only allowed on directories"));
+	case -1:
+		/* -1 is what the system call style failures return */
+		return (strerror(errno));
+	default:
+		/* not a code this library produces */
+		errno = EINVAL;
+		return (dgettext(TEXT_DOMAIN, "Unknown error"));
+	}
+}
diff --git a/usr/src/lib/libsec/common/aclutils.h b/usr/src/lib/libsec/common/aclutils.h
new file mode 100644
index 000000000000..b8e95dfe8007
--- /dev/null
+++ b/usr/src/lib/libsec/common/aclutils.h
@@ -0,0 +1,85 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ACLUTILS_H
+#define	_ACLUTILS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ACL_REMOVE_ALL		0x0
+#define	ACL_REMOVE_FIRST	0x1
+
+/*
+ * Hint for whether acl_totext() should use
+ * mnemonics:
+ * read_data/list_directory
+ * write_data/add_file or
+ * append_data/add_subdirectory
+ * when object of ACL is known.
+ */
+#define	ACL_IS_DIR	0x2
+
+typedef enum acl_type {
+	ACLENT_T = 0,	/* presumably aclent_t-style acl; confirm */
+	ACE_T = 1	/* presumably ace_t-style acl; confirm */
+} acl_type_t;
+
+/*
+ * acl flags
+ */
+#define	ACL_IS_TRIVIAL	0x1
+
+struct acl_info {
+	acl_type_t acl_type;		/* style of acl: ACLENT_T or ACE_T */
+	int acl_cnt;			/* number of acl entries */
+	int acl_entry_size;		/* sizeof acl entry */
+	int acl_flags;			/* acl flags, e.g. ACL_IS_TRIVIAL */
+	void *acl_aclp;			/* the acl */
+};
+
+
+extern int acl_addentries(acl_t *, acl_t *, int);
+extern int acl_removeentries(acl_t *, acl_t *, int, int);
+extern int acl_modifyentries(acl_t *, acl_t *, int);
+extern void acl_printacl(acl_t *, int);
+extern char *acl_strerror(int);
+extern acl_t *acl_dup(acl_t *);
+extern int acl_type(acl_t *);
+extern int acl_cnt(acl_t *);
+extern int acl_flags(acl_t *);
+extern void *acl_data(acl_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _ACLUTILS_H */
diff --git a/usr/src/lib/libsec/common/llib-lsec b/usr/src/lib/libsec/common/llib-lsec
index 8db1f3875f13..36d04ec197ea 100644
--- a/usr/src/lib/libsec/common/llib-lsec
+++ b/usr/src/lib/libsec/common/llib-lsec
@@ -23,8 +23,8 @@
 /* PROTOLIB1 */
 
 /*
- * Copyright (c) 1997 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
@@ -37,10 +37,4 @@
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/acl.h>
-
-int aclcheck(aclent_t *aclbufp, int nentries, int *which);
-int aclfrommode(aclent_t *aclbufp, int nentries, mode_t *modep);
-aclent_t *aclfromtext(char *aclimport, int *aclcnt);
-int aclsort(int nentries, int calcmask, aclent_t *aclbufp);
-int acltomode(aclent_t *aclbufp, int nentries, mode_t *modep);
-char *acltotext(aclent_t *aclp, int aclcnt);
+#include <aclutils.h>
diff --git a/usr/src/lib/libsec/inc.flg b/usr/src/lib/libsec/inc.flg
new file mode 100644
index 000000000000..46ff3fd65827
--- /dev/null
+++ b/usr/src/lib/libsec/inc.flg
@@ -0,0 +1,29 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# Copyright 2005 Sun Microsystems, Inc.
+# All rights reserved.  Use is subject to license terms.
+
+find_files "s.*" usr/src/common/acl
diff --git a/usr/src/lib/libsec/spec/acl.spec b/usr/src/lib/libsec/spec/acl.spec
index b69d7109e1c4..afedf74f3428 100644
--- a/usr/src/lib/libsec/spec/acl.spec
+++ b/usr/src/lib/libsec/spec/acl.spec
@@ -41,6 +41,26 @@ exception	($return == GRP_ERROR	|| \
 			$return == MEM_ERROR)
 end		
 
+function	acl_check
+include		<sys/acl.h>
+declaration	int acl_check(acl_t *aclp, int flag);
+version		SUNW_1.2
+errno		EINVAL
+exception	 ($return == EACL_GRP_ERROR   || \
+	$return == EACL_USER_ERROR   || \
+	$return == EACL_OTHER_ERROR  || \
+	$return == EACL_CLASS_ERROR  || \
+	$return == EACL_DUPLICATE_ERROR      || \
+	$return == EACL_MISS_ERROR  || \
+	$return == EACL_MEM_ERROR   || \
+	$return == EACL_ENTRY_ERROR)	|| \
+	$return == EACL_INHERIT_ERROR || \
+	$return == EACL_FLAGS_ERROR || \
+	$return == EACL_PERM_MASK_ERROR || \
+	$return == EACL_COUNT_ERROR 
+end
+
+
 function	aclsort
 include		<sys/acl.h>
 declaration	int aclsort(int nentries, int calclass, aclent_t *aclbufp)
@@ -78,3 +98,124 @@ version		SUNW_0.9
 exception	$return == 0
 end		
 
+function	acl_get
+include		<sys/acl.h>
+declaration	int acl_get(char *, int, acl_t **);
+version		SUNW_1.2
+end		
+
+function	facl_get
+include		<aclutils.h>
+declaration	int facl_get(int, int, acl_t **);
+version		SUNW_1.2
+end		
+
+function	acl_set
+include		<sys/acl.h>
+declaration	int acl_set(char *, acl_t  *);
+version		SUNW_1.2
+end		
+
+function	facl_set
+include		<sys/acl.h>
+declaration	int facl_set(int, acl_t  *);
+version		SUNW_1.2
+end		
+
+function	acl_strip
+include		<sys/acl.h>
+declaration	int acl_strip(char *, uid_t, gid_t, mode_t);
+version		SUNW_1.2
+end		
+
+function	acl_trivial
+include		<sys/acl.h>
+declaration	int acl_trivial(char *file)
+version		SUNW_1.2
+end
+
+function	acl_totext
+include		<sys/acl.h>
+declaration	char *acl_totext(acl_t *acl);
+version		SUNW_1.2
+exception	$return == 0
+end
+
+function	acl_fromtext
+include		<sys/acl.h>
+declaration	int acl_fromtext(char *textp, acl_t **);
+version		SUNW_1.2
+end
+
+function	acl_free
+include		<sys/acl.h>
+declaration	void acl_free(acl_t *aclp);
+version		SUNW_1.2
+end
+
+function	acl_addentries
+include		<sys/acl.h>
+declaration	int acl_addentries(acl_t *acl1, acl_t *acl2, int slot);
+version		SUNWprivate_1.1
+end
+
+function	acl_removeentries
+include		<sys/acl.h>
+declaration	int acl_removeentries(acl_t *acl1, acl_t *acl2, int, int);
+version		SUNWprivate_1.1
+end
+
+function	acl_printacl
+include		<sys/acl.h>
+declaration	void acl_printacl(acl_t *aclp, int cols);
+version		SUNWprivate_1.1
+end
+
+function	acl_strerror
+include		<sys/acl.h>
+declaration	char *acl_strerror(int errnum);
+version		SUNWprivate_1.1
+end
+
+function	acl_modifyentries
+include		<sys/acl.h>
+declaration	int acl_modifyentries(acl_t *acl1, acl_t *newentries,
+    int where);
+version		SUNWprivate_1.1
+end
+
+function	acl_alloc
+include		<sys/acl.h>
+declaration	acl_t *acl_alloc(enum acl_type);
+version		SUNWprivate_1.1
+end
+
+function	acl_dup
+include		<aclutils.h>
+declaration	acl_t *acl_dup(acl_t *);
+version		SUNWprivate_1.1
+end 
+
+function	acl_cnt
+include		<aclutils.h>
+declaration	int acl_cnt(acl_t *);
+version		SUNWprivate_1.1
+end
+
+function	acl_type
+include		<aclutils.h>
+declaration	int acl_type(acl_t *);
+version		SUNWprivate_1.1
+end
+
+function	acl_flags
+include		<aclutils.h>
+declaration	int acl_flags(acl_t *);
+version		SUNWprivate_1.1
+end
+
+function	acl_data
+include		<aclutils.h>
+declaration	void *acl_data(acl_t *);
+version		SUNWprivate_1.1
+end
diff --git a/usr/src/lib/libsec/spec/versions b/usr/src/lib/libsec/spec/versions
index 710b438c7203..c8e4665f4d0e 100644
--- a/usr/src/lib/libsec/spec/versions
+++ b/usr/src/lib/libsec/spec/versions
@@ -30,18 +30,26 @@
 # (when it did contain symbols explicitly) may depend on it.
 #
 i386 {
+	SUNW_1.2:	{SUNW_1.1};
 	SUNW_1.1:	{SUNW_0.9};
 	SUNW_0.9;
+	SUNWprivate_1.1;
 }
 sparcv9 {
+	SUNW_1.2:	{SUNW_1.1};
 	SUNW_1.1:	{SUNW_0.9};
 	SUNW_0.9;
+	SUNWprivate_1.1;
 }
 sparc {
+	SUNW_1.2:	{SUNW_1.1};
 	SUNW_1.1:	{SUNW_0.9};
 	SUNW_0.9;
+	SUNWprivate_1.1;
 }
 amd64 {
+	SUNW_1.2:	{SUNW_1.1};
 	SUNW_1.1:	{SUNW_0.9};
 	SUNW_0.9;
+	SUNWprivate_1.1;
 }
diff --git a/usr/src/lib/libsecdb/exec_attr.txt b/usr/src/lib/libsecdb/exec_attr.txt
index 36eaaabd010b..8ae002321301 100644
--- a/usr/src/lib/libsecdb/exec_attr.txt
+++ b/usr/src/lib/libsecdb/exec_attr.txt
@@ -291,6 +291,8 @@ User Security:solaris:cmd:::/usr/sbin/passmgmt:uid=0
 User Security:suser:cmd:::/usr/sbin/pwck:euid=0
 User Security:suser:cmd:::/usr/sbin/pwconv:euid=0
 DAT Administration:solaris:cmd:::/usr/sbin/datadm:euid=0
+ZFS File System Management:solaris:cmd:::/usr/sbin/zfs:euid=0
+ZFS Storage Management:solaris:cmd:::/usr/sbin/zpool:euid=0
 Zone Management:solaris:cmd:::/usr/sbin/zonecfg:uid=0
 Zone Management:solaris:cmd:::/usr/sbin/zoneadm:uid=0
 Zone Management:solaris:cmd:::/usr/sbin/zlogin:uid=0
diff --git a/usr/src/lib/libsecdb/help/profiles/Makefile b/usr/src/lib/libsecdb/help/profiles/Makefile
index c6891cf9c4a4..1368939ed911 100644
--- a/usr/src/lib/libsecdb/help/profiles/Makefile
+++ b/usr/src/lib/libsecdb/help/profiles/Makefile
@@ -19,7 +19,7 @@
 #
 # CDDL HEADER END
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
@@ -63,6 +63,8 @@ HTMLENTS = \
 	RtUserMngmnt.html \
 	RtUserSecurity.html \
 	RtDatAdmin.html \
+	RtZFSFileSysMngmnt.html \
+	RtZFSStorageMngmnt.html \
 	RtZoneMngmnt.html \
 	RtDefault.html
 
diff --git a/usr/src/lib/libsecdb/help/profiles/RtZFSFileSysMngmnt.html b/usr/src/lib/libsecdb/help/profiles/RtZFSFileSysMngmnt.html
new file mode 100644
index 000000000000..d33cbab9908a
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/profiles/RtZFSFileSysMngmnt.html
@@ -0,0 +1,43 @@
+<HTML>
+<!--
+    CDDL HEADER START
+
+    The contents of this file are subject to the terms of the
+    Common Development and Distribution License, Version 1.0 only
+    (the "License").  You may not use this file except in compliance
+    with the License.
+
+    You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+    or http://www.opensolaris.org/os/licensing.
+    See the License for the specific language governing permissions
+    and limitations under the License.
+
+    When distributing Covered Code, include this CDDL HEADER in each
+    file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+    If applicable, add the following below this CDDL HEADER, with the
+    fields enclosed by brackets "[]" replaced with your own identifying
+    information: Portions Copyright [yyyy] [name of copyright owner]
+
+    CDDL HEADER END
+
+-- Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+-- Use is subject to license terms.
+-->
+<HEAD>
+	<TITLE> </TITLE>
+	 
+	
+</HEAD>
+<BODY>
+<!-- ident	"%Z%%M%	%I%	%E% SMI" -->
+
+When ZFS File System Management is in the Rights Included column, it grants the
+right to use commands needed to manage ZFS filesystems.  This includes creating
+and destroying filesystems, taking snapshots and clones, getting and setting
+properties, and mounting and unmounting filesystems.
+<p>
+If ZFS File System Management is grayed, then you are not entitled to Add or
+Remove this right.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/profiles/RtZFSStorageMngmnt.html b/usr/src/lib/libsecdb/help/profiles/RtZFSStorageMngmnt.html
new file mode 100644
index 000000000000..6317b6202a1e
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/profiles/RtZFSStorageMngmnt.html
@@ -0,0 +1,43 @@
+<HTML>
+<!--
+    CDDL HEADER START
+
+    The contents of this file are subject to the terms of the
+    Common Development and Distribution License, Version 1.0 only
+    (the "License").  You may not use this file except in compliance
+    with the License.
+
+    You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+    or http://www.opensolaris.org/os/licensing.
+    See the License for the specific language governing permissions
+    and limitations under the License.
+
+    When distributing Covered Code, include this CDDL HEADER in each
+    file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+    If applicable, add the following below this CDDL HEADER, with the
+    fields enclosed by brackets "[]" replaced with your own identifying
+    information: Portions Copyright [yyyy] [name of copyright owner]
+
+    CDDL HEADER END
+
+-- Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+-- Use is subject to license terms.
+-->
+<HEAD>
+	<TITLE> </TITLE>
+	 
+	
+</HEAD>
+<BODY>
+<!-- ident	"%Z%%M%	%I%	%E% SMI" -->
+
+When ZFS Storage Management is in the Rights Included column, it grants the
+right to use commands needed to manage ZFS Storage Pools.  This includes
+creating and destroying pools, adding and removing devices, replacing devices,
+and importing and exporting storage pools.
+<p>
+If ZFS Storage Management is grayed, then you are not entitled to Add or Remove
+this right.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/prof_attr.txt b/usr/src/lib/libsecdb/prof_attr.txt
index 131184f61891..913d2b94dd78 100644
--- a/usr/src/lib/libsecdb/prof_attr.txt
+++ b/usr/src/lib/libsecdb/prof_attr.txt
@@ -66,6 +66,8 @@ Crypto Management:::Cryptographic Framework Administration:help=RtCryptoMngmnt.h
 Kerberos Client Management:::Maintain and Administer Kerberos excluding the servers:help=RtKerberosClntMngmnt.html
 Kerberos Server Management:::Maintain and Administer Kerberos Servers:profiles=Kerberos Client Management;help=RtKerberosSrvrMngmnt.html
 DAT Administration:::Manage the DAT configuration:help=RtDatAdmin.html
+ZFS File System Management:::Create and Manage ZFS File Systems:help=RtZFSFileSysMngmnt.html
+ZFS Storage Management:::Create and Manage ZFS Storage Pools:help=RtZFSStorageMngmnt.html
 Zone Management:::Zones Virtual Application Environment Administration:help=RtZoneMngmnt.html
 IP Filter Management:::IP Filter Administration:help=RtIPFilterMngmnt.html
 Project Management:::Add/Modify/Remove projects:help=RtProjManagement.html
diff --git a/usr/src/lib/libzfs/Makefile b/usr/src/lib/libzfs/Makefile
new file mode 100644
index 000000000000..5a5e6abd8410
--- /dev/null
+++ b/usr/src/lib/libzfs/Makefile
@@ -0,0 +1,67 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include		../Makefile.lib
+
+HDRS=		libzfs.h
+
+HDRDIR=		common
+
+SUBDIRS=	$(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all :=		TARGET= all
+clean :=	TARGET= clean
+clobber :=	TARGET= clobber
+install :=	TARGET= install
+lint :=		TARGET= lint
+
+MSGFILES =	common/libzfs_dataset.c common/libzfs_mount.c \
+		common/libzfs_util.c
+POFILE =	libzfs.po
+
+.KEEP_STATE:
+
+all clean clobber install: spec .WAIT $(SUBDIRS)
+
+$(POFILE):	pofile_MSGFILES
+
+lint: $(SUBDIRS)
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+_msg: $(MSGDOMAINPOFILE)
+
+$(SUBDIRS) spec: FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
+include ../../Makefile.msg.targ
diff --git a/usr/src/lib/libzfs/Makefile.com b/usr/src/lib/libzfs/Makefile.com
new file mode 100644
index 000000000000..f8d17fbf1a85
--- /dev/null
+++ b/usr/src/lib/libzfs/Makefile.com
@@ -0,0 +1,68 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+LIBRARY= libzfs.a
+VERS= .1
+
+OBJS_SHARED= zfs_namecheck.o zfs_prop.o
+OBJS_COMMON= libzfs_dataset.o libzfs_util.o libzfs_graph.o libzfs_mount.o \
+	libzfs_pool.o libzfs_changelist.o libzfs_config.o libzfs_import.o \
+	libzfs_status.o
+OBJECTS= $(OBJS_COMMON) $(OBJS_SHARED)
+
+include ../../Makefile.lib
+
+LIBS=	$(DYNLIB) $(LINTLIB)
+
+INCS += -I$(SRCDIR)
+INCS += -I../../../uts/common/fs/zfs
+INCS += -I../../../common/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+LDLIBS +=	-lc -lm -ldevinfo -ldevid -lgen -lnvpair -luutil
+CPPFLAGS +=	$(INCS) -D_REENTRANT
+
+SRCS=	$(OBJS_COMMON:%.o=$(SRCDIR)/%.c)	\
+	$(OBJS_SHARED:%.o=$(SRC)/common/zfs/%.c)
+$(LINTLIB) := SRCS=	$(SRCDIR)/$(LINTSRC)
+
+SRCDIR=		../common
+MAPDIR=		../spec/$(TRANSMACH)
+SPECMAPFILE=	$(MAPDIR)/mapfile
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: lintcheck
+
+pics/%.o: ../../../common/zfs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/libzfs/amd64/Makefile b/usr/src/lib/libzfs/amd64/Makefile
new file mode 100644
index 000000000000..44075ed1bddf
--- /dev/null
+++ b/usr/src/lib/libzfs/amd64/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
new file mode 100644
index 000000000000..a9caff662c1b
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_LIBZFS_H
+#define	_LIBZFS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <libnvpair.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/varargs.h>
+#include <sys/fs/zfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Miscellaneous ZFS constants
+ */
+#define	ZFS_MAXNAMELEN		MAXNAMELEN
+#define	ZPOOL_MAXNAMELEN	MAXNAMELEN
+#define	ZFS_MAXPROPLEN		MAXPATHLEN
+
+/*
+ * Basic handle types
+ */
+typedef struct zfs_handle zfs_handle_t;
+typedef struct zpool_handle zpool_handle_t;
+
+/*
+ * Basic handle functions
+ */
+extern zpool_handle_t *zpool_open(const char *);
+extern zpool_handle_t *zpool_open_canfail(const char *);
+extern void zpool_close(zpool_handle_t *);
+extern const char *zpool_get_name(zpool_handle_t *);
+extern uint64_t zpool_get_guid(zpool_handle_t *);
+extern uint64_t zpool_get_space_used(zpool_handle_t *);
+extern uint64_t zpool_get_space_total(zpool_handle_t *);
+extern int zpool_get_root(zpool_handle_t *, char *, size_t);
+extern int zpool_get_state(zpool_handle_t *);
+
+/*
+ * Iterate over all active pools in the system.
+ */
+typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
+extern int zpool_iter(zpool_iter_f, void *);
+
+/*
+ * Functions to create and destroy pools
+ */
+extern int zpool_create(const char *, nvlist_t *, const char *);
+extern int zpool_destroy(zpool_handle_t *);
+extern int zpool_add(zpool_handle_t *, nvlist_t *);
+
+/*
+ * Functions to manipulate pool and vdev state
+ */
+extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
+
+extern int zpool_vdev_online(zpool_handle_t *, const char *);
+extern int zpool_vdev_offline(zpool_handle_t *, const char *);
+extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *,
+    nvlist_t *, int);
+extern int zpool_vdev_detach(zpool_handle_t *, const char *);
+
+/*
+ * Pool health statistics.
+ */
+typedef enum {
+	/*
+	 * The following correspond to faults as defined in the (fault.fs.zfs.*)
+	 * event namespace.  Each is associated with a corresponding message ID.
+	 */
+	ZPOOL_STATUS_CORRUPT_CACHE,	/* corrupt /kernel/drv/zpool.cache */
+	ZPOOL_STATUS_MISSING_DEV_R,	/* missing device with replicas */
+	ZPOOL_STATUS_MISSING_DEV_NR,	/* missing device with no replicas */
+	ZPOOL_STATUS_CORRUPT_LABEL_R,	/* bad device label with replicas */
+	ZPOOL_STATUS_CORRUPT_LABEL_NR,	/* bad device label with no replicas */
+	ZPOOL_STATUS_BAD_GUID_SUM,	/* sum of device guids didn't match */
+	ZPOOL_STATUS_CORRUPT_POOL,	/* pool metadata is corrupted */
+	ZPOOL_STATUS_CORRUPT_DATA,	/* data errors in user (meta)data */
+	ZPOOL_STATUS_FAILING_DEV,	/* device experiencing errors */
+	ZPOOL_STATUS_VERSION_MISMATCH,	/* bad on-disk version */
+
+	/*
+	 * The following are not faults per se, but still an error possibly
+	 * requiring administrative attention.  There is no corresponding
+	 * message ID.
+	 */
+	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
+	ZPOOL_STATUS_OFFLINE_DEV,	/* device offline */
+
+	/*
+	 * Finally, the following indicates a healthy pool.
+	 */
+	ZPOOL_STATUS_OK
+} zpool_status_t;
+
+extern zpool_status_t zpool_get_status(zpool_handle_t *, char **msgid);
+extern zpool_status_t zpool_import_status(nvlist_t *, char **msgid);
+
+/*
+ * Statistics and configuration functions.
+ */
+extern nvlist_t *zpool_get_config(zpool_handle_t *);
+extern int zpool_refresh_stats(zpool_handle_t *,
+    nvlist_t **oldconfig, nvlist_t **newconfig);
+
+/*
+ * Import and export functions
+ */
+extern int zpool_export(zpool_handle_t *);
+extern int zpool_import(nvlist_t *, const char *, const char *);
+
+/*
+ * Search for pools to import
+ */
+extern nvlist_t *zpool_find_import(int argc, char **argv);
+
+/*
+ * Basic handle manipulations.  These functions do not create or destroy the
+ * underlying datasets, only the references to them.
+ */
+extern zfs_handle_t *zfs_open(const char *, int);
+extern void zfs_close(zfs_handle_t *);
+extern zfs_type_t zfs_get_type(const zfs_handle_t *);
+extern const char *zfs_get_name(const zfs_handle_t *);
+
+typedef enum {	/* one bit per source; ZFS_SRC_ALL (0x1f) is their union */
+	ZFS_SRC_NONE = 0x1,
+	ZFS_SRC_DEFAULT = 0x2,
+	ZFS_SRC_TEMPORARY = 0x4,
+	ZFS_SRC_LOCAL = 0x8,
+	ZFS_SRC_INHERITED = 0x10
+} zfs_source_t;
+
+#define	ZFS_SRC_ALL	0x1f
+
+/*
+ * Property management functions.  Some functions are shared with the kernel,
+ * and are found in fs/zfs.h.
+ */
+const char *zfs_prop_to_name(zfs_prop_t);
+int zfs_prop_set(zfs_handle_t *, zfs_prop_t, const char *);
+int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zfs_source_t *,
+    char *, size_t, int);
+int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zfs_source_t *,
+    char *, size_t);
+uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
+int zfs_prop_validate(zfs_prop_t, const char *, uint64_t *);
+int zfs_prop_inheritable(zfs_prop_t);
+int zfs_prop_inherit(zfs_handle_t *, zfs_prop_t);
+const char *zfs_prop_values(zfs_prop_t);
+int zfs_prop_valid_for_type(zfs_prop_t, int);
+void zfs_prop_default_string(zfs_prop_t prop, char *buf, size_t buflen);
+uint64_t zfs_prop_default_numeric(zfs_prop_t);
+int zfs_prop_is_string(zfs_prop_t prop);
+const char *zfs_prop_column_name(zfs_prop_t);
+const char *zfs_prop_column_format(zfs_prop_t);
+char ** zfs_prop_column_subopts(void);
+char ** zfs_prop_column_short_subopts(void);
+
+#define	ZFS_MOUNTPOINT_NONE	"none"
+#define	ZFS_MOUNTPOINT_LEGACY	"legacy"
+
+/*
+ * Iterator functions.
+ */
+typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
+extern int zfs_iter_root(zfs_iter_f, void *);
+extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_dependents(zfs_handle_t *, zfs_iter_f, void *);
+
+/*
+ * Functions to create and destroy datasets.
+ */
+extern int zfs_create(const char *, zfs_type_t, const char *, const char *);
+extern int zfs_destroy(zfs_handle_t *);
+extern int zfs_clone(zfs_handle_t *, const char *);
+extern int zfs_snapshot(const char *);
+extern int zfs_rollback(zfs_handle_t *);
+extern int zfs_rename(zfs_handle_t *, const char *);
+extern int zfs_backup(zfs_handle_t *, zfs_handle_t *);
+extern int zfs_restore(const char *, int, int, int);
+
+/*
+ * Miscellaneous functions.
+ */
+extern const char *zfs_type_to_name(zfs_type_t);
+extern void zfs_refresh_properties(zfs_handle_t *);
+extern int zfs_name_valid(const char *, zfs_type_t);
+
+/*
+ * Mount support functions.
+ */
+extern int zfs_is_mounted(zfs_handle_t *, char **);
+extern int zfs_mount(zfs_handle_t *, const char *, int);
+extern int zfs_unmount(zfs_handle_t *, const char *, int);
+extern int zfs_unmountall(zfs_handle_t *, int);
+
+/*
+ * Share support functions.
+ */
+extern int zfs_is_shared(zfs_handle_t *, char **);
+extern int zfs_share(zfs_handle_t *);
+extern int zfs_unshare(zfs_handle_t *, const char *);
+extern int zfs_unshareall(zfs_handle_t *);
+
+/*
+ * For clients that need to capture error output.
+ */
+extern void zfs_set_error_handler(void (*)(const char *, va_list));
+
+/*
+ * When dealing with nvlists, verify() is extremely useful
+ */
+#ifdef NDEBUG
+#define	verify(EX)	((void)(EX))
+#else
+#define	verify(EX)	assert(EX)
+#endif
+
+/*
+ * Utility function to convert a number to a human-readable form.
+ */
+extern void zfs_nicenum(uint64_t, char *, size_t);
+extern int zfs_nicestrtonum(const char *, uint64_t *);
+
+/*
+ * Pool destroy special.  Remove the device information without destroying
+ * the underlying dataset.
+ */
+extern int zfs_remove_link(zfs_handle_t *);
+
+/*
+ * Given a device or file, determine if it is part of a pool.
+ */
+extern int zpool_in_use(int fd, char **state,
+    char **name);
+
+/*
+ * fstyp special.  Read the label from a given device.
+ */
+extern nvlist_t *zpool_read_label(int fd);
+
+/*
+ * Create and remove zvol /dev links
+ */
+extern int zpool_create_zvol_links(zpool_handle_t *);
+extern int zpool_remove_zvol_links(zpool_handle_t *);
+
+/*
+ * zoneadmd hack
+ */
+extern void zfs_init(void);
+
+/*
+ * Useful defines
+ */
+#ifndef TRUE
+#define	TRUE	1
+#endif
+#ifndef FALSE
+#define	FALSE	0
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBZFS_H */
diff --git a/usr/src/lib/libzfs/common/libzfs_changelist.c b/usr/src/lib/libzfs/common/libzfs_changelist.c
new file mode 100644
index 000000000000..497461e19f99
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_changelist.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zone.h>
+
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+
+/*
+ * Structure to keep track of dataset state.  Before changing the 'sharenfs' or
+ * 'mountpoint' property, we record whether the filesystem was previously
+ * mounted/shared.  This prior state dictates whether we remount/reshare the
+ * dataset after the property has been changed.
+ *
+ * The interface consists of the following sequence of functions:
+ *
+ * 	changelist_gather()
+ * 	changelist_prefix()
+ * 	< change property >
+ * 	changelist_postfix()
+ * 	changelist_free()
+ *
+ * Other interfaces:
+ *
+ * changelist_rename() - renames all datasets appropriately when doing a rename
+ * changelist_unshare() - unshares all the nodes in a given changelist
+ * changelist_haszonedchild() - check if there is any child exported to
+ *				a local zone
+ */
+typedef struct prop_changenode {
+	zfs_handle_t		*cn_handle;	/* dataset this node tracks */
+	int			cn_shared;	/* shared when gathered? */
+	int			cn_mounted;	/* mounted when gathered? */
+	int			cn_zoned;	/* 'zoned' when gathered */
+	uu_list_node_t		cn_listnode;	/* linkage on cl_list */
+} prop_changenode_t;
+
+struct prop_changelist {
+	zfs_prop_t		cl_prop;	/* property gathered on */
+	zfs_prop_t		cl_realprop;	/* property caller passed */
+	uu_list_pool_t		*cl_pool;	/* pool backing cl_list */
+	uu_list_t		*cl_list;	/* prop_changenode_t nodes */
+	int			cl_waslegacy;	/* was legacy/none/off */
+	int			cl_allchildren;	/* ignore property source */
+	int			cl_flags;	/* given to zfs_unmount() */
+	int			cl_haszonedchild; /* zoned child seen */
+};
+
+/*
+ * If the property is 'mountpoint', go through and unmount filesystems as
+ * necessary.  We don't do the same for 'sharenfs', because we can just re-share
+ * with different options without interrupting service.
+ */
+int
+changelist_prefix(prop_changelist_t *clp)
+{
+	prop_changenode_t *node;
+	int status = 0;
+
+	if (clp->cl_prop != ZFS_PROP_MOUNTPOINT)
+		return (status);
+
+	for (node = uu_list_first(clp->cl_list); node != NULL;
+	    node = uu_list_next(clp->cl_list, node)) {
+		zfs_handle_t *zhp = node->cn_handle;
+		int failed;
+
+		/* In the global zone, skip datasets given to a local zone. */
+		if ((getzoneid() == GLOBAL_ZONEID) && node->cn_zoned)
+			continue;
+
+		/*
+		 * A volume being renamed loses its /dev/zvol links; anything
+		 * else is unmounted with the flags given at gather time.
+		 */
+		if (zhp->zfs_volblocksize && clp->cl_realprop == ZFS_PROP_NAME)
+			failed = (zvol_remove_link(zhp->zfs_name) != 0);
+		else
+			failed = (zfs_unmount(zhp, NULL, clp->cl_flags) != 0);
+		if (failed)
+			status = -1;
+	}
+
+	return (status);
+}
+
+/*
+ * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or
+ * reshare the filesystems as necessary.  In changelist_gather() we recorded
+ * whether the filesystem was previously shared or mounted.  The action we take
+ * depends on the previous state, and whether the value was previously 'legacy'.
+ * For non-legacy properties, we only remount/reshare the filesystem if it was
+ * previously mounted/shared.  Otherwise, we always remount/reshare the
+ * filesystem.
+ */
+int
+changelist_postfix(prop_changelist_t *clp)
+{
+	prop_changenode_t *cn;
+	int ret = 0;
+
+	/*
+	 * If we're changing the mountpoint, attempt to destroy the underlying
+	 * mountpoint.  All other datasets will have inherited from this dataset
+	 * (in which case their mountpoints exist in the filesystem in the new
+	 * location), or have explicit mountpoints set (in which case they won't
+	 * be in the changelist).
+	 */
+	if ((cn = uu_list_last(clp->cl_list)) == NULL)
+		return (0);
+
+	if (clp->cl_prop == ZFS_PROP_MOUNTPOINT)
+		remove_mountpoint(cn->cn_handle);
+
+	/*
+	 * We walk the datasets in reverse, because we want to mount any parent
+	 * datasets before mounting the children.
+	 */
+	for (cn = uu_list_last(clp->cl_list); cn != NULL;
+	    cn = uu_list_prev(clp->cl_list, cn)) {
+		/* In the global zone, skip datasets given to a local zone. */
+		if ((getzoneid() == GLOBAL_ZONEID) && cn->cn_zoned)
+			continue;
+
+		zfs_refresh_properties(cn->cn_handle);
+
+		/*
+		 * If this is a volume and we're doing a rename, recreate the
+		 * /dev/zvol links.
+		 */
+		if (cn->cn_handle->zfs_volblocksize &&
+		    clp->cl_realprop == ZFS_PROP_NAME) {
+			if (zvol_create_link(cn->cn_handle->zfs_name) != 0)
+				ret = -1;
+			continue;
+		}
+
+		if ((clp->cl_waslegacy || cn->cn_mounted) &&
+		    !zfs_is_mounted(cn->cn_handle, NULL) &&
+		    zfs_mount(cn->cn_handle, NULL, 0) != 0)
+			ret = -1;
+
+		/*
+		 * Always re-share, even if currently shared, to adopt any new
+		 * options.  A failure sets 'ret'; success never clears it.
+		 */
+		if ((cn->cn_shared ||
+		    (clp->cl_prop == ZFS_PROP_SHARENFS && clp->cl_waslegacy))) {
+			char shareopts[ZFS_MAXPROPLEN];
+			int err;
+			if (zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS,
+			    shareopts, sizeof (shareopts), NULL, NULL, 0,
+			    FALSE) == 0 && strcmp(shareopts, "off") == 0)
+				err = zfs_unshare(cn->cn_handle, NULL);
+			else
+				err = zfs_share(cn->cn_handle);
+			if (err != 0)
+				ret = -1;
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * If we rename a filesystem, any child filesystem handles are no longer valid,
+ * since we identify datasets by their name in the ZFS namespace.  So, we have
+ * to go through and fix up all the names appropriately.  We could do this
+ * automatically if libzfs kept track of all open handles, but this is a lot
+ * less work.
+ */
+void
+changelist_rename(prop_changelist_t *clp, const char *src, const char *dst)
+{
+	prop_changenode_t *cn;
+	char newname[ZFS_MAXNAMELEN];
+
+	for (cn = uu_list_first(clp->cl_list); cn != NULL;
+	    cn = uu_list_next(clp->cl_list, cn)) {
+		/* Destroy the previous mountpoint if needed. */
+		remove_mountpoint(cn->cn_handle);
+
+		/* 'dst' plus the old suffix may not fit: bound the append. */
+		(void) strlcpy(newname, dst, sizeof (newname));
+		(void) strlcat(newname, cn->cn_handle->zfs_name + strlen(src),
+		    sizeof (newname));
+
+		(void) strlcpy(cn->cn_handle->zfs_name, newname,
+		    sizeof (cn->cn_handle->zfs_name));
+	}
+}
+
+/*
+ * Given a gathered changelist for the "sharenfs" property,
+ * unshare all the nodes in the list.
+ */
+int
+changelist_unshare(prop_changelist_t *clp)
+{
+	prop_changenode_t *node;
+	int status = 0;
+
+	if (clp->cl_prop == ZFS_PROP_SHARENFS) {
+		node = uu_list_first(clp->cl_list);
+		while (node != NULL) {
+			/* Keep going after a failure; just remember it. */
+			if (zfs_unshare(node->cn_handle, NULL) != 0)
+				status = -1;
+			node = uu_list_next(clp->cl_list, node);
+		}
+	}
+
+	return (status);
+}
+
+/*
+ * Check if there is any child exported to a local zone in a
+ * given changelist. This information has already been recorded
+ * while gathering the changelist via changelist_gather().
+ */
+int
+changelist_haszonedchild(prop_changelist_t *clp)
+{
+	return (clp->cl_haszonedchild);	/* set by change_one() */
+}
+
+/*
+ * Release any memory associated with a changelist.
+ */
+void
+changelist_free(prop_changelist_t *clp)
+{
+	prop_changenode_t *cn;
+	uu_list_walk_t *walk;
+
+	verify((walk = uu_list_walk_start(clp->cl_list,
+	    UU_WALK_ROBUST)) != NULL);
+
+	while ((cn = uu_list_walk_next(walk)) != NULL) {
+		uu_list_remove(clp->cl_list, cn);
+		zfs_close(cn->cn_handle);
+		free(cn);
+	}
+
+	/* Release the walker and the emptied list before their pool. */
+	uu_list_walk_end(walk);
+	uu_list_destroy(clp->cl_list);
+	uu_list_pool_destroy(clp->cl_pool);
+	free(clp);
+}
+
+static int
+change_one(zfs_handle_t *zhp, void *data)
+{
+	prop_changelist_t *clp = data;
+	char property[ZFS_MAXPROPLEN];
+	char where[64];
+	prop_changenode_t *cn;
+	zfs_source_t sourcetype;
+
+	/*
+	 * We only want to unmount/unshare those filesystems which may inherit
+	 * from the target filesystem.  If we find any filesystem with a
+	 * locally set mountpoint, we ignore its children since changing the
+	 * property will not affect them.  For a rename we take all children
+	 * regardless (they must be unmounted first), and a volume being
+	 * renamed is always added.  We own 'zhp': every path that does not
+	 * put it on the list must close it.
+	 */
+	if (!(zhp->zfs_volblocksize && clp->cl_realprop == ZFS_PROP_NAME) &&
+	    zfs_prop_get(zhp, clp->cl_prop, property,
+	    sizeof (property), &sourcetype, where, sizeof (where),
+	    FALSE) != 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (clp->cl_allchildren || sourcetype == ZFS_SRC_DEFAULT ||
+	    sourcetype == ZFS_SRC_INHERITED) {
+		cn = zfs_malloc(sizeof (prop_changenode_t));
+
+		cn->cn_handle = zhp;
+		cn->cn_mounted = zfs_is_mounted(zhp, NULL);
+		cn->cn_shared = zfs_is_shared(zhp, NULL);
+		cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+		/* indicate if any child is exported to a local zone */
+		if ((getzoneid() == GLOBAL_ZONEID) && cn->cn_zoned)
+			clp->cl_haszonedchild = TRUE;
+
+		uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool);
+		verify(uu_list_insert_before(clp->cl_list,
+		    uu_list_first(clp->cl_list), cn) == 0);
+
+		return (zfs_iter_children(zhp, change_one, data));
+	}
+
+	zfs_close(zhp);
+	return (0);
+}
+
+
+/*
+ * Given a ZFS handle and a property, construct a complete list of datasets that
+ * need to be modified as part of this process.  For anything but the
+ * 'mountpoint' and 'sharenfs' properties, this just returns an empty list.
+ * Otherwise, we iterate over all children and look for any datasets which
+ * inherit this property.  For each such dataset, we add it to the list and mark
+ * whether it was shared beforehand.
+ */
+prop_changelist_t *
+changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int flags)
+{
+	prop_changelist_t *clp = zfs_malloc(sizeof (prop_changelist_t));
+	prop_changenode_t *cn;
+	zfs_handle_t *temp;
+	char property[ZFS_MAXPROPLEN];
+
+	clp->cl_pool = uu_list_pool_create("changelist_pool",
+	    sizeof (prop_changenode_t),
+	    offsetof(prop_changenode_t, cn_listnode),
+	    NULL, 0);
+	assert(clp->cl_pool != NULL);
+
+	clp->cl_list = uu_list_create(clp->cl_pool, NULL, 0);
+	clp->cl_flags = flags;
+
+	/*
+	 * If this is a rename or the 'zoned' property, we pretend we're
+	 * changing the mountpoint and flag it so we can catch all children in
+	 * change_one().
+	 */
+	if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED) {
+		clp->cl_prop = ZFS_PROP_MOUNTPOINT;
+		clp->cl_allchildren = TRUE;
+	} else {
+		clp->cl_prop = prop;
+	}
+	clp->cl_realprop = prop;
+
+	if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
+	    clp->cl_prop != ZFS_PROP_SHARENFS)
+		return (clp);
+
+	if (zfs_iter_children(zhp, change_one, clp) != 0) {
+		changelist_free(clp);
+		return (NULL);
+	}
+
+	/*
+	 * Re-open ourselves: every handle on the list is auto-closed.  On
+	 * failure free the whole list (children included), not just 'clp'.
+	 */
+	if ((temp = zfs_open(zfs_get_name(zhp), ZFS_TYPE_ANY)) == NULL) {
+		changelist_free(clp);
+		return (NULL);
+	}
+
+	/*
+	 * Always add ourself to the list.  We add ourselves to the end so that
+	 * we're the last to be unmounted.
+	 */
+	cn = zfs_malloc(sizeof (prop_changenode_t));
+	cn->cn_handle = temp;
+	cn->cn_mounted = zfs_is_mounted(temp, NULL);
+	cn->cn_shared = zfs_is_shared(temp, NULL);
+	cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+	uu_list_node_init(cn, &cn->cn_listnode, clp->cl_pool);
+	verify(uu_list_insert_after(clp->cl_list,
+	    uu_list_last(clp->cl_list), cn) == 0);
+
+	/*
+	 * If the property was previously 'legacy' or 'none', record this fact,
+	 * as the behavior of changelist_postfix() will be different.
+	 */
+	if (zfs_prop_get(zhp, prop, property, sizeof (property),
+	    NULL, NULL, 0, FALSE) == 0 &&
+	    (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0 ||
+	    strcmp(property, "off") == 0))
+		clp->cl_waslegacy = TRUE;
+
+	return (clp);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_config.c b/usr/src/lib/libzfs/common/libzfs_config.c
new file mode 100644
index 000000000000..4c5a22a45957
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_config.c
@@ -0,0 +1,309 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
+ * single packed nvlist.  While it would be nice to just read in this
+ * file from userland, this wouldn't work from a local zone.  So we have to have
+ * a zpool ioctl to return the complete configuration for all pools.  In the
+ * global zone, this will be identical to reading the file and unpacking it in
+ * userland.
+ */
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <string.h>
+#include <unistd.h>
+#include <libintl.h>
+#include <libuutil.h>
+
+#include "libzfs_impl.h"
+
+static uu_avl_t *namespace_avl;
+static uint64_t namespace_generation;
+
+typedef struct config_node {
+	char		*cn_name;	/* pool name; AVL sort key */
+	nvlist_t	*cn_config;	/* private copy of pool config */
+	uu_avl_node_t	cn_avl;		/* linkage in namespace_avl */
+} config_node_t;
+
+/* ARGSUSED */
+static int
+config_node_compare(const void *a, const void *b, void *unused)
+{
+	const config_node_t *lhs = (config_node_t *)a;
+	const config_node_t *rhs = (config_node_t *)b;
+	int order;
+
+	/*
+	 * strcmp() may return any negative or positive magnitude; the
+	 * value handed back here is always exactly -1, 0 or 1.
+	 */
+	order = strcmp(lhs->cn_name, rhs->cn_name);
+	if (order == 0)
+		return (0);
+
+	return (order < 0 ? -1 : 1);
+}
+
+/*
+ * Loads the pool namespace, or re-loads it if the cache has changed.
+ */
+static void
+namespace_reload(void)
+{
+	nvlist_t *config;
+	config_node_t *cn;
+	nvpair_t *elem;
+	zfs_cmd_t zc = { 0 };
+	uu_avl_walk_t *walk;
+
+	if (namespace_generation == 0) {
+		/*
+		 * This is the first time we've accessed the configuration
+		 * cache.  Initialize the AVL tree and then fall through to the
+		 * common code.
+		 */
+		uu_avl_pool_t *pool;
+
+		if ((pool = uu_avl_pool_create("config_pool",
+		    sizeof (config_node_t),
+		    offsetof(config_node_t, cn_avl),
+		    config_node_compare, UU_DEFAULT)) == NULL)
+			no_memory();
+
+		if ((namespace_avl = uu_avl_create(pool, NULL,
+		    UU_DEFAULT)) == NULL)
+			no_memory();
+	}
+
+	/*
+	 * Issue the ZFS_IOC_POOL_CONFIGS ioctl.
+	 * This can fail for one of two reasons:
+	 *
+	 * 	EEXIST		The generation counts match, nothing to do.
+	 * 	ENOMEM		The zc_config_dst buffer isn't large enough to
+	 * 			hold the config; zc_config_dst_size will have
+	 *			been modified to tell us how much to allocate.
+	 */
+	zc.zc_config_dst_size = 1024;
+	zc.zc_config_dst = (uint64_t)(uintptr_t)
+	    zfs_malloc(zc.zc_config_dst_size);
+	for (;;) {
+		zc.zc_cookie = namespace_generation;
+		if (ioctl(zfs_fd, ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
+			switch (errno) {
+			case EEXIST:
+				/* The namespace hasn't changed. */
+				free((void *)(uintptr_t)zc.zc_config_dst);
+				return;
+
+			case ENOMEM:
+				free((void *)(uintptr_t)zc.zc_config_dst);
+				zc.zc_config_dst = (uint64_t)(uintptr_t)
+				    zfs_malloc(zc.zc_config_dst_size);
+				break;
+
+			default:
+				zfs_baderror(errno);
+			}
+		} else {
+			namespace_generation = zc.zc_cookie;
+			break;
+		}
+	}
+
+	verify(nvlist_unpack((void *)(uintptr_t)zc.zc_config_dst,
+	    zc.zc_config_dst_size, &config, 0) == 0);
+
+	free((void *)(uintptr_t)zc.zc_config_dst);
+
+	/*
+	 * Clear out any existing configuration information.
+	 */
+	if ((walk = uu_avl_walk_start(namespace_avl, UU_WALK_ROBUST)) == NULL)
+		no_memory();
+
+	while ((cn = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(namespace_avl, cn);
+		nvlist_free(cn->cn_config);
+		free(cn->cn_name);
+		free(cn);
+	}
+	/* The walker was allocated by uu_avl_walk_start(); release it. */
+	uu_avl_walk_end(walk);
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(config, elem)) != NULL) {
+		nvlist_t *child;
+		uu_avl_index_t where;
+
+		cn = zfs_malloc(sizeof (config_node_t));
+		cn->cn_name = zfs_strdup(nvpair_name(elem));
+
+		verify(nvpair_value_nvlist(elem, &child) == 0);
+		verify(nvlist_dup(child, &cn->cn_config, 0) == 0);
+		verify(uu_avl_find(namespace_avl, cn, NULL, &where) == NULL);
+
+		uu_avl_insert(namespace_avl, cn, where);
+	}
+
+	nvlist_free(config);
+}
+
+/*
+ * Retrieve the configuration for the given pool.  The configuration is a nvlist
+ * describing the vdevs, as well as the statistics associated with each one.
+ */
+nvlist_t *
+zpool_get_config(zpool_handle_t *zhp)
+{
+	return (zhp->zpool_config);	/* the handle's own list, not a copy */
+}
+
+/*
+ * Refresh the vdev statistics associated with the given pool.  This is used in
+ * iostat to show configuration changes and determine the delta from the last
+ * time the function was called.  This function can fail, in case the pool has
+ * been destroyed.
+ */
+int
+zpool_refresh_stats(zpool_handle_t *zhp, nvlist_t **oldconfig,
+    nvlist_t **newconfig)
+{
+	zfs_cmd_t zc = { 0 };
+	int error;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	/* zc_name is a fixed-size field; the copy above is bounded by it. */
+	if (zhp->zpool_config_size == 0)
+		zhp->zpool_config_size = 1 << 16;
+
+	zc.zc_config_dst_size = zhp->zpool_config_size;
+	zc.zc_config_dst = (uint64_t)(uintptr_t)
+	    zfs_malloc(zc.zc_config_dst_size);
+
+	while ((error = ioctl(zfs_fd, ZFS_IOC_POOL_STATS, &zc)) != 0) {
+		error = errno;
+
+		if (error == ENXIO) {
+			/*
+			 * We can't open one or more top-level vdevs,
+			 * but we have the config.
+			 */
+			break;
+		}
+
+		free((void *)(uintptr_t)zc.zc_config_dst);
+
+		if (error == ENOENT || error == EINVAL) {
+			/*
+			 * There's no such pool (ENOENT)
+			 * or the config is bogus (EINVAL).
+			 */
+			return (error);
+		}
+
+		if (error != ENOMEM)
+			zfs_baderror(error);
+
+		zc.zc_config_dst =
+		    (uint64_t)(uintptr_t)zfs_malloc(zc.zc_config_dst_size);
+	}
+
+	verify(nvlist_unpack((void *)(uintptr_t)zc.zc_config_dst,
+	    zc.zc_config_dst_size, newconfig, 0) == 0);
+
+	zhp->zpool_config_size = zc.zc_config_dst_size;
+	free((void *)(uintptr_t)zc.zc_config_dst);
+
+	set_pool_health(*newconfig);
+
+	if (oldconfig != NULL)
+		*oldconfig = zhp->zpool_config;
+	else
+		nvlist_free(zhp->zpool_config);
+
+	zhp->zpool_config = *newconfig;
+
+	return (error);
+}
+
+/*
+ * Iterate over all pools in the system.
+ */
+int
+zpool_iter(zpool_iter_f func, void *data)
+{
+	config_node_t *node;
+	int status = 0;
+
+	namespace_reload();
+
+	for (node = uu_avl_first(namespace_avl); node != NULL;
+	    node = uu_avl_next(namespace_avl, node)) {
+		zpool_handle_t *pool = zpool_open_silent(node->cn_name);
+
+		/* Skip pools that won't open; stop at the first failure. */
+		if (pool == NULL)
+			continue;
+		if ((status = func(pool, data)) != 0)
+			break;
+	}
+
+	return (status);
+}
+
+/*
+ * Iterate over root datasets, calling the given function for each.  The zfs
+ * handle passed each time must be explicitly closed by the callback.
+ */
+int
+zfs_iter_root(zfs_iter_f func, void *data)
+{
+	config_node_t *node;
+	int status = 0;
+
+	namespace_reload();
+
+	for (node = uu_avl_first(namespace_avl); node != NULL;
+	    node = uu_avl_next(namespace_avl, node)) {
+		zfs_handle_t *root = make_dataset_handle(node->cn_name);
+
+		/* Skip roots with no handle; stop at the first failure. */
+		if (root == NULL)
+			continue;
+		if ((status = func(root, data)) != 0)
+			break;
+	}
+
+	return (status);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
new file mode 100644
index 000000000000..5a4b1d92be19
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -0,0 +1,2939 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <libdevinfo.h>
+#include <libintl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <zone.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <libzfs.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "libzfs_impl.h"
+
+/*
+ * Given a single type (not a mask of types), return the type in a human
+ * readable form.
+ */
+const char *
+zfs_type_to_name(zfs_type_t type)
+{
+	const char *name = NULL;
+
+	if (type == ZFS_TYPE_FILESYSTEM)
+		name = dgettext(TEXT_DOMAIN, "filesystem");
+	else if (type == ZFS_TYPE_SNAPSHOT)
+		name = dgettext(TEXT_DOMAIN, "snapshot");
+	else if (type == ZFS_TYPE_VOLUME)
+		name = dgettext(TEXT_DOMAIN, "volume");
+	else
+		zfs_baderror(type);	/* not one single known type */
+	return (name);
+}
+
+/*
+ * Given a path and mask of ZFS types, return a string describing this dataset.
+ * This is used when we fail to open a dataset and we cannot get an exact type.
+ * We guess what the type would have been based on the path and the mask of
+ * acceptable types.
+ */
+static const char *
+path_to_str(const char *path, int types)
+{
+	int remaining = types;
+
+	/*
+	 * Snapshots first.  A mask of exactly ZFS_TYPE_SNAPSHOT is always
+	 * reported as "snapshot"; a wider mask that merely includes it is
+	 * reported that way only when the path carries the '@' delimiter.
+	 * (The path is not consulted at all for the exact-mask case.)
+	 * Otherwise the snapshot bit is dropped and we choose among what
+	 * is left.
+	 */
+	if (types & ZFS_TYPE_SNAPSHOT) {
+		if (types == ZFS_TYPE_SNAPSHOT || strchr(path, '@') != NULL)
+			return (dgettext(TEXT_DOMAIN, "snapshot"));
+
+		remaining &= ~ZFS_TYPE_SNAPSHOT;
+	}
+
+	/*
+	 * Only the two primitive types remain.  Nothing in the path tells
+	 * them apart a priori, so "filesystem" wins whenever it is one of
+	 * the acceptable types (including when it is the only one).
+	 */
+	if (remaining & ZFS_TYPE_FILESYSTEM)
+		return (dgettext(TEXT_DOMAIN, "filesystem"));
+
+	/*
+	 * By elimination the caller must have asked for volumes, either
+	 * alone or together with a snapshot bit that did not apply.  Any
+	 * other mask is a programming error.
+	 */
+	assert(remaining & ZFS_TYPE_VOLUME);
+
+	return (dgettext(TEXT_DOMAIN, "volume"));
+}
+
+/*
+ * Validate a ZFS path.  This is used even before trying to open the dataset, to
+ * provide a more meaningful error message.  We place a more useful message in
+ * 'buf' detailing exactly why the name was not valid.
+ */
+static int
+zfs_validate_name(const char *path, int type, char *buf, size_t buflen)
+{
+	namecheck_err_t why;
+	char what;
+
+	if (dataset_namecheck(path, &why, &what) == 0) {
+		/*
+		 * The name is well formed.  The one remaining objection is a
+		 * '@' in a name for which the caller did not list snapshots
+		 * among the acceptable types.
+		 */
+		if ((type & ZFS_TYPE_SNAPSHOT) || strchr(path, '@') == NULL)
+			return (1);
+
+		if (buf != NULL)
+			(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+			    "snapshot delimiter '@'"), buflen);
+		return (0);
+	}
+
+	/* Malformed name: say why, but only if the caller wants to know. */
+	if (buf == NULL)
+		return (0);
+
+	switch (why) {
+	case NAME_ERR_LEADING_SLASH:
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN, "leading slash"),
+		    buflen);
+		break;
+	case NAME_ERR_EMPTY_COMPONENT:
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN, "empty component"),
+		    buflen);
+		break;
+	case NAME_ERR_TRAILING_SLASH:
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN, "trailing slash"),
+		    buflen);
+		break;
+	case NAME_ERR_INVALCHAR:
+		(void) snprintf(buf, buflen, dgettext(TEXT_DOMAIN,
+		    "invalid character '%c'"), what);
+		break;
+	case NAME_ERR_MULTIPLE_AT:
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+		    "multiple '@' delimiters"), buflen);
+		break;
+	}
+
+	return (0);
+}
+
+int
+zfs_name_valid(const char *name, zfs_type_t type)
+{
+	return (zfs_validate_name(name, type, NULL, 0)); /* no reason buf */
+}
+
+/*
+ * Utility function to gather stats (objset and zpl) for the given object.
+ */
+static int
+get_stats(zfs_handle_t *zhp)
+{
+	zfs_cmd_t cmd = { 0 };
+	int failed;
+
+	(void) strlcpy(cmd.zc_name, zhp->zfs_name, sizeof (cmd.zc_name));
+
+	/* One ioctl returns both the generic DMU and the per-type stats. */
+	failed = (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &cmd) != 0);
+	if (failed)
+		return (-1);
+
+	/* Cache everything that came back in the handle. */
+	zhp->zfs_volblocksize = cmd.zc_volblocksize;
+	zhp->zfs_volsize = cmd.zc_volsize;
+	bcopy(&cmd.zc_zfs_stats, &zhp->zfs_zplstats,
+	    sizeof (cmd.zc_zfs_stats));
+	bcopy(&cmd.zc_objset_stats, &zhp->zfs_dmustats,
+	    sizeof (cmd.zc_objset_stats));
+
+	return (0);
+}
+
+/*
+ * Refresh the properties currently stored in the handle.
+ */
+void
+zfs_refresh_properties(zfs_handle_t *zhp)
+{
+	(void) get_stats(zhp);	/* best effort: a failure is ignored */
+}
+
+/*
+ * Makes a handle from the given dataset name.  Used by zfs_open() and
+ * zfs_iter_* to create child handles on the fly.
+ */
+zfs_handle_t *
+make_dataset_handle(const char *path)
+{
+	zfs_handle_t *zhp = zfs_malloc(sizeof (zfs_handle_t));
+
+	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
+
+	if (get_stats(zhp) != 0) {
+		int err = errno;	/* zfs_open() switches on this */
+
+		free(zhp);
+		errno = err;
+		return (NULL);
+	}
+
+	/* The stats are in hand; derive the high-level type from them. */
+	if (zhp->zfs_dmustats.dds_is_snapshot)
+		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
+	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
+		zhp->zfs_type = ZFS_TYPE_VOLUME;
+	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
+		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
+	else
+		/* we should never see any other dataset types */
+		zfs_baderror(zhp->zfs_dmustats.dds_type);
+
+	return (zhp);
+}
+
+/*
+ * Opens the given snapshot, filesystem, or volume.   The 'types'
+ * argument is a mask of acceptable types.  The function will print an
+ * appropriate error message and return NULL if it can't be opened.
+ */
+zfs_handle_t *
+zfs_open(const char *path, int types)
+{
+	zfs_handle_t *zhp;
+
+	/*
+	 * If the path is longer than the maximum dataset length, treat it as
+	 * ENOENT because we know there can't be any dataset with that path.
+	 */
+	if (strlen(path) >= ZFS_MAXNAMELEN) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot open '%s': no such %s"), path,
+		    path_to_str(path, types));
+		return (NULL);
+	}
+
+	/*
+	 * Validate the name before we even try to open it.  We pass no buffer
+	 * for the verbose reason here; just report a generic error.
+	 */
+	if (!zfs_validate_name(path, types, NULL, 0)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot open '%s': invalid %s name"), path,
+		    path_to_str(path, types));
+		return (NULL);
+	}
+
+	/*
+	 * Try to get stats for the dataset, which will tell us if it exists.
+	 */
+	errno = 0;	/* don't let a stale errno pick the message */
+	if ((zhp = make_dataset_handle(path)) == NULL) {
+		switch (errno) {
+		case ENOENT:
+			/*
+			 * The dataset doesn't exist.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': no such %s"), path,
+			    path_to_str(path, types));
+			break;
+
+		case EBUSY:
+			/*
+			 * We were able to open the dataset but couldn't
+			 * get the stats.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': %s is busy"), path,
+			    path_to_str(path, types));
+			break;
+
+		default:
+			zfs_baderror(errno);	/* any other errno is unexpected */
+
+		}
+		return (NULL);
+	}
+
+	if (!(types & zhp->zfs_type)) {	/* exists, but not a type asked for */
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': operation "
+		    "not supported for %ss"), path,
+		    zfs_type_to_name(zhp->zfs_type));
+		free(zhp);
+		return (NULL);
+	}
+
+	return (zhp);
+}
+
+/*
+ * Release a ZFS handle.  Nothing to do but free the associated memory.
+ */
+void
+zfs_close(zfs_handle_t *zhp)
+{
+	/* free(NULL) is a no-op, so an unset zfs_mntopts needs no guard. */
+	free(zhp->zfs_mntopts);
+	free(zhp);
+}
+
+struct {
+	const char *name;	/* string form of the value */
+	uint64_t value;		/* matching ZIO_CHECKSUM_* constant */
+} checksum_table[] = {
+	{ "on",		ZIO_CHECKSUM_ON },
+	{ "off",	ZIO_CHECKSUM_OFF },
+	{ "fletcher2",	ZIO_CHECKSUM_FLETCHER_2 },
+	{ "fletcher4",	ZIO_CHECKSUM_FLETCHER_4 },
+	{ "sha256",	ZIO_CHECKSUM_SHA256 },
+	{ NULL }	/* end of table: name == NULL */
+};
+
+struct {
+	const char *name;	/* string form of the value */
+	uint64_t value;		/* matching ZIO_COMPRESS_* constant */
+} compress_table[] = {
+	{ "on",		ZIO_COMPRESS_ON },
+	{ "off",	ZIO_COMPRESS_OFF },
+	{ "lzjb",	ZIO_COMPRESS_LZJB },
+	{ NULL }	/* end of table: name == NULL */
+};
+
+struct {
+	const char *name;	/* string form of the value */
+	uint64_t value;		/* HIDDEN or VISIBLE */
+} snapdir_table[] = {
+	{ "hidden",	HIDDEN },
+	{ "visible",	VISIBLE },
+	{ NULL }	/* end of table: name == NULL */
+};
+
+struct {
+	const char *name;	/* string form of the value */
+	uint64_t value;		/* DISCARD, GROUPMASK or PASSTHROUGH */
+} acl_mode_table[] = {
+	{ "discard",	DISCARD },
+	{ "groupmask",	GROUPMASK },
+	{ "passthrough", PASSTHROUGH },
+	{ NULL }	/* end of table: name == NULL */
+};
+
+struct {
+	const char *name;	/* string form of the value */
+	uint64_t value;		/* DISCARD, NOALLOW, SECURE or PASSTHROUGH */
+} acl_inherit_table[] = {
+	{ "discard",	DISCARD },
+	{ "noallow",	NOALLOW },
+	{ "secure",	SECURE },
+	{ "passthrough", PASSTHROUGH },
+	{ NULL }	/* end of table: name == NULL */
+};
+
+
+/*
+ * Given a numeric suffix, convert the value into a number of bits that the
+ * resulting value must be shifted.
+ */
+static int
+str2shift(const char *buf, char *reason, size_t len)
+{
+	const char *ends = "BKMGTPEZ";
+	int i, c;
+
+	if (buf[0] == '\0')
+		return (0);
+	c = toupper((unsigned char)buf[0]);	/* ctype needs uchar range */
+	for (i = 0; ends[i] != '\0'; i++) {
+		if (c == ends[i])
+			break;
+	}
+	if (ends[i] == '\0') {
+		(void) snprintf(reason, len, dgettext(TEXT_DOMAIN, "invalid "
+		    "numeric suffix '%s'"), buf);
+		return (-1);
+	}
+
+	/*
+	 * We want to allow trailing 'b' characters for 'GB' or 'Mb'.  But don't
+	 * allow 'BB' - that's just weird.
+	 */
+	if (buf[1] == '\0' || (toupper((unsigned char)buf[1]) == 'B' &&
+	    buf[2] == '\0' && c != 'B'))
+		return (10 * i);
+
+	(void) snprintf(reason, len, dgettext(TEXT_DOMAIN, "invalid numeric "
+	    "suffix '%s'"), buf);
+	return (-1);
+}
+
+/*
+ * Convert a string of the form '100G' into a real number.  Used when setting
+ * properties or creating a volume.  'buf' is used to place an extended error
+ * message for the caller to use.
+ */
+static int
+nicestrtonum(const char *value, uint64_t *num, char *buf, size_t buflen)
+{
+	char *end;
+	int shift;
+
+	*num = 0;
+
+	/* Check to see if this looks like a number.  */
+	if ((value[0] < '0' || value[0] > '9') && value[0] != '.') {
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+		    "must be a numeric value"), buflen);
+		return (-1);
+	}
+
+	/* Rely on strtoull() (unsigned, like *num) for the numeric portion. */
+	errno = 0;
+	*num = strtoull(value, &end, 10);
+
+	/*
+	 * Check for ERANGE, which indicates that the value is too large to fit
+	 * in a 64-bit value.
+	 */
+	if (errno == ERANGE) {
+		(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+		    "value is too large"), buflen);
+		return (-1);
+	}
+
+	/*
+	 * If we have a decimal value, then do the computation with floating
+	 * point arithmetic.  Otherwise, use standard arithmetic.
+	 */
+	if (*end == '.') {
+		double fval = strtod(value, &end);
+
+		if ((shift = str2shift(end, buf, buflen)) == -1)
+			return (-1);
+
+		fval *= pow(2, shift);
+		/* As a double UINT64_MAX is 2^64, which is itself too big. */
+		if (fval >= UINT64_MAX) {
+			(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+			    "value is too large"), buflen);
+			return (-1);
+		}
+
+		*num = (uint64_t)fval;
+	} else {
+		if ((shift = str2shift(end, buf, buflen)) == -1)
+			return (-1);
+
+		/* Check for overflow */
+		if (shift >= 64 || (*num << shift) >> shift != *num) {
+			(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+			    "value is too large"), buflen);
+			return (-1);
+		}
+
+		*num <<= shift;
+	}
+
+	return (0);
+}
+
+/*
+ * Public wrapper around nicestrtonum() for callers that only need to know
+ * whether the conversion succeeded, not why it failed.  Returns whatever
+ * nicestrtonum() returns.
+ */
+int
+zfs_nicestrtonum(const char *str, uint64_t *val)
+{
+	/* Minimal scratch buffer; the error text is discarded. */
+	char discard[1];
+	int rc;
+
+	rc = nicestrtonum(str, val, discard, sizeof (discard));
+
+	return (rc);
+}
+
+/*
+ * Given a property type and value, verify that the value is appropriate.  Used
+ * by zfs_prop_set() and some libzfs consumers.
+ *
+ * Returns 0 if 'value' is acceptable for 'prop', or -1 after reporting the
+ * problem through zfs_error().  When 'intval' is non-NULL it receives the
+ * numeric form of the value: 1/0 for booleans, the parsed number for numeric
+ * properties, and the matching table value for index properties.  Properties
+ * that are handled purely as strings here (mountpoint, sharenfs) have no
+ * numeric form, so 0 is stored for them.
+ */
+int
+zfs_prop_validate(zfs_prop_t prop, const char *value, uint64_t *intval)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	/*
+	 * Start from a defined value: the string-valued properties below never
+	 * assign 'number', yet it is still copied out through 'intval'.
+	 */
+	uint64_t number = 0;
+	char reason[64];
+	int i;
+
+	/*
+	 * Check to see if this a read-only property.
+	 */
+	if (zfs_prop_readonly(prop)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot set %s property: read-only property"), propname);
+		return (-1);
+	}
+
+	/* See if the property value is too long */
+	if (strlen(value) >= ZFS_MAXPROPLEN) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "bad %s value '%s': value is too long"), propname,
+		    value);
+		return (-1);
+	}
+
+	/* Perform basic checking based on property type */
+	switch (zfs_prop_get_type(prop)) {
+	case prop_type_boolean:
+		if (strcmp(value, "on") == 0) {
+			number = 1;
+		} else if (strcmp(value, "off") == 0) {
+			number = 0;
+		} else {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "bad %s value '%s': must be 'on' or 'off'"),
+			    propname, value);
+			return (-1);
+		}
+		break;
+
+	case prop_type_number:
+		/* treat 'none' as 0 */
+		if (strcmp(value, "none") == 0) {
+			number = 0;
+			break;
+		}
+
+		if (nicestrtonum(value, &number, reason,
+		    sizeof (reason)) != 0) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "bad %s value '%s': %s"), propname, value,
+			    reason);
+			return (-1);
+		}
+
+		/*
+		 * Don't allow 0 for quota, use 'none' instead.  The literal
+		 * 'none' was already accepted above, so a zero here can only
+		 * have come from a numeric string.
+		 */
+		if (prop == ZFS_PROP_QUOTA && number == 0) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "bad %s value '%s': use '%s=none' to disable"),
+			    propname, value, propname);
+			return (-1);
+		}
+
+		/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+		if (prop == ZFS_PROP_RECORDSIZE ||
+		    prop == ZFS_PROP_VOLBLOCKSIZE) {
+			if (number < SPA_MINBLOCKSIZE ||
+			    number > SPA_MAXBLOCKSIZE || !ISP2(number)) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': "
+				    "must be power of 2 from %u to %uk"),
+				    propname, value,
+				    (uint_t)SPA_MINBLOCKSIZE,
+				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+				return (-1);
+			}
+		}
+
+		break;
+
+	case prop_type_string:
+	case prop_type_index:
+		/*
+		 * The writable string values need special consideration.  The
+		 * 'index' types are specified as strings by the user, but
+		 * passed to the kernel as integers, so each one is looked up
+		 * in its name/value table.
+		 */
+		switch (prop) {
+		case ZFS_PROP_MOUNTPOINT:
+			if (strcmp(value, ZFS_MOUNTPOINT_NONE) == 0 ||
+			    strcmp(value, ZFS_MOUNTPOINT_LEGACY) == 0)
+				break;
+
+			if (value[0] != '/') {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be an absolute "
+				    "path, 'none', or 'legacy'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_CHECKSUM:
+			for (i = 0; checksum_table[i].name != NULL; i++) {
+				if (strcmp(value, checksum_table[i].name)
+				    == 0) {
+					number = checksum_table[i].value;
+					break;
+				}
+			}
+
+			/* Reaching the NULL sentinel means no entry matched */
+			if (checksum_table[i].name == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be 'on', 'off', "
+				    "'fletcher2', 'fletcher4', or 'sha256'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_COMPRESSION:
+			for (i = 0; compress_table[i].name != NULL; i++) {
+				if (strcmp(value, compress_table[i].name)
+				    == 0) {
+					number = compress_table[i].value;
+					break;
+				}
+			}
+
+			if (compress_table[i].name == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be 'on', 'off', "
+				    "or 'lzjb'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_SNAPDIR:
+			for (i = 0; snapdir_table[i].name != NULL; i++) {
+				if (strcmp(value, snapdir_table[i].name) == 0) {
+					number = snapdir_table[i].value;
+					break;
+				}
+			}
+
+			if (snapdir_table[i].name == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be 'hidden' "
+				    "or 'visible'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_ACLMODE:
+			for (i = 0; acl_mode_table[i].name != NULL; i++) {
+				if (strcmp(value, acl_mode_table[i].name)
+				    == 0) {
+					number = acl_mode_table[i].value;
+					break;
+				}
+			}
+
+			if (acl_mode_table[i].name == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be 'discard', "
+				    "'groupmask' or 'passthrough'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_ACLINHERIT:
+			for (i = 0; acl_inherit_table[i].name != NULL; i++) {
+				if (strcmp(value, acl_inherit_table[i].name)
+				    == 0) {
+					number = acl_inherit_table[i].value;
+					break;
+				}
+			}
+
+			if (acl_inherit_table[i].name == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "bad %s value '%s': must be 'discard', "
+				    "'noallow', 'groupmask' or 'passthrough'"),
+				    propname, value);
+				return (-1);
+			}
+			break;
+
+		case ZFS_PROP_SHARENFS:
+			/*
+			 * Nothing to do for 'sharenfs', this gets passed on to
+			 * share(1M) verbatim.
+			 */
+			break;
+
+		default:
+			/* No extra validation for other properties */
+			break;
+		}
+		break;
+	}
+
+	if (intval != NULL)
+		*intval = number;
+
+	return (0);
+}
+
+/*
+ * Given a property name and value, set the property for the given dataset.
+ *
+ * The value is first checked with zfs_prop_validate(), then the property is
+ * checked against the dataset type and (for mountpoint/sharenfs) against the
+ * zone restrictions described below.  The affected datasets are gathered into
+ * a changelist, changelist_prefix() is run, and the property is applied with
+ * the appropriate ioctl().  On success changelist_postfix() is run and the
+ * handle's statistics are refreshed.
+ *
+ * Returns 0 on success.  Returns -1 if a pre-check fails; otherwise returns
+ * the non-zero result of the failing changelist call or ioctl().  Errors are
+ * reported through zfs_error() / zfs_baderror().
+ */
+int
+zfs_prop_set(zfs_handle_t *zhp, zfs_prop_t prop, const char *propval)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	uint64_t number;
+	zfs_cmd_t zc = { 0 };
+	int ret;
+	prop_changelist_t *cl;
+
+	/* 'number' receives the numeric form of 'propval', where one exists */
+	if (zfs_prop_validate(prop, propval, &number) != 0)
+		return (-1);
+
+	/*
+	 * Check to see if the value applies to this type
+	 */
+	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot set %s for '%s': property does not apply to %ss"),
+		    propname, zhp->zfs_name, zfs_type_to_name(zhp->zfs_type));
+		return (-1);
+	}
+
+	/*
+	 * For the mountpoint and sharenfs properties, check if it can be set
+	 * in a global/non-global zone based on the zoned property value:
+	 *
+	 *		global zone	    non-global zone
+	 * -----------------------------------------------------
+	 * zoned=on	mountpoint (no)	    mountpoint (yes)
+	 *		sharenfs (no)	    sharenfs (no)
+	 *
+	 * zoned=off	mountpoint (yes)	N/A
+	 *		sharenfs (yes)
+	 */
+	if (prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS) {
+		if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
+			if (getzoneid() == GLOBAL_ZONEID) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot set %s for '%s', "
+				    "dataset is used in a non-global zone"),
+				    propname, zhp->zfs_name);
+				return (-1);
+			} else if (prop == ZFS_PROP_SHARENFS) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot set %s for '%s', filesystems "
+				    "cannot be shared in a non-global zone"),
+				    propname, zhp->zfs_name);
+				return (-1);
+			}
+		} else if (getzoneid() != GLOBAL_ZONEID) {
+			/*
+			 * If zoned property is 'off', this must be in
+			 * a global zone. If not, something is wrong.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot set %s for '%s', dataset is "
+			    "used in a non-global zone, but 'zoned' "
+			    "property is not set"),
+			    propname, zhp->zfs_name);
+			return (-1);
+		}
+	}
+
+	/*
+	 * Gather the changelist for this property change.  From here on, every
+	 * exit goes through the 'error' label so the changelist is freed.
+	 */
+	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
+		return (-1);
+
+	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot set %s for '%s', "
+			"child dataset with inherited mountpoint is used "
+			"in a non-global zone"),
+			propname, zhp->zfs_name);
+		ret = -1;
+		goto error;
+	}
+
+	if ((ret = changelist_prefix(cl)) != 0)
+		goto error;
+
+	/*
+	 * Execute the corresponding ioctl() to set this property.
+	 */
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	switch (prop) {
+	case ZFS_PROP_QUOTA:
+		zc.zc_cookie = number;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_QUOTA, &zc);
+		break;
+	case ZFS_PROP_RESERVATION:
+		zc.zc_cookie = number;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_RESERVATION, &zc);
+		break;
+	case ZFS_PROP_MOUNTPOINT:
+	case ZFS_PROP_SHARENFS:
+		/*
+		 * These properties are passed down as real strings: one-byte
+		 * elements, counted including the terminating NUL.
+		 */
+		(void) strlcpy(zc.zc_prop_name, propname,
+		    sizeof (zc.zc_prop_name));
+		(void) strlcpy(zc.zc_prop_value, propval,
+		    sizeof (zc.zc_prop_value));
+		zc.zc_intsz = 1;
+		zc.zc_numints = strlen(propval) + 1;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);
+		break;
+	case ZFS_PROP_VOLSIZE:
+		zc.zc_volsize = number;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_VOLSIZE, &zc);
+		break;
+	case ZFS_PROP_VOLBLOCKSIZE:
+		zc.zc_volblocksize = number;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_VOLBLOCKSIZE, &zc);
+		break;
+	default:
+		/*
+		 * Everything else is passed down as a single 8-byte integer
+		 * stored at the start of zc_prop_value.
+		 */
+		(void) strlcpy(zc.zc_prop_name, propname,
+		    sizeof (zc.zc_prop_name));
+		/* LINTED - alignment */
+		*(uint64_t *)zc.zc_prop_value = number;
+		zc.zc_intsz = 8;
+		zc.zc_numints = 1;
+		ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc);
+		break;
+	}
+
+	/*
+	 * NOTE(review): on ioctl() failure changelist_postfix() is not called
+	 * even though changelist_prefix() already ran - confirm that this is
+	 * intentional.
+	 */
+	if (ret != 0) {
+		switch (errno) {
+
+		case EPERM:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot set %s for '%s': permission "
+			    "denied"), propname, zhp->zfs_name);
+			break;
+
+		case ENOENT:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': no such %s"), zhp->zfs_name,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+
+		case ENOSPC:
+			/*
+			 * For quotas and reservations, ENOSPC indicates
+			 * something different; setting a quota or reservation
+			 * doesn't use any disk space.
+			 */
+			switch (prop) {
+			case ZFS_PROP_QUOTA:
+				zfs_error(dgettext(TEXT_DOMAIN, "cannot set %s "
+				    "for '%s': size is less than current "
+				    "used or reserved space"), propname,
+				    zhp->zfs_name);
+				break;
+
+			case ZFS_PROP_RESERVATION:
+				zfs_error(dgettext(TEXT_DOMAIN, "cannot set %s "
+				    "for '%s': size is greater than available "
+				    "space"), propname, zhp->zfs_name);
+				break;
+
+			default:
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot set %s for '%s': out of space"),
+				    propname, zhp->zfs_name);
+				break;
+			}
+			break;
+
+		case EBUSY:
+			if (prop == ZFS_PROP_VOLBLOCKSIZE) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot set %s for '%s': "
+				    "volume already contains data"),
+				    propname, zhp->zfs_name);
+			} else {
+				zfs_baderror(errno);
+			}
+			break;
+
+		case EOVERFLOW:
+			/*
+			 * This platform can't address a volume this big.
+			 */
+#ifdef _ILP32
+			if (prop == ZFS_PROP_VOLSIZE) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot set %s for '%s': "
+				    "max volume size is 1TB on 32-bit systems"),
+				    propname, zhp->zfs_name);
+				break;
+			}
+#endif
+			zfs_baderror(errno);
+			/*
+			 * FALLTHROUGH
+			 *
+			 * NOTE(review): there is no break here, so if
+			 * zfs_baderror() returns, control falls into the
+			 * default case and calls it a second time.  Presumably
+			 * zfs_baderror() does not return - confirm.
+			 */
+		default:
+			zfs_baderror(errno);
+		}
+	} else {
+		/*
+		 * Refresh the statistics so the new property value
+		 * is reflected.
+		 */
+		if ((ret = changelist_postfix(cl)) != 0)
+			goto error;
+
+		(void) get_stats(zhp);
+	}
+
+	/* Common exit: reached on success as well as on failure */
+error:
+	changelist_free(cl);
+	return (ret);
+}
+
+/*
+ * Given a property, inherit the value from the parent dataset.
+ *
+ * The property must be writable, inheritable and valid for the dataset's
+ * type.  The request is sent as a ZFS_IOC_SET_PROP ioctl() with zc_numints
+ * set to 0.  As in zfs_prop_set(), the change is bracketed by
+ * changelist_prefix() and, on success, changelist_postfix().
+ *
+ * Returns 0 on success.  Returns -1 if a pre-check fails; otherwise returns
+ * the non-zero result of the failing changelist call or ioctl().  Errors are
+ * reported through zfs_error() / zfs_baderror().
+ */
+int
+zfs_prop_inherit(zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	zfs_cmd_t zc = { 0 };
+	int ret;
+	prop_changelist_t *cl;
+
+	/*
+	 * Verify that this property is inheritable.
+	 */
+	if (zfs_prop_readonly(prop)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot inherit %s for '%s': property is read-only"),
+		    propname, zhp->zfs_name);
+		return (-1);
+	}
+
+	if (!zfs_prop_inheritable(prop)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot inherit %s for '%s': property is not inheritable"),
+		    propname, zhp->zfs_name);
+		return (-1);
+	}
+
+	/*
+	 * Check to see if the value applies to this type
+	 */
+	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot inherit %s for '%s': property does "
+		    "not apply to %ss"), propname, zhp->zfs_name,
+		    zfs_type_to_name(zhp->zfs_type));
+		return (-1);
+	}
+
+	/* Fill in the dataset and property names for the ioctl() below */
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_prop_name, propname, sizeof (zc.zc_prop_name));
+
+	/*
+	 * From the global zone, refuse to change the mountpoint of a dataset
+	 * whose 'zoned' property is set.
+	 */
+	if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s', "
+		    "dataset is used in a non-global zone"), propname,
+		    zhp->zfs_name);
+		return (-1);
+	}
+
+	/*
+	 * Determine datasets which will be affected by this change, if any.
+	 * From here on, every exit goes through the 'error' label so the
+	 * changelist is freed.
+	 */
+	if ((cl = changelist_gather(zhp, prop, 0)) == NULL)
+		return (-1);
+
+	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s', "
+			"child dataset with inherited mountpoint is "
+			"used in a non-global zone"),
+			propname, zhp->zfs_name);
+		ret = -1;
+		goto error;
+	}
+
+	if ((ret = changelist_prefix(cl)) != 0)
+		goto error;
+
+	/*
+	 * No value is supplied with the request.
+	 * NOTE(review): presumably a zero zc_numints is how ZFS_IOC_SET_PROP
+	 * is told to inherit - confirm against the ioctl implementation.
+	 */
+	zc.zc_numints = 0;
+
+	if ((ret = ioctl(zfs_fd, ZFS_IOC_SET_PROP, &zc)) != 0) {
+		switch (errno) {
+		case EPERM:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot inherit %s for '%s': permission "
+			    "denied"), propname, zhp->zfs_name);
+			break;
+		case ENOENT:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': no such %s"), zhp->zfs_name,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+		case ENOSPC:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot inherit %s for '%s': "
+			    "out of space"), propname, zhp->zfs_name);
+			break;
+		default:
+			zfs_baderror(errno);
+		}
+
+	} else {
+
+		if ((ret = changelist_postfix(cl)) != 0)
+			goto error;
+
+		/*
+		 * Refresh the statistics so the new property is reflected.
+		 */
+		(void) get_stats(zhp);
+	}
+
+
+	/* Common exit: reached on success as well as on failure */
+error:
+	changelist_free(cl);
+	return (ret);
+}
+
+/*
+ * Render a boolean as text: "on" for any non-zero 'value', "off" for zero.
+ * The result is copied into 'buf', which holds 'buflen' bytes.
+ */
+static void
+nicebool(int value, char *buf, size_t buflen)
+{
+	const char *text = value ? "on" : "off";
+
+	(void) strlcpy(buf, text, buflen);
+}
+
+/*
+ * Apply a temporary 'mount -o' override to a boolean property value.
+ *
+ * If 'on_opt' is present in the mount options while 'val' is off, or 'off_opt'
+ * is present while 'val' is on, the overriding value is returned and '*src'
+ * (when non-NULL) is set to ZFS_SRC_TEMPORARY.  Otherwise 'val' is returned
+ * unchanged and '*src' is left alone.
+ */
+static uint64_t
+mntopt_override(struct mnttab *mnt, uint64_t val, char *on_opt, char *off_opt,
+    zfs_source_t *src)
+{
+	if (hasmntopt(mnt, on_opt) && !val) {
+		val = TRUE;
+	} else if (hasmntopt(mnt, off_opt) && val) {
+		val = FALSE;
+	} else {
+		return (val);
+	}
+
+	if (src)
+		*src = ZFS_SRC_TEMPORARY;
+
+	return (val);
+}
+
+/*
+ * Internal function for getting a numeric property.  Both zfs_prop_get() and
+ * zfs_prop_get_int() are built using this interface.
+ *
+ * Certain properties can be overridden using 'mount -o'.  In this case, scan
+ * the mount options recorded in the handle, searching for the appropriate
+ * options.  If they differ from the on-disk values, report the current values
+ * and mark the source "temporary".
+ *
+ * '*source' is set to the property's setpoint string where one is known, and
+ * to NULL otherwise.  'src' may be NULL; it is only written when a mount
+ * option overrides the stored value.
+ */
+static uint64_t
+get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zfs_source_t *src,
+    char **source)
+{
+	struct mnttab mnt;
+
+	*source = NULL;
+
+	/* hasmntopt() needs a string to search, even for unmounted datasets */
+	if (zhp->zfs_mntopts == NULL)
+		mnt.mnt_mntopts = "";
+	else
+		mnt.mnt_mntopts = zhp->zfs_mntopts;
+
+	switch (prop) {
+	case ZFS_PROP_ATIME:
+		/*
+		 * Start from the stored atime setting (not 'devices') and
+		 * return the overridden value, like the other mount-option
+		 * backed properties below.
+		 */
+		*source = zhp->zfs_zplstats.zs_atime_setpoint;
+		return (mntopt_override(&mnt, zhp->zfs_zplstats.zs_atime,
+		    MNTOPT_ATIME, MNTOPT_NOATIME, src));
+
+	case ZFS_PROP_AVAILABLE:
+		return (zhp->zfs_dmustats.dds_available);
+
+	case ZFS_PROP_DEVICES:
+		*source = zhp->zfs_zplstats.zs_devices_setpoint;
+		return (mntopt_override(&mnt, zhp->zfs_zplstats.zs_devices,
+		    MNTOPT_DEVICES, MNTOPT_NODEVICES, src));
+
+	case ZFS_PROP_EXEC:
+		*source = zhp->zfs_zplstats.zs_exec_setpoint;
+		return (mntopt_override(&mnt, zhp->zfs_zplstats.zs_exec,
+		    MNTOPT_EXEC, MNTOPT_NOEXEC, src));
+
+	case ZFS_PROP_RECORDSIZE:
+		*source = zhp->zfs_zplstats.zs_recordsize_setpoint;
+		return (zhp->zfs_zplstats.zs_recordsize);
+
+	case ZFS_PROP_COMPRESSION:
+		*source = zhp->zfs_dmustats.dds_compression_setpoint;
+		return (zhp->zfs_dmustats.dds_compression);
+
+	case ZFS_PROP_READONLY:
+		*source = zhp->zfs_zplstats.zs_readonly_setpoint;
+		return (mntopt_override(&mnt, zhp->zfs_zplstats.zs_readonly,
+		    MNTOPT_RO, MNTOPT_RW, src));
+
+	case ZFS_PROP_QUOTA:
+		if (zhp->zfs_dmustats.dds_quota == 0)
+			*source = "";	/* default */
+		else
+			*source = zhp->zfs_name;
+		return (zhp->zfs_dmustats.dds_quota);
+
+	case ZFS_PROP_RESERVATION:
+		if (zhp->zfs_dmustats.dds_reserved == 0)
+			*source = "";	/* default */
+		else
+			*source = zhp->zfs_name;
+		return (zhp->zfs_dmustats.dds_reserved);
+
+	case ZFS_PROP_COMPRESSRATIO:
+		/*
+		 * Using physical space and logical space, calculate the
+		 * compression ratio.  We return the number as a multiple of
+		 * 100, so '2.5x' would be returned as 250.
+		 */
+		if (zhp->zfs_dmustats.dds_compressed_bytes == 0)
+			return (100ULL);
+		else
+			return (zhp->zfs_dmustats.dds_uncompressed_bytes * 100 /
+			    zhp->zfs_dmustats.dds_compressed_bytes);
+
+	case ZFS_PROP_REFERENCED:
+		/*
+		 * 'referenced' refers to the amount of physical space
+		 * referenced (possibly shared) by this object.
+		 */
+		return (zhp->zfs_dmustats.dds_space_refd);
+
+	case ZFS_PROP_SETUID:
+		*source = zhp->zfs_zplstats.zs_setuid_setpoint;
+		return (mntopt_override(&mnt, zhp->zfs_zplstats.zs_setuid,
+		    MNTOPT_SETUID, MNTOPT_NOSETUID, src));
+
+	case ZFS_PROP_VOLSIZE:
+		return (zhp->zfs_volsize);
+
+	case ZFS_PROP_VOLBLOCKSIZE:
+		return (zhp->zfs_volblocksize);
+
+	case ZFS_PROP_ZONED:
+		*source = zhp->zfs_dmustats.dds_zoned_setpoint;
+		return (zhp->zfs_dmustats.dds_zoned);
+
+	case ZFS_PROP_USED:
+		return (zhp->zfs_dmustats.dds_space_used);
+
+	case ZFS_PROP_CREATETXG:
+		return (zhp->zfs_dmustats.dds_creation_txg);
+
+	case ZFS_PROP_MOUNTED:
+		/*
+		 * Unlike other properties, we defer calculation of 'MOUNTED'
+		 * until actually requested.  This is because the getmntany()
+		 * call can be extremely expensive on systems with a large
+		 * number of filesystems, and the property isn't needed in
+		 * normal use cases.  The mount options found are cached in the
+		 * handle so the lookup is not repeated.
+		 */
+		if (zhp->zfs_mntopts == NULL) {
+			struct mnttab search = { 0 }, entry;
+
+			search.mnt_special = (char *)zhp->zfs_name;
+			rewind(mnttab_file);
+
+			if (getmntany(mnttab_file, &entry, &search) == 0)
+				zhp->zfs_mntopts =
+				    zfs_strdup(entry.mnt_mntopts);
+		}
+		return (zhp->zfs_mntopts != NULL);
+
+	default:
+		zfs_baderror(EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Calculate the source type, given the raw source string.
+ *
+ * Nothing is done when the caller did not ask for source information
+ * ('srctype' or 'statbuf' is NULL) or when the source has already been marked
+ * temporary.  Otherwise '*srctype' is set as follows:
+ *
+ *	source == NULL			ZFS_SRC_NONE
+ *	source is the empty string	ZFS_SRC_DEFAULT
+ *	source is this dataset's name	ZFS_SRC_LOCAL
+ *	anything else			ZFS_SRC_INHERITED, with the source
+ *					name copied into 'statbuf'
+ */
+static void
+get_source(zfs_handle_t *zhp, zfs_source_t *srctype, char *source,
+    char *statbuf, size_t statlen)
+{
+	/*
+	 * Callers may pass a NULL 'srctype'; check it before dereferencing so
+	 * that a non-NULL 'statbuf' alone cannot trigger a NULL dereference.
+	 */
+	if (srctype == NULL || statbuf == NULL ||
+	    *srctype == ZFS_SRC_TEMPORARY)
+		return;
+
+	if (source == NULL) {
+		*srctype = ZFS_SRC_NONE;
+	} else if (source[0] == '\0') {
+		*srctype = ZFS_SRC_DEFAULT;
+	} else if (strcmp(source, zhp->zfs_name) == 0) {
+		*srctype = ZFS_SRC_LOCAL;
+	} else {
+		(void) strlcpy(statbuf, source, statlen);
+		*srctype = ZFS_SRC_INHERITED;
+	}
+}
+
+/*
+ * Retrieve a property from the given object.  If 'literal' is specified, then
+ * numbers are left as exact values.  Otherwise, numbers are converted to a
+ * human-readable form.
+ *
+ * The value is written as a string into 'propbuf' (of size 'proplen').  If
+ * 'src' is non-NULL it is initialized to ZFS_SRC_NONE and then updated by
+ * get_numeric_property() and get_source(); 'statbuf' (of size 'statlen') is
+ * handed to get_source() to receive the name of the dataset a value is
+ * inherited from.
+ *
+ * Returns 0 on success, or -1 on error.  -1 is returned when the property
+ * does not apply to this type of dataset, and for 'origin' when the dataset
+ * has no origin.
+ */
+int
+zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
+    zfs_source_t *src, char *statbuf, size_t statlen, int literal)
+{
+	char *source = NULL;
+	uint64_t val;
+	char *str;
+	int i;
+	const char *root;
+
+	/*
+	 * Check to see if this property applies to our object
+	 */
+	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
+		return (-1);
+
+	if (src)
+		*src = ZFS_SRC_NONE;
+
+	switch (prop) {
+	case ZFS_PROP_ATIME:
+	case ZFS_PROP_READONLY:
+	case ZFS_PROP_SETUID:
+	case ZFS_PROP_ZONED:
+	case ZFS_PROP_DEVICES:
+	case ZFS_PROP_EXEC:
+		/*
+		 * Basic boolean values are built on top of
+		 * get_numeric_property().
+		 */
+		nicebool(get_numeric_property(zhp, prop, src, &source),
+		    propbuf, proplen);
+
+		break;
+
+	case ZFS_PROP_AVAILABLE:
+	case ZFS_PROP_RECORDSIZE:
+	case ZFS_PROP_CREATETXG:
+	case ZFS_PROP_REFERENCED:
+	case ZFS_PROP_USED:
+	case ZFS_PROP_VOLSIZE:
+	case ZFS_PROP_VOLBLOCKSIZE:
+		/*
+		 * Basic numeric values are built on top of
+		 * get_numeric_property().
+		 */
+		val = get_numeric_property(zhp, prop, src, &source);
+		if (literal)
+			(void) snprintf(propbuf, proplen, "%llu", val);
+		else
+			zfs_nicenum(val, propbuf, proplen);
+		break;
+
+	/*
+	 * The index properties below are stored as integers.  Each one is
+	 * mapped back to its name by a linear search of the matching table;
+	 * the assert() fires if the stored value has no table entry.
+	 */
+	case ZFS_PROP_COMPRESSION:
+		for (i = 0; compress_table[i].name != NULL; i++) {
+			if (compress_table[i].value ==
+			    zhp->zfs_dmustats.dds_compression)
+				break;
+		}
+		assert(compress_table[i].name != NULL);
+		(void) strlcpy(propbuf, compress_table[i].name, proplen);
+		source = zhp->zfs_dmustats.dds_compression_setpoint;
+		break;
+
+	case ZFS_PROP_CHECKSUM:
+		for (i = 0; checksum_table[i].name != NULL; i++) {
+			if (checksum_table[i].value ==
+			    zhp->zfs_dmustats.dds_checksum)
+				break;
+		}
+		assert(checksum_table[i].name != NULL);
+		(void) strlcpy(propbuf, checksum_table[i].name, proplen);
+		source = zhp->zfs_dmustats.dds_checksum_setpoint;
+		break;
+
+	case ZFS_PROP_SNAPDIR:
+		for (i = 0; snapdir_table[i].name != NULL; i++) {
+			if (snapdir_table[i].value ==
+			    zhp->zfs_zplstats.zs_snapdir)
+				break;
+		}
+		assert(snapdir_table[i].name != NULL);
+		(void) strlcpy(propbuf, snapdir_table[i].name, proplen);
+		source = zhp->zfs_zplstats.zs_snapdir_setpoint;
+		break;
+
+	case ZFS_PROP_ACLMODE:
+		for (i = 0; acl_mode_table[i].name != NULL; i++) {
+			if (acl_mode_table[i].value ==
+			    zhp->zfs_zplstats.zs_acl_mode)
+				break;
+		}
+		assert(acl_mode_table[i].name != NULL);
+		(void) strlcpy(propbuf, acl_mode_table[i].name, proplen);
+		source = zhp->zfs_zplstats.zs_acl_mode_setpoint;
+		break;
+
+	case ZFS_PROP_ACLINHERIT:
+		for (i = 0; acl_inherit_table[i].name != NULL; i++) {
+			if (acl_inherit_table[i].value ==
+			    zhp->zfs_zplstats.zs_acl_inherit)
+				break;
+		}
+		assert(acl_inherit_table[i].name != NULL);
+		(void) strlcpy(propbuf, acl_inherit_table[i].name, proplen);
+		source = zhp->zfs_zplstats.zs_acl_inherit_setpoint;
+		break;
+
+	case ZFS_PROP_CREATION:
+		/*
+		 * 'creation' is a time_t stored in the statistics.  We convert
+		 * this into a string unless 'literal' is specified.  The raw
+		 * number is also used as a fallback if localtime_r() or
+		 * strftime() fails.
+		 */
+		{
+			time_t time = (time_t)
+			    zhp->zfs_dmustats.dds_creation_time;
+			struct tm t;
+
+			if (literal ||
+			    localtime_r(&time, &t) == NULL ||
+			    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
+			    &t) == 0)
+				(void) snprintf(propbuf, proplen, "%llu",
+				    zhp->zfs_dmustats.dds_creation_time);
+		}
+		break;
+
+	case ZFS_PROP_MOUNTPOINT:
+		/*
+		 * Getting the precise mountpoint can be tricky.
+		 *
+		 *  - for 'none' or 'legacy', return those values.
+		 *  - for default mountpoints, construct it as /zfs/<dataset>
+		 *  - for inherited mountpoints, we want to take everything
+		 *    after our ancestor and append it to the inherited value.
+		 *
+		 * If the pool has an alternate root, we want to prepend that
+		 * root to any values we return.
+		 */
+		root = zhp->zfs_dmustats.dds_altroot;
+
+		if (zhp->zfs_zplstats.zs_mountpoint[0] == '\0') {
+			(void) snprintf(propbuf, proplen, "%s/zfs/%s",
+			    root, zhp->zfs_name);
+		} else if (zhp->zfs_zplstats.zs_mountpoint[0] == '/') {
+			/*
+			 * 'relpath' is our name with the setpoint's name
+			 * stripped from the front.
+			 */
+			const char *relpath = zhp->zfs_name +
+			    strlen(zhp->zfs_zplstats.zs_mountpoint_setpoint);
+			const char *mntpoint = zhp->zfs_zplstats.zs_mountpoint;
+
+			if (relpath[0] == '/')
+				relpath++;
+			/*
+			 * A mountpoint of exactly "/" is reduced to the empty
+			 * string, presumably so that the separator added below
+			 * does not produce a doubled '/'.
+			 */
+			if (mntpoint[1] == '\0')
+				mntpoint++;
+
+			if (relpath[0] == '\0')
+				(void) snprintf(propbuf, proplen, "%s%s",
+				    root, mntpoint);
+			else
+				(void) snprintf(propbuf, proplen, "%s%s%s%s",
+				    root, mntpoint,
+				    relpath[0] == '@' ? "" : "/",
+				    relpath);
+		} else {
+			/* 'legacy' or 'none' */
+			(void) strlcpy(propbuf, zhp->zfs_zplstats.zs_mountpoint,
+			    proplen);
+		}
+
+		source = zhp->zfs_zplstats.zs_mountpoint_setpoint;
+		break;
+
+	case ZFS_PROP_SHARENFS:
+		(void) strlcpy(propbuf, zhp->zfs_zplstats.zs_sharenfs, proplen);
+		source = zhp->zfs_zplstats.zs_sharenfs_setpoint;
+		break;
+
+	case ZFS_PROP_ORIGIN:
+		(void) strlcpy(propbuf, zhp->zfs_dmustats.dds_clone_of,
+		    proplen);
+		/*
+		 * If there is no parent at all, return failure to indicate that
+		 * it doesn't apply to this dataset.
+		 */
+		if (propbuf[0] == '\0')
+			return (-1);
+		break;
+
+	case ZFS_PROP_QUOTA:
+	case ZFS_PROP_RESERVATION:
+		val = get_numeric_property(zhp, prop, src, &source);
+
+		/*
+		 * If quota or reservation is 0, we translate this into 'none'
+		 * (unless literal is set), and indicate that it's the default
+		 * value.  Otherwise, we print the number nicely and indicate
+		 * that its set locally.
+		 */
+		if (val == 0) {
+			if (literal)
+				(void) strlcpy(propbuf, "0", proplen);
+			else
+				(void) strlcpy(propbuf, "none", proplen);
+		} else {
+			if (literal)
+				(void) snprintf(propbuf, proplen, "%llu", val);
+			else
+				zfs_nicenum(val, propbuf, proplen);
+		}
+		break;
+
+	case ZFS_PROP_COMPRESSRATIO:
+		/*
+		 * The ratio arrives as a multiple of 100; print it with two
+		 * decimal places, e.g. 250 becomes "2.50x".
+		 */
+		val = get_numeric_property(zhp, prop, src, &source);
+		(void) snprintf(propbuf, proplen, "%lld.%02lldx", val / 100,
+		    val % 100);
+		break;
+
+	case ZFS_PROP_TYPE:
+		switch (zhp->zfs_type) {
+		case ZFS_TYPE_FILESYSTEM:
+			str = "filesystem";
+			break;
+		case ZFS_TYPE_VOLUME:
+			str = "volume";
+			break;
+		case ZFS_TYPE_SNAPSHOT:
+			str = "snapshot";
+			break;
+		default:
+			/*
+			 * NOTE(review): 'str' stays unset on this path, so the
+			 * snprintf() below relies on zfs_baderror() not
+			 * returning - confirm.
+			 */
+			zfs_baderror(zhp->zfs_type);
+		}
+		(void) snprintf(propbuf, proplen, "%s", str);
+		break;
+
+	case ZFS_PROP_MOUNTED:
+		/*
+		 * The 'mounted' property is a pseudo-property that described
+		 * whether the filesystem is currently mounted.  Even though
+		 * it's a boolean value, the typical values of "on" and "off"
+		 * don't make sense, so we translate to "yes" and "no".
+		 */
+		if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source))
+			(void) strlcpy(propbuf, "yes", proplen);
+		else
+			(void) strlcpy(propbuf, "no", proplen);
+		break;
+
+	case ZFS_PROP_NAME:
+		/*
+		 * The 'name' property is a pseudo-property derived from the
+		 * dataset name.  It is presented as a real property to simplify
+		 * consumers.
+		 */
+		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
+		break;
+
+	default:
+		zfs_baderror(EINVAL);
+	}
+
+	/* Turn the raw setpoint string gathered above into a source type */
+	get_source(zhp, src, source, statbuf, statlen);
+
+	return (0);
+}
+
+/*
+ * Convenience accessor returning a numeric property as a plain integer.  No
+ * check is made that 'prop' really is numeric or applies to this dataset, so
+ * this should only be used with hard-coded property types.  The source
+ * information produced along the way is discarded.
+ */
+uint64_t
+zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	zfs_source_t ignored_type = ZFS_SRC_NONE;
+	char *ignored_source;
+	uint64_t result;
+
+	result = get_numeric_property(zhp, prop, &ignored_type,
+	    &ignored_source);
+
+	return (result);
+}
+
+/*
+ * Numeric counterpart of zfs_prop_get(): the property is stored in '*value'
+ * as an integer rather than formatted as a string.  'src', 'statbuf' and
+ * 'statlen' are passed on to get_numeric_property() and get_source().
+ *
+ * Returns 0 on success, or -1 if the property does not apply to this type of
+ * dataset.
+ */
+int
+zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
+    zfs_source_t *src, char *statbuf, size_t statlen)
+{
+	char *setpoint;
+
+	/* Reject properties that are meaningless for this kind of dataset */
+	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type))
+		return (-1);
+
+	if (src != NULL)
+		*src = ZFS_SRC_NONE;
+
+	*value = get_numeric_property(zhp, prop, src, &setpoint);
+	get_source(zhp, src, setpoint, statbuf, statlen);
+
+	return (0);
+}
+
+/*
+ * Accessor for the dataset name stored in a zfs handle.  The returned pointer
+ * refers to the handle's own storage.
+ */
+const char *
+zfs_get_name(const zfs_handle_t *zhp)
+{
+	const char *name = zhp->zfs_name;
+
+	return (name);
+}
+
+/*
+ * Accessor for the dataset type recorded in a zfs handle.
+ */
+zfs_type_t
+zfs_get_type(const zfs_handle_t *zhp)
+{
+	zfs_type_t type = zhp->zfs_type;
+
+	return (type);
+}
+
+/*
+ * Invoke 'func' on every child of the given dataset: first each child
+ * dataset, then each snapshot.  Iteration stops as soon as 'func' returns
+ * non-zero, and that value is returned; otherwise 0 is returned.
+ */
+int
+zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *child;
+	int rc;
+
+	/*
+	 * Pass 1: child datasets.  zc_name is reset to our own name before
+	 * every ioctl() call.
+	 */
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	while (ioctl(zfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) {
+		/*
+		 * Skip private dataset names.  Handles that cannot be created
+		 * are silently ignored, as the only plausible explanation is
+		 * that the pool has since been removed.
+		 */
+		if (!dataset_name_hidden(zc.zc_name) &&
+		    (child = make_dataset_handle(zc.zc_name)) != NULL) {
+			rc = func(child, data);
+			if (rc != 0)
+				return (rc);
+		}
+
+		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	}
+
+	/*
+	 * The loop ends when the ioctl() fails.  ESRCH means we simply ran out
+	 * of children; ENOENT means the underlying dataset has been removed
+	 * since we obtained the handle.  Anything else is unexpected.
+	 */
+	if (errno != ESRCH && errno != ENOENT)
+		zfs_baderror(errno);
+
+	/* Pass 2: snapshots, starting again from a zeroed command */
+	bzero(&zc, sizeof (zc));
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	while (ioctl(zfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) {
+		if ((child = make_dataset_handle(zc.zc_name)) != NULL) {
+			rc = func(child, data);
+			if (rc != 0)
+				return (rc);
+		}
+
+		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	}
+
+	/*
+	 * Same termination rules as above: ESRCH is normal completion, and
+	 * ENOENT (dataset removed underneath us) is silently treated as
+	 * success.
+	 */
+	if (errno != ESRCH && errno != ENOENT)
+		zfs_baderror(errno);
+
+	return (0);
+}
+
+/*
+ * Given a complete name, copy just the portion that refers to the parent
+ * (everything before the last '/') into 'buf'.
+ *
+ * Returns 0 on success, or -1 if 'path' contains no '/' and therefore names a
+ * pool, which has no parent.  The result is always NUL-terminated when
+ * 'buflen' is non-zero; a parent name that does not fit is truncated to
+ * 'buflen - 1' bytes.
+ */
+static int
+parent_name(const char *path, char *buf, size_t buflen)
+{
+	const char *loc;
+	size_t len;
+
+	if ((loc = strrchr(path, '/')) == NULL)
+		return (-1);
+
+	/* Nothing can be stored, not even the terminator */
+	if (buflen == 0)
+		return (0);
+
+	/*
+	 * Clamp the copy so the terminator always lands inside 'buf'; indexing
+	 * by the unclamped parent length would write past the end of a short
+	 * buffer.
+	 */
+	len = (size_t)(loc - path);
+	if (len >= buflen)
+		len = buflen - 1;
+
+	(void) strncpy(buf, path, len);
+	buf[len] = '\0';
+
+	return (0);
+}
+
+/*
+ * Verify that a dataset about to be created at 'path' has somewhere to live:
+ * the name must include a parent, the pool and the parent dataset must exist,
+ * the parent must be usable from the current zone, and the parent must be a
+ * filesystem.  Returns 0 if all checks pass, or -1 after reporting the problem
+ * through zfs_error().
+ */
+static int
+check_parents(const char *path, zfs_type_t type)
+{
+	zfs_cmd_t zc = { 0 };
+	char parent[ZFS_MAXNAMELEN];
+	char *pool_end;
+	size_t pool_len;
+
+	/* get parent, and check to see if this is just a pool */
+	if (parent_name(path, parent, sizeof (parent)) != 0) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': missing dataset name"),
+		    path, zfs_type_to_name(type));
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "use 'zpool create' to create a storage pool"));
+		return (-1);
+	}
+
+	/*
+	 * The pool name is the first component of the parent, or the whole
+	 * parent when it contains no '/'.  Check to see if the pool exists.
+	 */
+	pool_end = strchr(parent, '/');
+	if (pool_end == NULL)
+		pool_end = parent + strlen(parent);
+	pool_len = pool_end - parent;
+
+	(void) strncpy(zc.zc_name, parent, pool_len);
+	zc.zc_name[pool_len] = '\0';
+
+	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
+	    errno == ENOENT) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': no such pool '%s'"), path, zc.zc_name);
+		return (-1);
+	}
+
+	/* check to see if the parent dataset exists */
+	(void) strlcpy(zc.zc_name, parent, sizeof (zc.zc_name));
+	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
+		if (errno == ENOENT) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': parent does not exist"), path);
+			return (-1);
+		}
+
+		zfs_baderror(errno);
+	}
+
+	/* we are in a non-global zone, but parent is in the global zone */
+	if (!zc.zc_objset_stats.dds_zoned && getzoneid() != GLOBAL_ZONEID) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': permission denied"), path);
+		return (-1);
+	}
+
+	/* make sure parent is a filesystem */
+	if (zc.zc_objset_stats.dds_type != DMU_OST_ZFS) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': parent is not a filesystem"),
+		    path);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Create a new filesystem or volume named 'path'.  'sizestr' and
+ * 'blocksizestr' are used only for volumes.  Returns 0 on success, else -1.
+ */
+int
+zfs_create(const char *path, zfs_type_t type,
+	const char *sizestr, const char *blocksizestr)
+{
+	char reason[64];
+	zfs_cmd_t zc = { 0 };
+	int ret;
+	uint64_t size = 0;
+	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+	/* convert sizestr into integer size */
+	if (sizestr != NULL && nicestrtonum(sizestr, &size,
+	    reason, sizeof (reason)) != 0) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "bad volume size '%s': %s"), sizestr, reason);
+		return (-1);
+	}
+
+	/* convert blocksizestr into integer blocksize */
+	if (blocksizestr != NULL && nicestrtonum(blocksizestr, &blocksize,
+	    reason, sizeof (reason)) != 0) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "bad volume blocksize '%s': %s"), blocksizestr, reason);
+		return (-1);
+	}
+
+	/* make sure the name is not too long */
+	if (strlen(path) >= ZFS_MAXNAMELEN) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': %s name is too long"),
+		    path, zfs_type_to_name(type));
+		return (-1);
+	}
+
+	/* validate the path, taking care to note the extended error message */
+	if (!zfs_validate_name(path, type, reason, sizeof (reason))) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': %s in %s name"), path, reason,
+		    zfs_type_to_name(type));
+		if (strstr(reason, "snapshot") != NULL)
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "use 'zfs snapshot' to create a snapshot"));
+		return (-1);
+	}
+
+	/* validate parents exist */
+	if (check_parents(path, type) != 0)
+		return (-1);
+
+	/*
+	 * The failure modes when creating a dataset of a different type over
+	 * one that already exists are a little strange.  In particular, if
+	 * you try to create a dataset on top of an existing dataset, the
+	 * ioctl() will return ENOENT, not EEXIST.  To prevent this from
+	 * happening, we first check whether the dataset already exists.
+	 */
+	(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
+	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': dataset exists"), path);
+		return (-1);
+	}
+
+	if (type == ZFS_TYPE_VOLUME)
+		zc.zc_objset_type = DMU_OST_ZVOL;
+	else
+		zc.zc_objset_type = DMU_OST_ZFS;
+
+	if (type == ZFS_TYPE_VOLUME) {
+		if (size == 0) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "bad volume size '%s': cannot be zero"), sizestr);
+			return (-1);
+		}
+
+		zc.zc_volsize = size;
+		zc.zc_volblocksize = blocksize;
+	}
+
+	/* create the dataset */
+
+	ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+
+	if (ret == 0 && type == ZFS_TYPE_VOLUME)
+		ret = zvol_create_link(path);
+
+	/* check for failure */
+	if (ret != 0) {
+		char parent[ZFS_MAXNAMELEN];
+		(void) parent_name(path, parent, sizeof (parent));
+		/* NOTE(review): errno may stem from zvol_create_link() here */
+		switch (errno) {
+		case ENOENT:
+			/*
+			 * The parent dataset has been deleted since our
+			 * previous check.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': no such parent '%s'"),
+			    path, parent);
+			break;
+
+		case EPERM:
+			/*
+			 * The user doesn't have permission to create a new
+			 * dataset here.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': permission denied"), path);
+			break;
+
+		case EDQUOT:
+		case ENOSPC:
+			/*
+			 * The parent dataset does not have enough free space
+			 * to create a new dataset.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': not enough space in '%s'"),
+			    path, parent);
+			break;
+
+		case EEXIST:
+			/*
+			 * The target dataset already exists.  We should have
+			 * caught this above, but there may be some unexplained
+			 * race condition.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': dataset exists"), path);
+			break;
+
+		case EINVAL:
+			/*
+			 * The target dataset does not support children.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot create '%s': children unsupported in '%s'"),
+			    path, parent);
+			break;
+
+		case EDOM:
+			zfs_error(dgettext(TEXT_DOMAIN, "bad %s value '%s': "
+			    "must be power of 2 from %u to %uk"),
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+			    blocksizestr ? blocksizestr : "<unknown>",
+			    (uint_t)SPA_MINBLOCKSIZE,
+			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+			break;
+#ifdef _ILP32
+		case EOVERFLOW:
+			/*
+			 * This platform can't address a volume this big.
+			 */
+			if (type == ZFS_TYPE_VOLUME) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "cannot create '%s': "
+				    "max volume size is 1TB on 32-bit systems"),
+				    path);
+				break;
+			}
+#endif
+		/* on ILP32, EOVERFLOW for a non-volume falls through */
+		default:
+			zfs_baderror(errno);
+		}
+
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Destroys the dataset referred to by 'zhp'.  It is up to the caller to make
+ * sure that the filesystem isn't mounted and has no active dependents.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+zfs_destroy(zfs_handle_t *zhp)
+{
+	zfs_cmd_t zc = { 0 };
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	/*
+	 * Test 'zfs_volblocksize' rather than ZFS_TYPE_VOLUME, so that
+	 * snapshots of volumes are handled as volumes too.
+	 */
+	if (zhp->zfs_volblocksize == 0) {
+		zc.zc_objset_type = DMU_OST_ZFS;
+	} else {
+		if (zvol_remove_link(zhp->zfs_name) != 0)
+			return (-1);
+
+		zc.zc_objset_type = DMU_OST_ZVOL;
+	}
+
+	if (ioctl(zfs_fd, ZFS_IOC_DESTROY, &zc) == 0) {
+		remove_mountpoint(zhp);
+		return (0);
+	}
+
+	/*
+	 * The ioctl failed.  Turn errno into a message for the user; anything
+	 * we did not anticipate is handed to zfs_baderror().
+	 */
+	switch (errno) {
+	case EPERM:
+		/*
+		 * We don't have permission to destroy this dataset.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot destroy '%s': permission denied"),
+		    zhp->zfs_name);
+		break;
+
+	case ENOENT:
+		/*
+		 * We lost a race: the dataset has been destroyed since we
+		 * opened it.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot destroy '%s': no such %s"),
+		    zhp->zfs_name, zfs_type_to_name(zhp->zfs_type));
+		break;
+
+	case EBUSY:
+		/*
+		 * Even with every child destroyed, we can still end up here
+		 * if:
+		 *
+		 * 	- A child dataset has since been created
+		 * 	- A filesystem is mounted
+		 *
+		 * The message is poor, but the common cases should already
+		 * have been caught (and reported more helpfully) before this
+		 * function was called, and there is nothing more that can be
+		 * done at this point.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot destroy '%s': %s is busy"),
+		    zhp->zfs_name, zfs_type_to_name(zhp->zfs_type));
+		break;
+
+	default:
+		zfs_baderror(errno);
+	}
+
+	return (-1);
+}
+
+/*
+ * Clones the snapshot 'zhp' to a new dataset named 'target'.  The object set
+ * type (ZFS or ZVOL) is chosen from the source's 'zfs_volblocksize'.
+ */
+int
+zfs_clone(zfs_handle_t *zhp, const char *target)
+{
+	zfs_cmd_t zc = { 0 };
+	char why[64];
+	char parent[ZFS_MAXNAMELEN];
+	int ret;
+
+	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+	/* the clone's name must be a legal filesystem name */
+	if (!zfs_validate_name(target, ZFS_TYPE_FILESYSTEM, why,
+	    sizeof (why))) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': %s in filesystem name"), target,
+		    why, zfs_type_to_name(ZFS_TYPE_FILESYSTEM));
+		return (-1);
+	}
+
+	/* validate parents exist */
+	if (check_parents(target, zhp->zfs_type) != 0)
+		return (-1);
+
+	/* the parent's name is only needed for the messages below */
+	(void) parent_name(target, parent, sizeof (parent));
+
+	/* describe the clone, then create it */
+	(void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_filename, zhp->zfs_name, sizeof (zc.zc_filename));
+	zc.zc_objset_type =
+	    (zhp->zfs_volblocksize != 0) ? DMU_OST_ZVOL : DMU_OST_ZFS;
+
+	ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+	if (ret == 0) {
+		/* a cloned volume needs its device link as well */
+		if (zhp->zfs_volblocksize != 0)
+			ret = zvol_create_link(target);
+		return (ret);
+	}
+
+	switch (errno) {
+	case EPERM:
+		/*
+		 * The user doesn't have permission to create the clone.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': permission denied"),
+		    target);
+		break;
+
+	case ENOENT:
+		/*
+		 * The parent was verified above, so this is most likely a
+		 * race with something that has since destroyed it.
+		 *
+		 * Note that we cannot tell here whether it is the source or
+		 * the target's parent that no longer exists.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': no such parent '%s'"),
+		    target, parent);
+		break;
+
+	case EDQUOT:
+	case ENOSPC:
+		/*
+		 * There is not enough space in the target dataset
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': not enough space in '%s'"),
+		    target, parent);
+		break;
+
+	case EEXIST:
+		/*
+		 * The target already exists.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': dataset exists"), target);
+		break;
+
+	case EXDEV:
+		/*
+		 * The source and target pools differ.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+		    "source and target pools differ"), target);
+		break;
+
+	default:
+		zfs_baderror(errno);
+	}
+
+	return (ret);
+}
+
+/*
+ * Takes a snapshot.  'path' is the full name of the snapshot to create.
+ */
+int
+zfs_snapshot(const char *path)
+{
+	char reason[64];
+	const char *delim;
+	char *parent;
+	zfs_handle_t *zhp;
+	zfs_cmd_t zc = { 0 };
+	int ret;
+
+	/* validate the snapshot name */
+	if (!zfs_validate_name(path, ZFS_TYPE_SNAPSHOT, reason,
+	    sizeof (reason))) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot snapshot '%s': %s in snapshot name"), path,
+		    reason);
+		return (-1);
+	}
+
+	/* make sure we have a snapshot */
+	if ((delim = strchr(path, '@')) == NULL) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot snapshot '%s': missing '@' delim in snapshot "
+		    "name"), path);
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "use 'zfs create' to create a filesystem"));
+		return (-1);
+	}
+
+	/* make sure the parent exists and is of the appropriate type */
+	parent = zfs_malloc(delim - path + 1);
+	(void) strncpy(parent, path, delim - path);
+	parent[delim - path] = '\0';
+
+	if ((zhp = zfs_open(parent, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) == NULL) {
+		free(parent);
+		return (-1);
+	}
+
+	(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
+
+	if (zhp->zfs_type == ZFS_TYPE_VOLUME)
+		zc.zc_objset_type = DMU_OST_ZVOL;
+	else
+		zc.zc_objset_type = DMU_OST_ZFS;
+
+	ret = ioctl(zfs_fd, ZFS_IOC_CREATE, &zc);
+	/* a volume snapshot needs a device link; destroy it if that fails */
+	if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		ret = zvol_create_link(path);
+		if (ret != 0)
+			(void) ioctl(zfs_fd, ZFS_IOC_DESTROY, &zc);
+	}
+	/* NOTE(review): errno may not be from ZFS_IOC_CREATE here - confirm */
+	if (ret != 0) {
+		switch (errno) {
+		case EPERM:
+			/*
+			 * User doesn't have permission to create a snapshot
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "permission denied"), path);
+			break;
+
+		case EDQUOT:
+		case ENOSPC:
+			/*
+			 * Out of space in parent.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "not enough space in '%s'"), path, parent);
+			break;
+
+		case EEXIST:
+			/*
+			 * Snapshot already exists.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "snapshot exists"), path);
+			break;
+
+		case ENOENT:
+			/*
+			 * Shouldn't happen because we verified the parent
+			 * above.  But there may be a race condition where it
+			 * has since been removed.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': "
+			    "no such %s"), parent,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+	}
+
+	free(parent);
+	zfs_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * Write a backup stream of the snapshot 'zhp_to' to stdout.  If 'zhp_from' is
+ * not NULL, the stream is incremental from that snapshot.  Returns the
+ * result of the ioctl(): 0 on success, non-zero after reporting an error.
+ */
+int
+zfs_backup(zfs_handle_t *zhp_to, zfs_handle_t *zhp_from)
+{
+	zfs_cmd_t zc = { 0 };
+	int ret;
+
+	/* the (optional) incremental source travels in zc_prop_value */
+	(void) strlcpy(zc.zc_name, zhp_to->zfs_name, sizeof (zc.zc_name));
+	if (zhp_from) {
+		(void) strlcpy(zc.zc_prop_value, zhp_from->zfs_name,
+		    sizeof (zc.zc_prop_value));
+	} else {
+		zc.zc_prop_value[0] = '\0';
+	}
+	zc.zc_cookie = STDOUT_FILENO;
+
+	ret = ioctl(zfs_fd, ZFS_IOC_SENDBACKUP, &zc);
+	if (ret != 0) {
+		switch (errno) {
+		case EPERM:
+			/*
+			 * User doesn't have permission to do a backup
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot backup '%s': "
+			    "permission denied"), zhp_to->zfs_name);
+			break;
+
+		case EXDEV:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot do incremental backup from %s:\n"
+			    "it is not an earlier snapshot from the "
+			    "same fs as %s"),
+			    zhp_from->zfs_name, zhp_to->zfs_name);
+			break;
+
+		case ENOENT:
+			/*
+			 * A snapshot named in the request could not be found.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot open: "
+			    "no such snapshot"));
+			break;
+
+		case EDQUOT:
+		case EFBIG:
+		case EIO:
+		case ENOLINK:
+		case ENOSPC:
+		case ENOSTR:
+		case ENXIO:
+		case EPIPE:
+		case ERANGE:
+		case EFAULT:
+		case EROFS:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot write backup stream: %s"),
+			    strerror(errno));
+			break;
+
+		case EINTR:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "backup failed: signal received"));
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * Restore a backup stream from stdin.  'tosnap' is the destination, or with
+ * 'isprefix' the filesystem to recreate it beneath.  Returns 0 on success.
+ */
+int
+zfs_restore(const char *tosnap, int isprefix, int verbose, int dryrun)
+{
+	zfs_cmd_t zc = { 0 };
+	time_t begin_time = time(NULL);
+	int err, ioctl_err, bytes = 0;
+	char *cp;
+	dmu_replay_record_t drr;
+	struct drr_begin *drrb = &zc.zc_begin_record;
+
+	/* 'tosnap' is copied into fixed-size name buffers below */
+	if (strlen(tosnap) >= sizeof (drrb->drr_toname)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "Can't restore: destination name is too long"));
+		return (-1);
+	}
+
+	/* trim off snapname, if any */
+	(void) strlcpy(zc.zc_name, tosnap, sizeof (zc.zc_name));
+	cp = strchr(zc.zc_name, '@');
+	if (cp)
+		*cp = '\0';
+
+	/* read in the BEGIN record, allowing for short reads */
+	cp = (char *)&drr;
+	while ((err = read(STDIN_FILENO, cp, sizeof (drr) - bytes)) > 0) {
+		cp += err;
+		bytes += err;
+	}
+	if (err < 0 || bytes != sizeof (drr)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "Can't restore: invalid backup stream "
+		    "(couldn't read first record)"));
+		return (-1);
+	}
+
+	/* the stream is untrusted, so force its name to be terminated */
+	drr.drr_u.drr_begin.drr_toname[sizeof (drrb->drr_toname) - 1] = '\0';
+	zc.zc_begin_record = drr.drr_u.drr_begin;
+
+	if (drrb->drr_magic != DMU_BACKUP_MAGIC &&
+	    drrb->drr_magic != BSWAP_64(DMU_BACKUP_MAGIC)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "Can't restore: invalid backup stream "
+		    "(invalid magic number)"));
+		return (-1);
+	}
+
+	if (drrb->drr_version != DMU_BACKUP_VERSION &&
+	    drrb->drr_version != BSWAP_64(DMU_BACKUP_VERSION)) {
+		if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+			drrb->drr_version = BSWAP_64(drrb->drr_version);
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "Can't restore: only backup version 0x%llx is supported, "
+		    "stream is version %llx."),
+		    DMU_BACKUP_VERSION, drrb->drr_version);
+		return (-1);
+	}
+
+	/* determine name of destination snapshot */
+	(void) strlcpy(drrb->drr_toname, tosnap, sizeof (drrb->drr_toname));
+	if (isprefix) {
+		if (strchr(tosnap, '@') != NULL) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: "
+			    "argument to -d must be a filesystem"));
+			return (-1);
+		}
+
+		cp = strchr(drr.drr_u.drr_begin.drr_toname, '/');
+		if (cp == NULL)
+			cp = drr.drr_u.drr_begin.drr_toname;
+		else
+			cp++;
+
+		/* the result must fit, and must still name a snapshot */
+		if (strchr(cp, '@') == NULL || strlen(tosnap) + 1 +
+		    strlen(cp) >= sizeof (drrb->drr_toname)) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: invalid backup stream "
+			    "(invalid snapshot name)"));
+			return (-1);
+		}
+
+		(void) strcat(drrb->drr_toname, "/");
+		(void) strcat(drrb->drr_toname, cp);
+	} else if (strchr(tosnap, '@') == NULL) {
+		/* just a filesystem was given; append the stream's snapname */
+		cp = strchr(drr.drr_u.drr_begin.drr_toname, '@');
+		if (cp == NULL || strlen(tosnap) + strlen(cp) >= MAXNAMELEN) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: invalid backup stream "
+			    "(invalid snapshot name)"));
+			return (-1);
+		}
+		(void) strcat(drrb->drr_toname, cp);
+	}
+
+	if (drrb->drr_fromguid) {
+		zfs_handle_t *h;
+		/* incremental stream: do the ioctl to the containing fs */
+		(void) strlcpy(zc.zc_name, drrb->drr_toname,
+		    sizeof (zc.zc_name));
+		cp = strchr(zc.zc_name, '@');
+		if (cp != NULL)
+			*cp = '\0';
+
+		/* make sure destination fs exists */
+		h = zfs_open(zc.zc_name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (h == NULL) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore incremental backup: destination\n"
+			    "filesystem %s does not exist"),
+			    zc.zc_name);
+			return (-1);
+		}
+		/* unmount destination fs */
+		if (!dryrun)
+			(void) zfs_unmount(h, NULL, 0);
+		zfs_close(h);
+	} else {
+		/* full stream: do the ioctl to the containing fs's parent */
+		(void) strlcpy(zc.zc_name, drrb->drr_toname,
+		    sizeof (zc.zc_name));
+		cp = strrchr(zc.zc_name, '/');
+		if (cp == NULL) {
+			cp = strchr(zc.zc_name, '@');
+			if (cp)
+				*cp = '\0';
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: destination fs %s already exists"),
+			    zc.zc_name);
+			return (-1);
+		}
+		*cp = '\0';
+
+		if (isprefix) {
+			zfs_handle_t *h;
+
+			/* make sure the prefix exists */
+			h = zfs_open(tosnap, ZFS_TYPE_FILESYSTEM |
+			    ZFS_TYPE_VOLUME);
+			if (h == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN,
+				    "Can't restore: "
+				    "filesystem %s does not exist"), tosnap);
+				return (-1);
+			}
+			zfs_close(h);
+
+			/*
+			 * Create any necessary ancestors up to prefix, and
+			 * put each '/' back so zc_name again names the parent.
+			 */
+			for (cp = zc.zc_name + strlen(tosnap) + 1;
+			    (cp = strchr(cp, '/')) != NULL; *cp++ = '/') {
+				*cp = '\0';
+				if (ioctl(zfs_fd, ZFS_IOC_CREATE, &zc) != 0 &&
+				    errno != ENOENT && errno != EEXIST) {
+					zfs_error(dgettext(TEXT_DOMAIN,
+					    "Can't restore: "
+					    "couldn't create ancestor %s"),
+					    zc.zc_name);
+					return (-1);
+				}
+			}
+		}
+	}
+
+	(void) strlcpy(zc.zc_prop_value, tosnap, sizeof (zc.zc_prop_value));
+	zc.zc_cookie = STDIN_FILENO;
+	zc.zc_intsz = isprefix;
+	if (verbose) {
+		(void) printf("%s %s backup of %s into %s\n",
+		    dryrun ? "would restore" : "restoring",
+		    drrb->drr_fromguid ? "incremental" : "full",
+		    drr.drr_u.drr_begin.drr_toname,
+		    zc.zc_begin_record.drr_toname);
+		(void) fflush(stdout);
+	}
+	if (dryrun)
+		return (0);
+	ioctl_err = ioctl(zfs_fd, ZFS_IOC_RECVBACKUP, &zc);
+	if (ioctl_err != 0) {
+		switch (errno) {
+		case ENODEV:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Most recent snapshot does not "
+			    "match incremental backup source"));
+			break;
+		case ETXTBSY:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Destination has been modified "
+			    "since most recent snapshot.\n"
+			    "Use 'zfs rollback' to discard changes."));
+			break;
+		case EEXIST:
+			if (drrb->drr_fromguid == 0) {
+				/* it's the containing fs that exists */
+				cp = strchr(drrb->drr_toname, '@');
+				*cp = '\0';
+			}
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore to %s: Destination already exists"),
+			    drrb->drr_toname);
+			break;
+		case ENOENT:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Destination does not exist"));
+			break;
+		case EBUSY:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Destination is in use"));
+			break;
+		case ENOSPC:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Out of space"));
+			break;
+		case EDQUOT:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: Quota exceeded"));
+			break;
+		case EINTR:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Restore failed: signal received"));
+			break;
+		case EINVAL:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "Can't restore: invalid backup stream"));
+			break;
+		default:
+			zfs_baderror(errno);
+		}
+	}
+
+	/*
+	 * Mount or recreate the /dev links for the target filesystem.  The
+	 * ioctl() result is kept in 'ioctl_err' so that remounting after a
+	 * failed incremental restore cannot turn the failure into success.
+	 */
+	err = 0;
+	cp = strchr(drrb->drr_toname, '@');
+	if (cp && (ioctl_err == 0 || drrb->drr_fromguid)) {
+		zfs_handle_t *h;
+
+		*cp = '\0';
+		h = zfs_open(drrb->drr_toname,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (h) {
+			if (h->zfs_type == ZFS_TYPE_FILESYSTEM)
+				err = zfs_mount(h, NULL, 0);
+			else
+				err = zvol_create_link(h->zfs_name);
+			zfs_close(h);
+		}
+	}
+
+	/*
+	 * If the destination snapshot was also specified, and it was a volume,
+	 * make sure that the appropriate /dev link was created as well.
+	 */
+	if (err == 0 && ioctl_err == 0) {
+		zfs_handle_t *h;
+
+		if (cp)
+			*cp = '@';
+		h = zfs_open(drrb->drr_toname, ZFS_TYPE_SNAPSHOT |
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (h) {
+			if (h->zfs_volblocksize)
+				err = zvol_create_link(h->zfs_name);
+			zfs_close(h);
+		}
+	}
+
+	if (err != 0 || ioctl_err != 0)
+		return (err != 0 ? err : ioctl_err);
+
+	if (verbose) {
+		char buf1[64];
+		char buf2[64];
+		uint64_t nbytes = zc.zc_cookie;
+		time_t delta = time(NULL) - begin_time;
+		if (delta == 0)
+			delta = 1;
+		zfs_nicenum(nbytes, buf1, sizeof (buf1));
+		zfs_nicenum(nbytes/delta, buf2, sizeof (buf2));
+		(void) printf("restored %sb backup in %lu seconds (%sb/sec)\n",
+		    buf1, delta, buf2);
+	}
+	return (0);
+}
+
+/*
+ * Rollback the given dataset to the previous snapshot.  The caller must
+ * verify that such a snapshot exists.  Returns 0 on success, else non-zero.
+ */
+int
+zfs_rollback(zfs_handle_t *zhp)
+{
+	int ret;
+	zfs_cmd_t zc = { 0 };
+
+	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
+	    zhp->zfs_type == ZFS_TYPE_VOLUME);
+
+	if (zhp->zfs_type == ZFS_TYPE_VOLUME &&
+	    zvol_remove_link(zhp->zfs_name) != 0)
+		return (-1);
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	if (zhp->zfs_volblocksize != 0)
+		zc.zc_objset_type = DMU_OST_ZVOL;
+	else
+		zc.zc_objset_type = DMU_OST_ZFS;
+
+	/*
+	 * The caller has verified that there are no newer snapshots, so the
+	 * name can simply be passed on to the ioctl().  A snapshot taken since
+	 * that check is an unlikely race that we do not guard against.
+	 */
+	if ((ret = ioctl(zfs_fd, ZFS_IOC_ROLLBACK, &zc)) != 0) {
+		switch (errno) {
+		case EPERM:
+			/*
+			 * The user doesn't have permission to rollback the
+			 * given dataset.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rollback '%s': "
+			    "permission denied"), zhp->zfs_name);
+			break;
+
+		case EDQUOT:
+		case ENOSPC:
+			/*
+			 * Not enough space to roll back.  Name the parent in
+			 * the message, or the dataset itself if it has none.
+			 */
+			{
+				char parent[ZFS_MAXNAMELEN];
+				const char *name = parent;
+				if (parent_name(zhp->zfs_name, parent,
+				    sizeof (parent)) != 0)
+					name = zhp->zfs_name;
+				zfs_error(dgettext(TEXT_DOMAIN, "cannot "
+				    "rollback '%s': out of space"), name);
+			}
+			break;
+
+		case ENOENT:
+			/*
+			 * The dataset doesn't exist.  This shouldn't happen
+			 * except in race conditions.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rollback '%s': "
+			    "no such %s"), zhp->zfs_name,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+
+		case EBUSY:
+			/*
+			 * The filesystem is busy.  This should have been caught
+			 * by the caller before getting here, but there may be
+			 * an unexpected problem.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rollback '%s': "
+			    "%s is busy"), zhp->zfs_name,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+	} else if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
+		ret = zvol_create_link(zhp->zfs_name);
+	}
+
+	return (ret);
+}
+
+/*
+ * Call 'func' on every dependent of the given dataset.  Dependents are both
+ * hierarchical (children) and data dependents (snapshots and clones); the
+ * list itself is produced by get_dependents() in libzfs_graph.c.  Iteration
+ * stops at the first non-zero return from 'func', which is passed back.
+ */
+int
+zfs_iter_dependents(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	char **names;
+	size_t nnames;
+	zfs_handle_t *dep;
+	int idx;
+	int err = 0;
+
+	names = get_dependents(zhp->zfs_name, &nnames);
+
+	/* names for which no handle can be made are skipped */
+	for (idx = 0; idx < nnames && err == 0; idx++) {
+		dep = make_dataset_handle(names[idx]);
+		if (dep != NULL)
+			err = func(dep, data);
+	}
+
+	for (idx = 0; idx < nnames; idx++)
+		free(names[idx]);
+	free(names);
+
+	return (err);
+}
+
+/*
+ * Renames the dataset 'zhp' to 'target'.  Returns 0 on success.
+ */
+int
+zfs_rename(zfs_handle_t *zhp, const char *target)
+{
+	int ret;
+	zfs_cmd_t zc = { 0 };
+	char reason[64];
+	char *delim;
+	prop_changelist_t *cl;
+	char parent[ZFS_MAXNAMELEN];
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_prop_value, target, sizeof (zc.zc_prop_value));
+
+	/* if we have the same exact name, just return success */
+	if (strcmp(zhp->zfs_name, target) == 0)
+		return (0);
+
+	/*
+	 * Make sure the target name is valid
+	 */
+	if (!zfs_validate_name(target, zhp->zfs_type, reason,
+	    sizeof (reason))) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create '%s': %s in %s name"), target, reason,
+		    zfs_type_to_name(zhp->zfs_type));
+		return (-1);
+	}
+
+	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
+		if ((delim = strchr(target, '@')) == NULL) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot rename to '%s': not a snapshot"), target);
+			return (-1);
+		}
+
+		/*
+		 * Make sure we're renaming within the same dataset.
+		 */
+		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
+		    zhp->zfs_name[delim - target] != '@') {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot rename to '%s': snapshots must be part "
+			    "of same dataset"), target);
+			return (-1);
+		}
+
+		(void) strncpy(parent, target, delim - target);
+		parent[delim - target] = '\0';
+	} else {
+		/* validate parents */
+		if (check_parents(target, zhp->zfs_type) != 0)
+			return (-1);
+
+		(void) parent_name(target, parent, sizeof (parent));
+
+		/* make sure we're in the same pool */
+		verify((delim = strchr(target, '/')) != NULL);
+		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
+		    zhp->zfs_name[delim - target] != '/') {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot rename to '%s': "
+			    "datasets must be within same pool"), target);
+			return (-1);
+		}
+	}
+	/* from the global zone, refuse to rename a dataset marked as zoned */
+	if (getzoneid() == GLOBAL_ZONEID &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot rename %s, "
+		    "dataset is used in a non-global zone"), zhp->zfs_name);
+		return (-1);
+	}
+	/* NOTE(review): this failure returns 1, not -1 like the others */
+	if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0)) == NULL)
+		return (1);
+
+	if (changelist_haszonedchild(cl)) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot rename '%s': child dataset with inherited "
+		    "mountpoint is used in a non-global zone"), zhp->zfs_name);
+		ret = -1;
+		goto error;
+	}
+
+	if ((ret = changelist_prefix(cl)) != 0)
+		goto error;
+
+	if (zhp->zfs_volblocksize != 0)
+		zc.zc_objset_type = DMU_OST_ZVOL;
+	else
+		zc.zc_objset_type = DMU_OST_ZFS;
+
+	if ((ret = ioctl(zfs_fd, ZFS_IOC_RENAME, &zc)) != 0) {
+		switch (errno) {
+		case EPERM:
+			/*
+			 * The user doesn't have permission to rename the
+			 * given dataset.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rename '%s': "
+			    "permission denied"), zhp->zfs_name);
+			break;
+
+		case EDQUOT:
+		case ENOSPC:
+			/*
+			 * Not enough space in the parent dataset.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot "
+			    "rename '%s': not enough space in '%s'"),
+			    zhp->zfs_name, parent);
+			break;
+
+		case ENOENT:
+			/*
+			 * The destination doesn't exist.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rename '%s' "
+			    "to '%s': destination doesn't exist"),
+			    zhp->zfs_name, target);
+			break;
+
+		case EEXIST:
+			/*
+			 * The destination already exists.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rename '%s' "
+			    "to '%s': destination already exists"),
+			    zhp->zfs_name, target);
+			break;
+
+		case EBUSY:
+			/*
+			 * The filesystem is busy.  This should have been caught
+			 * by the caller before getting here, but there may be
+			 * an unexpected problem.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot rename '%s': "
+			    "%s is busy"), zhp->zfs_name,
+			    zfs_type_to_name(zhp->zfs_type));
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+
+		/*
+		 * On failure, we still want to remount any filesystems that
+		 * were previously mounted, so we don't alter the system state.
+		 */
+		(void) changelist_postfix(cl);
+	} else {
+		changelist_rename(cl, zfs_get_name(zhp), target);
+
+		ret = changelist_postfix(cl);
+	}
+
+error:
+	changelist_free(cl);
+	return (ret);
+}
+
+/*
+ * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
+ * poke devfsadm to create the /dev link, and then wait for the link to appear.
+ * Returns 0 on success, which includes the minor node already existing, and
+ * -1 on failure.
+ */
+int
+zvol_create_link(const char *dataset)
+{
+	zfs_cmd_t zc = { 0 };
+	di_devlink_handle_t hdl;
+
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+
+	/*
+	 * Issue the appropriate ioctl.
+	 */
+	if (ioctl(zfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
+		/*
+		 * Silently ignore the case where the link already exists, so
+		 * that 'zfs volinit' can be run repeatedly without errors.
+		 */
+		if (errno == EEXIST)
+			return (0);
+
+		if (errno == EPERM) {
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create "
+			    "device links for '%s': permission denied"),
+			    dataset);
+		} else {
+			zfs_baderror(errno);
+		}
+
+		return (-1);
+	}
+
+	/*
+	 * Call devfsadm and wait for the links to appear.  If that fails,
+	 * remove the minor node again.
+	 */
+	hdl = di_devlink_init(ZFS_DRIVER, DI_MAKE_LINK);
+	if (hdl == NULL) {
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot create device links for '%s'"), dataset);
+		(void) ioctl(zfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
+		return (-1);
+	}
+
+	(void) di_devlink_fini(&hdl);
+
+	return (0);
+}
+
+/*
+ * Remove a minor node for the given zvol and the associated /dev links.
+ * Returns 0 on success (or if the node is already gone) and -1 on failure.
+ */
+int
+zvol_remove_link(const char *dataset)
+{
+	zfs_cmd_t zc = { 0 };
+
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+
+	if (ioctl(zfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+	case ENXIO:
+		/*
+		 * Silently ignore the case where the link no longer exists,
+		 * so that 'zfs volfini' can be run multiple times without
+		 * errors.
+		 */
+		return (0);
+
+	case EPERM:
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot remove "
+		    "device links for '%s': permission denied"),
+		    dataset);
+		break;
+
+	case EBUSY:
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot remove "
+		    "device links for '%s': volume is in use"),
+		    dataset);
+		break;
+
+	default:
+		zfs_baderror(errno);
+	}
+
+	return (-1);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_graph.c b/usr/src/lib/libzfs/common/libzfs_graph.c
new file mode 100644
index 000000000000..65b115879b67
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_graph.c
@@ -0,0 +1,527 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Iterate over all children of the current object.  This includes the normal
+ * dataset hierarchy, but also arbitrary hierarchies due to clones.  We want to
+ * walk all datasets in the pool, and construct a directed graph of the form:
+ *
+ * 			home
+ *                        |
+ *                   +----+----+
+ *                   |         |
+ *                   v         v             ws
+ *                  bar       baz             |
+ *                             |              |
+ *                             v              v
+ *                          @yesterday ----> foo
+ *
+ * In order to construct this graph, we have to walk every dataset in the pool,
+ * because the clone parent is stored as a property of the child, not the
+ * parent.  The parent only keeps track of the number of clones.
+ *
+ * In the normal case (without clones) this would be rather expensive.  To avoid
+ * unnecessary computation, we first try a walk of the subtree hierarchy
+ * starting from the initial node.  At each dataset, we construct a node in the
+ * graph and an edge leading from its parent.  If we don't see any snapshots
+ * with a non-zero clone count, then we are finished.
+ *
+ * If we do find a cloned snapshot, then we finish the walk of the current
+ * subtree, but indicate that we need to do a complete walk.  We then perform a
+ * global walk of all datasets, avoiding the subtree we already processed.
+ *
+ * At the end of this, we'll end up with a directed graph of all relevant (and
+ * possibly some irrelevant) datasets in the system.  We need to both find our
+ * limiting subgraph and determine a safe ordering in which to destroy the
+ * datasets.  We do a topological ordering of our graph starting at our target
+ * dataset, and then walk the results in reverse.
+ *
+ * When removing datasets, we want to destroy the snapshots in chronological
+ * order (because this is the most efficient method).  In order to accomplish
+ * this, we store the creation transaction group with each vertex and keep each
+ * vertex's edges sorted according to this value.  The topological sort will
+ * automatically walk the snapshots in the correct order.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+#include "zfs_namecheck.h"
+
+#define	MIN_EDGECOUNT	4
+
+/*
+ * Vertex structure.  Keyed by the dataset name in 'zv_dataset', each vertex
+ * owns a growable array of edges leading to other vertices.
+ */
+struct zfs_edge;
+typedef struct zfs_vertex {
+	char			zv_dataset[ZFS_MAXNAMELEN];
+	struct zfs_vertex	*zv_next;	/* next in hash chain */
+	int			zv_visited;	/* 1 = seen, 2 = sorted */
+	uint64_t		zv_txg;	/* creation txg (0 = unknown) */
+	struct zfs_edge		**zv_edges;	/* outgoing edges */
+	int			zv_edgecount;	/* edges in use */
+	int			zv_edgealloc;	/* slots allocated */
+} zfs_vertex_t;
+
+/*
+ * Edge structure.  Simply maintains a pointer to the destination vertex.  There
+ * is no need to store the source vertex, since we only use edges in the context
+ * of the source vertex.
+ */
+typedef struct zfs_edge {
+	zfs_vertex_t		*ze_dest;	/* vertex pointed to */
+	struct zfs_edge		*ze_next;	/* unused in this file */
+} zfs_edge_t;
+
+#define	ZFS_GRAPH_SIZE		1027	/* this could be dynamic some day */
+
+/*
+ * Graph structure.  Vertices are maintained in a hash indexed by dataset name.
+ */
+typedef struct zfs_graph {
+	zfs_vertex_t		**zg_hash;	/* array of hash chain heads */
+	size_t			zg_size;	/* number of hash buckets */
+	size_t			zg_nvertex;	/* vertices created so far */
+} zfs_graph_t;
+
+/*
+ * Build a new edge whose destination is 'dest'; the caller owns the result.
+ */
+static zfs_edge_t *
+zfs_edge_create(zfs_vertex_t *dest)
+{
+	zfs_edge_t *edge;
+
+	edge = zfs_malloc(sizeof (*edge));
+	edge->ze_dest = dest;
+	return (edge);
+}
+
+/*
+ * Destroy an edge.  Only the edge itself is freed, not the vertex it targets.
+ */
+static void
+zfs_edge_destroy(zfs_edge_t *zep)
+{
+	free(zep);
+}
+
+/*
+ * Create a vertex named 'dataset' with room for MIN_EDGECOUNT edges.
+ */
+static zfs_vertex_t *
+zfs_vertex_create(const char *dataset)
+{
+	zfs_vertex_t *vertex;
+
+	vertex = zfs_malloc(sizeof (*vertex));
+	assert(strlen(dataset) < ZFS_MAXNAMELEN);
+	(void) strlcpy(vertex->zv_dataset, dataset,
+	    sizeof (vertex->zv_dataset));
+
+	vertex->zv_edgealloc = MIN_EDGECOUNT;
+	vertex->zv_edges = zfs_malloc(vertex->zv_edgealloc * sizeof (void *));
+	return (vertex);
+}
+
+/*
+ * Release a vertex, its edge array, and every edge stored in that array.
+ */
+static void
+zfs_vertex_destroy(zfs_vertex_t *zvp)
+{
+	zfs_edge_t **edgep = zvp->zv_edges;
+	zfs_edge_t **end = edgep + zvp->zv_edgecount;
+
+	while (edgep != end)
+		zfs_edge_destroy(*edgep++);
+	free(zvp->zv_edges);
+	free(zvp);
+}
+
+/*
+ * Append an edge from 'zvp' to 'dest', doubling the edge array when full.
+ */
+static void
+zfs_vertex_add_edge(zfs_vertex_t *zvp, zfs_vertex_t *dest)
+{
+	zfs_edge_t *zep = zfs_edge_create(dest);
+
+	if (zvp->zv_edgecount == zvp->zv_edgealloc) {
+		size_t oldsize = zvp->zv_edgealloc * sizeof (void *);
+		zfs_edge_t **grown = zfs_malloc(2 * oldsize);
+
+		bcopy(zvp->zv_edges, grown, oldsize);
+		free(zvp->zv_edges);
+
+		zvp->zv_edges = grown;
+		zvp->zv_edgealloc *= 2;
+	}
+
+	zvp->zv_edges[zvp->zv_edgecount] = zep;
+	zvp->zv_edgecount++;
+}
+
+/* qsort() callback: order edges by their destination's creation txg. */
+static int
+zfs_edge_compare(const void *a, const void *b)
+{
+	const zfs_vertex_t *va = (*(zfs_edge_t **)a)->ze_dest;
+	const zfs_vertex_t *vb = (*(zfs_edge_t **)b)->ze_dest;
+
+	if (va->zv_txg == vb->zv_txg)
+		return (0);
+
+	return (va->zv_txg < vb->zv_txg ? -1 : 1);
+}
+
+/*
+ * Order a vertex's outgoing edges by ascending creation txg of their targets.
+ */
+static void
+zfs_vertex_sort_edges(zfs_vertex_t *zvp)
+{
+	size_t nedges = zvp->zv_edgecount;
+
+	if (nedges != 0)
+		qsort(zvp->zv_edges, nedges, sizeof (void *),
+		    zfs_edge_compare);
+}
+
+/*
+ * Create an empty graph whose hash table has 'size' buckets.  The bucket
+ * count is a parameter so that a future caller can scale the table to the
+ * number of datasets in the pool.
+ */
+static zfs_graph_t *
+zfs_graph_create(size_t size)
+{
+	zfs_graph_t *graph;
+
+	graph = zfs_malloc(sizeof (*graph));
+	graph->zg_hash = zfs_malloc(size * sizeof (*graph->zg_hash));
+	graph->zg_size = size;
+	return (graph);
+}
+
+/*
+ * Tear down a graph.  Each hash chain is drained head-first, destroying
+ * every vertex (and thereby its edges), before the hash table and the graph
+ * structure itself are freed.
+ */
+static void
+zfs_graph_destroy(zfs_graph_t *zgp)
+{
+	size_t bucket;
+	zfs_vertex_t *zvp;
+
+	for (bucket = 0; bucket < zgp->zg_size; bucket++) {
+		while ((zvp = zgp->zg_hash[bucket]) != NULL) {
+			/* unlink the head before freeing it */
+			zgp->zg_hash[bucket] = zvp->zv_next;
+			zfs_vertex_destroy(zvp);
+		}
+	}
+
+	free(zgp->zg_hash);
+	free(zgp);
+}
+
+/*
+ * Map a dataset name to a bucket index.  This is the classic Bernstein k=33
+ * string hash, as found in usr/src/cmd/sgs/tools/common/strhash.c
+ */
+static size_t
+zfs_graph_hash(zfs_graph_t *zgp, const char *str)
+{
+	size_t hash = 5381;
+	const char *cp;
+
+	for (cp = str; *cp != '\0'; cp++)
+		hash = hash * 33 + (int)*cp;
+
+	return (hash % zgp->zg_size);
+}
+
+/*
+ * Return the vertex for 'dataset', inserting a new one at the head of its
+ * hash chain if none exists.  A vertex whose txg is still 0 adopts 'txg'.
+ */
+static zfs_vertex_t *
+zfs_graph_lookup(zfs_graph_t *zgp, const char *dataset, uint64_t txg)
+{
+	zfs_vertex_t **bucket = &zgp->zg_hash[zfs_graph_hash(zgp, dataset)];
+	zfs_vertex_t *zvp;
+
+	for (zvp = *bucket; zvp != NULL; zvp = zvp->zv_next) {
+		if (strcmp(zvp->zv_dataset, dataset) != 0)
+			continue;
+		if (zvp->zv_txg == 0)
+			zvp->zv_txg = txg;
+		return (zvp);
+	}
+
+	zvp = zfs_vertex_create(dataset);
+	zvp->zv_txg = txg;
+	zvp->zv_next = *bucket;
+	*bucket = zvp;
+	zgp->zg_nvertex++;
+	return (zvp);
+}
+
+/*
+ * Record an edge from dataset 'source' to dataset 'dest'.  The source vertex
+ * gets 'zv_visited' set, to show that it has really been seen rather than
+ * merely created as the target of some other edge.  A NULL 'dest' denotes a
+ * lone vertex (i.e. the starting vertex): it is marked but gains no edge.
+ */
+static void
+zfs_graph_add(zfs_graph_t *zgp, const char *source, const char *dest,
+    uint64_t txg)
+{
+	zfs_vertex_t *from, *to;
+
+	from = zfs_graph_lookup(zgp, source, 0);
+	from->zv_visited = 1;
+	if (dest == NULL)
+		return;
+	to = zfs_graph_lookup(zgp, dest, txg);
+	zfs_vertex_add_edge(from, to);
+}
+
+/*
+ * Iterate over all children of the given dataset, adding any vertices as
+ * necessary.  Returns 0 if 'dataset' was already visited or if nothing with a
+ * non-zero clone count was seen below it, and non-zero otherwise.  This is a
+ * simple recursive algorithm - the ZFS namespace typically is very flat.  We
+ * invoke the ioctl() calls directly to avoid the overhead of zfs_open().
+ */
+static int
+iterate_children(zfs_graph_t *zgp, const char *dataset)
+{
+	zfs_cmd_t zc = { 0 };
+	int ret = 0;
+	zfs_vertex_t *zvp;
+
+	/*
+	 * Look up the source vertex, and avoid it if we've seen it before.
+	 */
+	zvp = zfs_graph_lookup(zgp, dataset, 0);
+	if (zvp->zv_visited)
+		return (0);
+
+	for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+	    ioctl(zfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
+	    (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
+
+		/*
+		 * Ignore private dataset names.
+		 */
+		if (dataset_name_hidden(zc.zc_name))
+			continue;
+
+		/*
+		 * Fetch this child's statistics (creation txg, clone origin and
+		 * clone count are used below).  If that fails, presumably the
+		 * dataset has been removed in the meantime, so just skip it.
+		 */
+		if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
+			continue;
+
+		/*
+		 * Add an edge between the parent and the child.
+		 */
+		zfs_graph_add(zgp, dataset, zc.zc_name,
+		    zc.zc_objset_stats.dds_creation_txg);
+
+		/*
+		 * If this dataset has a clone parent, add an appropriate edge.
+		 */
+		if (zc.zc_objset_stats.dds_clone_of[0] != '\0')
+			zfs_graph_add(zgp, zc.zc_objset_stats.dds_clone_of,
+			    zc.zc_name, zc.zc_objset_stats.dds_creation_txg);
+
+		/*
+		 * Recurse into this child's own children and snapshots.
+		 */
+		ret |= iterate_children(zgp, zc.zc_name);
+
+		/*
+		 * Indicate if we found a dataset with a non-zero clone count.
+		 */
+		if (zc.zc_objset_stats.dds_num_clones != 0)
+			ret |= 1;
+	}
+
+	/*
+	 * Now iterate over all snapshots, using a freshly zeroed 'zc'.
+	 */
+	bzero(&zc, sizeof (zc));
+
+	for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+	    ioctl(zfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
+	    (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
+
+		/*
+		 * Fetch this snapshot's statistics (creation txg and clone
+		 * count are used below).  If that fails, presumably it has
+		 * been removed in the meantime, so just skip it.
+		 */
+		if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
+			continue;
+
+		/*
+		 * Add an edge between the parent and the child.
+		 */
+		zfs_graph_add(zgp, dataset, zc.zc_name,
+		    zc.zc_objset_stats.dds_creation_txg);
+
+		/*
+		 * Indicate if we found a dataset with a non-zero clone count.
+		 */
+		if (zc.zc_objset_stats.dds_num_clones != 0)
+			ret |= 1;
+	}
+
+	zvp->zv_visited = 1;
+
+	return (ret);
+}
+
+/*
+ * Construct a complete graph of all necessary vertices.  First, we iterate over
+ * only our object's children.  If we don't find any cloned snapshots, then we
+ * simply return that.  Otherwise, we have to start at the pool root and iterate
+ * over all datasets.
+ */
+static zfs_graph_t *
+construct_graph(const char *dataset)
+{
+	zfs_graph_t *zgp = zfs_graph_create(ZFS_GRAPH_SIZE);
+	zfs_cmd_t zc = { 0 };
+	const char *sep;
+	char *pool;
+
+	/*
+	 * iterate_children() only examines descendants, so check the clone
+	 * count of 'dataset' itself first.  The ioctl result is not checked.
+	 */
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+	(void) ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc);
+
+	if (zc.zc_objset_stats.dds_num_clones == 0 &&
+	    iterate_children(zgp, dataset) == 0) {
+		zfs_graph_add(zgp, dataset, NULL, 0);
+		return (zgp);
+	}
+
+	/* Clones exist: the pool name ends at the first '/' or else '@'. */
+	if ((sep = strchr(dataset, '/')) == NULL)
+		sep = strchr(dataset, '@');
+	if (sep != NULL) {
+		pool = zfs_malloc(sep - dataset + 1);
+		(void) strlcpy(pool, dataset, sep - dataset + 1);
+
+		(void) iterate_children(zgp, pool);
+		zfs_graph_add(zgp, pool, NULL, 0);
+		free(pool);
+	}
+
+	zfs_graph_add(zgp, dataset, NULL, 0);
+	return (zgp);
+}
+
+/*
+ * Depth first topological sort from 'zgv': each name is appended to 'result'
+ * after its successors.  'zv_visited' == 2 marks a vertex as already emitted.
+ */
+static void
+topo_sort(char **result, size_t *idx, zfs_vertex_t *zgv)
+{
+	char *name;
+	int e;
+
+	/* already emitted: nothing to do */
+	if (zgv->zv_visited == 2)
+		return;
+
+	zfs_vertex_sort_edges(zgv);
+	for (e = 0; e < zgv->zv_edgecount; e++)
+		topo_sort(result, idx, zgv->zv_edges[e]->ze_dest);
+
+	/* the recursion above may have emitted this vertex already */
+	if (zgv->zv_visited == 2)
+		return;
+
+	name = zfs_malloc(strlen(zgv->zv_dataset) + 1);
+	(void) strcpy(name, zgv->zv_dataset);
+	result[(*idx)++] = name;
+	zgv->zv_visited = 2;
+}
+
+/*
+ * The only public interface for this file.  Build the dependency graph for
+ * 'dataset', sort it topologically, and return the resulting dataset names
+ * with '*count' set to their number.  The array and every string in it are
+ * allocated here and handed to the caller; the graph itself is destroyed
+ * before returning.
+ */
+char **
+get_dependents(const char *dataset, size_t *count)
+{
+	zfs_graph_t *zgp = construct_graph(dataset);
+	size_t nsorted = 0;
+	char **result;
+
+	/* one slot per vertex is the most the sort can ever emit */
+	result = zfs_malloc(zgp->zg_nvertex * sizeof (char *));
+
+	topo_sort(result, &nsorted, zfs_graph_lookup(zgp, dataset, 0));
+
+	/*
+	 * The starting vertex is always emitted last; it is not strictly a
+	 * dependent, so drop it from the result.
+	 */
+	assert(nsorted > 0);
+	nsorted--;
+	free(result[nsorted]);
+	*count = nsorted;
+
+	zfs_graph_destroy(zgp);
+
+	return (result);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_impl.h b/usr/src/lib/libzfs/common/libzfs_impl.h
new file mode 100644
index 000000000000..3fdd98c9979b
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_impl.h
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_LIBFS_IMPL_H
+#define	_LIBFS_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_acl.h>
+#include <sys/nvpair.h>
+
+#include <libzfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct zfs_handle {	/* NOTE(review): presumably behind zfs_handle_t */
+	char zfs_name[ZFS_MAXNAMELEN];
+	zfs_type_t zfs_type;
+	dmu_objset_stats_t zfs_dmustats;
+	zfs_stats_t zfs_zplstats;
+	uint64_t zfs_volsize;
+	uint64_t zfs_volblocksize;
+	char *zfs_mntopts;
+};
+
+struct zpool_handle {	/* NOTE(review): presumably behind zpool_handle_t */
+	char zpool_name[ZPOOL_MAXNAMELEN];
+	int zpool_state;
+	size_t zpool_config_size;
+	nvlist_t *zpool_config;
+};
+
+void zfs_error(const char *, ...);
+void zfs_fatal(const char *, ...);
+void *zfs_malloc(size_t);
+char *zfs_strdup(const char *);
+void no_memory(void);
+
+#define	zfs_baderror(err)						\
+	(zfs_fatal(dgettext(TEXT_DOMAIN,				\
+	"internal error: unexpected error %d at line %d of %s"),	\
+	(err), (__LINE__), (__FILE__)))
+
+int zfs_fd;	/* NOTE(review): defined, not 'extern', in a header */
+
+char **get_dependents(const char *, size_t *);
+
+FILE *mnttab_file;	/* NOTE(review): header definition, no 'extern' */
+FILE *sharetab_file;	/* NOTE(review): header definition, no 'extern' */
+
+typedef struct prop_changelist prop_changelist_t;
+
+int changelist_prefix(prop_changelist_t *);
+int changelist_postfix(prop_changelist_t *);
+void changelist_rename(prop_changelist_t *, const char *, const char *);
+void changelist_free(prop_changelist_t *);
+prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int);
+int changelist_unshare(prop_changelist_t *);
+int changelist_haszonedchild(prop_changelist_t *);
+
+void remove_mountpoint(zfs_handle_t *);
+
+zfs_handle_t *make_dataset_handle(const char *);
+void set_pool_health(nvlist_t *config);
+
+zpool_handle_t *zpool_open_silent(const char *pool);
+
+int zvol_create_link(const char *dataset);
+int zvol_remove_link(const char *dataset);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBFS_IMPL_H */
diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c
new file mode 100644
index 000000000000..c71bc437f5dc
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_import.c
@@ -0,0 +1,753 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Pool import support functions.
+ *
+ * To import a pool, we rely on reading the configuration information from the
+ * ZFS label of each device.  If we successfully read the label, then we
+ * organize the configuration information in the following hierarchy:
+ *
+ * 	pool guid -> toplevel vdev guid -> label txg
+ *
+ * Duplicate entries matching this same tuple will be discarded.  Once we have
+ * examined every device, we pick the best label txg config for each toplevel
+ * vdev.  We then arrange these toplevel vdevs into a complete pool config, and
+ * update any paths that have changed.  Finally, we attempt to import the pool
+ * using our derived config, and record the results.
+ */
+
+#include <devid.h>
+#include <dirent.h>
+#include <errno.h>
+#include <libintl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include <sys/vdev_impl.h>
+
+#include "libzfs.h"
+#include "libzfs_impl.h"
+
+/*
+ * Intermediate structures used to gather configuration information.
+ */
+typedef struct config_entry {
+	uint64_t		ce_txg;	/* txg recorded in the label */
+	nvlist_t		*ce_config;	/* label config for that txg */
+	struct config_entry	*ce_next;	/* next config, same vdev */
+} config_entry_t;
+
+typedef struct vdev_entry {
+	uint64_t		ve_guid;	/* toplevel vdev guid */
+	config_entry_t		*ve_configs;	/* configs seen, one per txg */
+	struct vdev_entry	*ve_next;	/* next vdev, same pool */
+} vdev_entry_t;
+
+typedef struct pool_entry {
+	uint64_t		pe_guid;	/* pool guid */
+	vdev_entry_t		*pe_vdevs;	/* toplevel vdevs seen */
+	struct pool_entry	*pe_next;	/* next pool */
+} pool_entry_t;
+
+typedef struct name_entry {
+	const char		*ne_name;	/* device path (zfs_strdup) */
+	uint64_t		ne_guid;	/* guid of vdev at that path */
+	struct name_entry	*ne_next;	/* next name entry */
+} name_entry_t;
+
+typedef struct pool_list {
+	pool_entry_t		*pools;	/* pools found so far */
+	name_entry_t		*names;	/* vdev guid -> path list */
+} pool_list_t;
+
+/*
+ * Return the encoded devid for 'path', or NULL.  Free with devid_str_free().
+ */
+static char *
+get_devid(const char *path)
+{
+	ddi_devid_t devid;
+	char *minor = NULL, *ret = NULL;
+	int fd;
+
+	if ((fd = open(path, O_RDONLY)) < 0)
+		return (NULL);
+
+	if (devid_get(fd, &devid) == 0) {
+		if (devid_get_minor_name(fd, &minor) == 0)
+			ret = devid_str_encode(devid, minor);
+		if (minor != NULL)
+			devid_str_free(minor);
+		devid_free(devid);
+	}
+	/* the descriptor is only needed for the queries above */
+	(void) close(fd);
+	return (ret);
+}
+
+/*
+ * Walk a vdev configuration tree.  For every leaf (file or disk) vdev whose
+ * guid appears in 'names', replace the recorded path and then either refresh
+ * the devid or, if none can be computed, remove it.
+ */
+static void
+fix_paths(nvlist_t *nv, name_entry_t *names)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	uint64_t guid;
+	name_entry_t *ne;
+	char *devid;
+
+	/* A vdev with children is interior: recurse, nothing else to fix. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			fix_paths(child[c], names);
+		return;
+	}
+
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
+
+	/*
+	 * Find the name entry for this guid; without one nothing changes.
+	 */
+	ne = names;
+	while (ne != NULL && ne->ne_guid != guid)
+		ne = ne->ne_next;
+	if (ne == NULL)
+		return;
+
+	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, ne->ne_name) == 0);
+
+	devid = get_devid(ne->ne_name);
+	if (devid != NULL) {
+		verify(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) == 0);
+		devid_str_free(devid);
+	} else {
+		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
+	}
+}
+
+/*
+ * Add the given configuration to the list of known devices.  The config is
+ * filed under pool guid -> toplevel vdev guid -> txg, and 'path' is recorded
+ * against the vdev guid.  This function takes ownership of 'config': it is
+ * either stored in the list or freed.
+ */
+static void
+add_config(pool_list_t *pl, const char *path, nvlist_t *config)
+{
+	uint64_t pool_guid, vdev_guid, top_guid, txg;
+	pool_entry_t *pe;
+	vdev_entry_t *ve;
+	config_entry_t *ce;
+	name_entry_t *ne;
+
+	/*
+	 * A label that unpacks cleanly but lacks any of these fields, or whose
+	 * txg is 0, is only half-initialized.  vdev_label_init() writes a
+	 * txg == 0 label so that the device can be identified if the user
+	 * refers to the same disk later on; a failed pool creation leaves
+	 * the label in that state, and it must not be treated as part of a
+	 * valid pool.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+	    &top_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &txg) != 0 || txg == 0) {
+		nvlist_free(config);
+		return;
+	}
+
+	/*
+	 * First, find this pool's entry, creating it the first time the pool
+	 * is seen.  NOTE(review): pe_vdevs is never assigned here, so this
+	 * presumably relies on zfs_malloc() returning zeroed memory - confirm.
+	 */
+	pe = pl->pools;
+	while (pe != NULL && pe->pe_guid != pool_guid)
+		pe = pe->pe_next;
+
+	if (pe == NULL) {
+		pe = zfs_malloc(sizeof (*pe));
+		pe->pe_guid = pool_guid;
+		pe->pe_next = pl->pools;
+		pl->pools = pe;
+	}
+
+	/*
+	 * Second, find this toplevel vdev's entry within the pool, adding it
+	 * if it is missing.
+	 */
+	ve = pe->pe_vdevs;
+	while (ve != NULL && ve->ve_guid != top_guid)
+		ve = ve->ve_next;
+
+	if (ve == NULL) {
+		ve = zfs_malloc(sizeof (*ve));
+		ve->ve_guid = top_guid;
+		ve->ve_next = pe->pe_vdevs;
+		pe->pe_vdevs = ve;
+	}
+
+	/*
+	 * Third, look for a config already recorded with this transaction
+	 * group.  If there is one, ours is a duplicate and is discarded;
+	 * otherwise it joins the vdev's list of known configs.
+	 */
+	ce = ve->ve_configs;
+	while (ce != NULL && ce->ce_txg != txg)
+		ce = ce->ce_next;
+
+	if (ce != NULL) {
+		nvlist_free(config);
+	} else {
+		ce = zfs_malloc(sizeof (*ce));
+		ce->ce_txg = txg;
+		ce->ce_config = config;
+		ce->ce_next = ve->ve_configs;
+		ve->ve_configs = ce;
+	}
+
+	/*
+	 * At this point the config has been dealt with.  The last thing to
+	 * do is record the vdev guid -> path mapping so that the
+	 * configuration can be fixed up as necessary before doing the import.
+	 * This is done even when the config itself was a duplicate.
+	 */
+	ne = zfs_malloc(sizeof (*ne));
+	ne->ne_name = zfs_strdup(path);
+	ne->ne_guid = vdev_guid;
+	ne->ne_next = pl->names;
+	pl->names = ne;
+}
+
+/*
+ * Convert our list of pools into the definitive set of configurations.  We
+ * start by picking the best config for each toplevel vdev.  Once that's done,
+ * we assemble the toplevel vdevs into a full config for the pool.  We make a
+ * pass to fix up any incorrect paths, and then add it to the main list to
+ * return to the user.  The pool, vdev and config entries are freed as we go.
+ */
+static nvlist_t *
+get_configs(pool_list_t *pl)
+{
+	pool_entry_t *pe, *penext;
+	vdev_entry_t *ve, *venext;
+	config_entry_t *ce, *cenext;
+	nvlist_t *ret, *config, *tmp, *nvtop, *nvroot;
+	int config_seen;
+	uint64_t best_txg;
+	char *name, *poolname;
+	zfs_cmd_t zc = { 0 };
+	uint64_t guid;
+	char *packed;
+	size_t len;
+	int err;
+
+	verify(nvlist_alloc(&ret, 0, 0) == 0);
+
+	for (pe = pl->pools; pe != NULL; pe = penext) {
+		uint_t c;
+		uint_t children = 0;
+		uint64_t id;
+		nvlist_t **child = NULL;
+
+		penext = pe->pe_next;
+
+		verify(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+		config_seen = FALSE;
+
+		/*
+		 * Iterate over all toplevel vdevs.  Grab the pool configuration
+		 * from the first one we find, and then go through the rest and
+		 * add them as necessary to the 'vdevs' member of the config.
+		 */
+		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
+			venext = ve->ve_next;
+
+			/*
+			 * Determine the best configuration for this vdev by
+			 * selecting the config with the latest transaction
+			 * group, tracking the highest txg seen so far.
+			 */
+			best_txg = 0;
+			for (ce = ve->ve_configs; ce != NULL;
+			    ce = ce->ce_next) {
+				if (ce->ce_txg > best_txg) {
+					best_txg = ce->ce_txg;
+					tmp = ce->ce_config;
+				}
+			}
+
+			if (!config_seen) {
+				/*
+				 * Copy the relevant pieces of data to the pool
+				 * configuration:
+				 *
+				 * 	pool guid
+				 * 	name
+				 * 	pool state
+				 */
+				uint64_t state;
+
+				verify(nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+				verify(nvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, guid) == 0);
+				verify(nvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+				verify(nvlist_add_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, name) == 0);
+				verify(nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
+				verify(nvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, state) == 0);
+
+				config_seen = TRUE;
+			}
+
+			/*
+			 * Add this top-level vdev to the child array.
+			 */
+			verify(nvlist_lookup_nvlist(tmp,
+			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
+			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+			if (id >= children) {
+				nvlist_t **newchild;
+
+				newchild = zfs_malloc((id + 1) *
+				    sizeof (nvlist_t *));
+
+				for (c = 0; c < children; c++)
+					newchild[c] = child[c];
+
+				free(child);
+				child = newchild;
+				children = id + 1;
+			}
+			verify(nvlist_dup(nvtop, &child[id], 0) == 0);
+
+			/*
+			 * Go through and free all config information.
+			 */
+			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
+				cenext = ce->ce_next;
+
+				nvlist_free(ce->ce_config);
+				free(ce);
+			}
+
+			/*
+			 * This vdev entry is now merged into the config.
+			 */
+			free(ve);
+		}
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) == 0);
+
+		/*
+		 * Look for any missing top-level vdevs.  If this is the case,
+		 * create a faked up 'missing' vdev as a placeholder.  We cannot
+		 * simply compress the child array, because the kernel performs
+		 * certain checks to make sure the vdev IDs match their location
+		 * in the configuration.
+		 */
+		for (c = 0; c < children; c++)
+			if (child[c] == NULL) {
+				nvlist_t *missing;
+				verify(nvlist_alloc(&missing, NV_UNIQUE_NAME,
+				    0) == 0);
+				verify(nvlist_add_string(missing,
+				    ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) == 0);
+				verify(nvlist_add_uint64(missing,
+				    ZPOOL_CONFIG_ID, c) == 0);
+				verify(nvlist_add_uint64(missing,
+				    ZPOOL_CONFIG_GUID, 0ULL) == 0);
+				child[c] = missing;
+			}
+
+		/*
+		 * Put all of this pool's top-level vdevs into a root vdev.
+		 */
+		verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+		verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_ROOT) == 0);
+		verify(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+		verify(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) == 0);
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+		    child, children) == 0);
+
+		for (c = 0; c < children; c++)
+			nvlist_free(child[c]);
+		free(child);
+
+		/*
+		 * Go through and fix up any paths and/or devids based on our
+		 * known list of vdev GUID -> path mappings.
+		 */
+		fix_paths(nvroot, pl->names);
+
+		/*
+		 * Add the root vdev to this pool's configuration.
+		 */
+		verify(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    nvroot) == 0);
+		nvlist_free(nvroot);
+
+		/* Free this pool entry. */
+		free(pe);
+
+		/*
+		 * Determine if this pool is currently active, in which case we
+		 * can't actually import it.
+		 */
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &name) == 0);
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) == 0);
+
+		(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+		if (ioctl(zfs_fd, ZFS_IOC_POOL_GUID, &zc) == 0 &&
+		    guid == zc.zc_pool_guid) {
+			nvlist_free(config);
+			continue;
+		}
+
+		/*
+		 * Try to do the import in order to get vdev state.
+		 */
+		if ((err = nvlist_size(config, &len, NV_ENCODE_NATIVE)) != 0)
+			zfs_baderror(err);
+
+		packed = zfs_malloc(len);
+
+		if ((err = nvlist_pack(config, &packed, &len,
+		    NV_ENCODE_NATIVE, 0)) != 0)
+			zfs_baderror(err);
+
+		/* 'name' lives inside 'config': copy it before the free */
+		poolname = zfs_strdup(name);
+		nvlist_free(config);
+		config = NULL;
+
+		zc.zc_config_src_size = len;
+		zc.zc_config_src = (uint64_t)(uintptr_t)packed;
+
+		zc.zc_config_dst_size = 2 * len;
+		zc.zc_config_dst = (uint64_t)(uintptr_t)
+		    zfs_malloc(zc.zc_config_dst_size);
+
+		while ((err = ioctl(zfs_fd, ZFS_IOC_POOL_TRYIMPORT,
+		    &zc)) != 0 && errno == ENOMEM) {
+			free((void *)(uintptr_t)zc.zc_config_dst);
+			zc.zc_config_dst = (uint64_t)(uintptr_t)
+			    zfs_malloc(zc.zc_config_dst_size);
+		}
+
+		free(packed);
+
+		if (err)
+			zfs_baderror(errno);
+
+		verify(nvlist_unpack((void *)(uintptr_t)zc.zc_config_dst,
+		    zc.zc_config_dst_size, &config, 0) == 0);
+
+		set_pool_health(config);
+
+		/*
+		 * Add this pool to the list of configs.
+		 */
+		verify(nvlist_add_nvlist(ret, poolname, config) == 0);
+
+		free(poolname);
+		nvlist_free(config);
+		free((void *)(uintptr_t)zc.zc_config_dst);
+	}
+
+	return (ret);
+}
+
+/*
+ * Return the offset of label 'l'; 'size' is 64-bit so large devices fit.
+ */
+static uint64_t
+label_offset(uint64_t size, int l)
+{
+	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+	    0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Read the labels from 'fd' one at a time and return the configuration from
+ * the first label that passes validation, or NULL if none does (or if the
+ * descriptor cannot be stat'ed).  The returned nvlist belongs to the caller.
+ */
+nvlist_t *
+zpool_read_label(int fd)
+{
+	struct stat64 statbuf;
+	vdev_label_t *label;
+	nvlist_t *config;
+	uint64_t version, state, txg;
+	int l;
+
+	if (fstat64(fd, &statbuf) == -1)
+		return (NULL);
+
+	label = zfs_malloc(sizeof (*label));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		/*
+		 * A short read or an nvlist that will not unpack simply means
+		 * this copy of the label is unusable; move on to the next.
+		 */
+		if (pread(fd, label, sizeof (*label),
+		    label_offset(statbuf.st_size, l)) != sizeof (*label))
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), &config, 0) != 0)
+			continue;
+
+		/*
+		 * Accept the config only if it has the expected version, a pool
+		 * state no greater than POOL_STATE_EXPORTED and a non-zero txg.
+		 */
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &version) == 0 && version == UBERBLOCK_VERSION &&
+		    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) == 0 && state <= POOL_STATE_EXPORTED &&
+		    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) == 0 && txg != 0) {
+			free(label);
+			return (config);
+		}
+
+		nvlist_free(config);
+	}
+
+	free(label);
+	return (NULL);
+}
+
+/*
+ * Given a list of directories to search, find all pools stored on disk.  This
+ * includes partial pools which are not available to import.  If no args are
+ * given (argc is 0), then the default directory (/dev/dsk) is searched.
+ */
+nvlist_t *
+zpool_find_import(int argc, char **argv)
+{
+	int i;
+	DIR *dirp;
+	struct dirent64 *dp;
+	char path[MAXPATHLEN];
+	struct stat64 statbuf;
+	nvlist_t *ret, *config;
+	static char *default_dir = "/dev/dsk";
+	int fd;
+	pool_list_t pools = { 0 };
+
+	if (argc == 0) {
+		argc = 1;
+		argv = &default_dir;
+	}
+
+	/*
+	 * Go through and read the label configuration information from every
+	 * possible device, organizing the information according to pool GUID
+	 * and toplevel GUID.
+	 */
+	for (i = 0; i < argc; i++) {
+		if (argv[i][0] != '/') {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': must be an absolute path"),
+			    argv[i]);
+			return (NULL);
+		}
+
+		if ((dirp = opendir(argv[i])) == NULL) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot open '%s': %s"), argv[i],
+			    strerror(errno));
+			return (NULL);
+		}
+
+		/* Not MT-safe, but we have no MT consumers of libzfs. */
+		while ((dp = readdir64(dirp)) != NULL) {
+
+			(void) snprintf(path, sizeof (path), "%s/%s",
+			    argv[i], dp->d_name);
+
+			if (stat64(path, &statbuf) != 0)
+				continue;
+
+			/*
+			 * Ignore directories (which includes "." and "..").
+			 */
+			if (S_ISDIR(statbuf.st_mode))
+				continue;
+
+			if ((fd = open64(path, O_RDONLY)) < 0)
+				continue;
+
+			config = zpool_read_label(fd);
+
+			(void) close(fd);
+
+			if (config != NULL)
+				add_config(&pools, path, config);
+		}
+
+		(void) closedir(dirp);
+	}
+
+	ret = get_configs(&pools);
+
+	return (ret);
+}
+
+int
+find_guid(nvlist_t *nv, uint64_t guid)
+{
+	uint64_t tmp;
+	nvlist_t **child;
+	uint_t c, children;
+
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0);
+	if (tmp == guid)
+		return (TRUE);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			if (find_guid(child[c], guid))
+				return (TRUE);
+	}
+
+	return (FALSE);
+}
+
+/*
+ * Determines if the pool is in use.  If so, it returns TRUE and the state of
+ * the pool as well as the name of the pool.  Both strings are allocated and
+ * must be freed by the caller.
+ */
+int
+zpool_in_use(int fd, char **statestr, char **namestr)
+{
+	nvlist_t *config;
+	uint64_t state;
+	char *name;
+	int ret;
+	zfs_cmd_t zc = { 0 };
+	uint64_t guid, vdev_guid;
+	zpool_handle_t *zhp;
+	nvlist_t *pool_config;
+
+	if ((config = zpool_read_label(fd)) == NULL)
+		return (FALSE);
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &state) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &guid) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) == 0);
+
+	switch (state) {
+	case POOL_STATE_EXPORTED:
+		*statestr = zfs_strdup(dgettext(TEXT_DOMAIN, "exported"));
+		*namestr = zfs_strdup(name);
+		ret = TRUE;
+		break;
+
+	case POOL_STATE_ACTIVE:
+		/*
+		 * For an active pool, we have to determine if it's really part
+		 * of an active pool (in which case the pool will exist and the
+		 * guid will be the same), or whether it's part of an active
+		 * pool that was disconnected without being explicitly exported.
+		 *
+		 * We use the direct ioctl() first to avoid triggering an error
+		 * message if the pool cannot be opened.
+		 */
+		(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+		if (ioctl(zfs_fd, ZFS_IOC_POOL_GUID, &zc) == 0 &&
+		    guid == zc.zc_pool_guid) {
+			/*
+			 * Because the device may have been removed while
+			 * offlined, we only report it as active if the vdev is
+			 * still present in the config.  Otherwise, pretend like
+			 * it's not in use.
+			 */
+			ret = FALSE;
+			if ((zhp = zpool_open_canfail(name)) != NULL &&
+			    (pool_config = zpool_get_config(zhp)) != NULL) {
+				nvlist_t *nvroot;
+
+				verify(nvlist_lookup_nvlist(pool_config,
+				    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+				if (find_guid(nvroot, vdev_guid)) {
+					*statestr = zfs_strdup(
+					    dgettext(TEXT_DOMAIN, "active"));
+					*namestr = zfs_strdup(name);
+					ret = TRUE;
+				}
+			}
+			/* release the handle now that we're done with it */
+			if (zhp != NULL)
+				zpool_close(zhp);
+		} else {
+			*statestr = zfs_strdup(dgettext(TEXT_DOMAIN,
+			    "potentially active"));
+			*namestr = zfs_strdup(name);
+			ret = TRUE;
+		}
+		break;
+
+	default:
+		ret = FALSE;
+	}
+
+	nvlist_free(config);
+	return (ret);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_mount.c b/usr/src/lib/libzfs/common/libzfs_mount.c
new file mode 100644
index 000000000000..1f4bec249920
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_mount.c
@@ -0,0 +1,558 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Routines to manage ZFS mounts.  We separate all the nasty routines that have
+ * to deal with the OS.  The main entry points are:
+ *
+ * 	zfs_is_mounted()
+ * 	zfs_mount()
+ * 	zfs_unmount()
+ * 	zfs_unmountall()
+ *
+ * These functions are used by mount and unmount, and when changing a
+ * filesystem's mountpoint.  This file also contains the functions used to
+ * manage sharing filesystems via NFS:
+ *
+ * 	zfs_is_shared()
+ * 	zfs_share()
+ * 	zfs_unshare()
+ * 	zfs_unshareall()
+ */
+
+#include <dirent.h>
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <zone.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+
+
+/*
+ * The following two files are opened as part of zfs_init().  It's OK for the
+ * sharetab to be NULL, but mnttab must always be non-NULL.
+ */
+FILE *mnttab_file;
+FILE *sharetab_file;
+
+/*
+ * Search the sharetab for the given mountpoint, returning TRUE if it is found.
+ */
+static int
+is_shared(const char *mountpoint)
+{
+	char buf[MAXPATHLEN], *tab;
+
+	if (sharetab_file == NULL)
+		return (0);
+
+	(void) fseek(sharetab_file, 0, SEEK_SET);
+
+	while (fgets(buf, sizeof (buf), sharetab_file) != NULL) {
+
+		/* the mountpoint is the first entry on each line */
+		if ((tab = strchr(buf, '\t')) != NULL) {
+			*tab = '\0';
+			if (strcmp(buf, mountpoint) == 0)
+				return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Returns TRUE if the specified directory is empty.  If we can't open the
+ * directory at all, return TRUE so that the mount can fail with a more
+ * informative error message.
+ */
+static int
+dir_is_empty(const char *dirname)
+{
+	DIR *dirp;
+	struct dirent64 *dp;
+
+	if ((dirp = opendir(dirname)) == NULL)
+		return (TRUE);
+
+	while ((dp = readdir64(dirp)) != NULL) {
+
+		if (strcmp(dp->d_name, ".") == 0 ||
+		    strcmp(dp->d_name, "..") == 0)
+			continue;
+
+		(void) closedir(dirp);
+		return (FALSE);
+	}
+
+	(void) closedir(dirp);
+	return (TRUE);
+}
+
+/*
+ * Checks to see if the mount is active.  If the filesystem is mounted, we fill
+ * in 'where' with the current mountpoint, and return 1.  Otherwise, we return
+ * 0.
+ */
+int
+zfs_is_mounted(zfs_handle_t *zhp, char **where)
+{
+	struct mnttab search = { 0 }, entry;
+
+	/*
+	 * Search for the entry in /etc/mnttab.  We don't bother getting the
+	 * mountpoint, as we can just search for the special device.  This will
+	 * also let us find mounts when the mountpoint is 'legacy'.
+	 */
+	search.mnt_special = (char *)zfs_get_name(zhp);
+
+	rewind(mnttab_file);
+	if (getmntany(mnttab_file, &entry, &search) != 0)
+		return (FALSE);
+
+	if (where != NULL)
+		*where = zfs_strdup(entry.mnt_mountp);
+
+	return (TRUE);
+}
+
+/*
+ * Mount the given filesystem.
+ */
+int
+zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
+{
+	struct stat buf;
+	char mountpoint[ZFS_MAXPROPLEN];
+	char mntopts[MNT_LINE_MAX];
+
+	if (options == NULL)
+		mntopts[0] = '\0';
+	else
+		(void) strlcpy(mntopts, options, sizeof (mntopts));
+
+	/* ignore non-filesystems */
+	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, FALSE) != 0)
+		return (0);
+
+	/* return success if there is no mountpoint set */
+	if (strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) == 0 ||
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0)
+		return (0);
+
+	/*
+	 * If the 'zoned' property is set, and we're in the global zone, simply
+	 * return success.
+	 */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
+		char zonename[ZONENAME_MAX];
+		if (getzonenamebyid(getzoneid(), zonename,
+		    sizeof (zonename)) < 0) {
+			zfs_error(dgettext(TEXT_DOMAIN, "internal error: "
+			    "cannot determine current zone"));
+			return (1);
+		}
+
+		if (strcmp(zonename, "global") == 0)
+			return (0);
+	}
+
+	/* Create the directory if it doesn't already exist */
+	if (lstat(mountpoint, &buf) != 0) {
+		if (mkdirp(mountpoint, 0755) != 0) {
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot mount '%s': "
+			    "unable to create mountpoint"), mountpoint);
+			return (1);
+		}
+	}
+
+	/*
+	 * Determine if the mountpoint is empty.  If not, refuse to perform the
+	 * mount.  We don't perform this check if MS_OVERLAY is specified, which
+	 * would defeat the point.  We also avoid this check if 'remount' is
+	 * specified.
+	 */
+	if ((flags & MS_OVERLAY) == 0 &&
+	    strstr(mntopts, MNTOPT_REMOUNT) == NULL &&
+	    !dir_is_empty(mountpoint)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot mount '%s': "
+		    "directory is not empty"), mountpoint);
+		zfs_error(dgettext(TEXT_DOMAIN, "use legacy mountpoint to "
+		    "allow this behavior, or use the -O flag"));
+		return (1);
+	}
+
+	/* perform the mount */
+	if (mount(zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags,
+	    MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) {
+		/*
+		 * Generic errors are nasty, but there are just way too many
+		 * from mount(), and they're well-understood.  We pick a few
+		 * common ones to improve upon.
+		 */
+		switch (errno) {
+		case EBUSY:
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot mount '%s': "
+			    "mountpoint '%s' is busy"), zhp->zfs_name,
+			    mountpoint);
+			break;
+		case EPERM:
+		case EACCES:
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot mount '%s': "
+			    "permission denied"),
+			    zhp->zfs_name);
+			break;
+		default:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot mount '%s': %s"),
+			    mountpoint, strerror(errno));
+			break;
+		}
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Unmount the given filesystem.
+ */
+int
+zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags)
+{
+	struct mnttab search = { 0 }, entry;
+
+	/* check to see if need to unmount the filesystem */
+	search.mnt_special = (char *)zfs_get_name(zhp);
+	rewind(mnttab_file);
+	if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
+	    getmntany(mnttab_file, &entry, &search) == 0)) {
+
+		if (mountpoint == NULL)
+			mountpoint = entry.mnt_mountp;
+
+		/*
+		 * Always unshare the filesystem first.
+		 */
+		if (zfs_unshare(zhp, mountpoint) != 0)
+			return (-1);
+
+		/*
+		 * Try to unmount the filesystem.  There is no reason to try a
+		 * forced unmount because the vnodes will still carry a
+		 * reference to the underlying dataset, so we can't destroy it
+		 * anyway.
+		 *
+		 * In the unmount case, we print out a slightly more informative
+		 * error message, though we'll be relying on the poor error
+		 * semantics from the kernel.
+		 */
+		if (umount2(mountpoint, flags) != 0) {
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot unmount '%s': %s"),
+			    mountpoint, strerror(errno));
+			return (-1);
+		}
+
+		/*
+		 * Don't actually destroy the underlying directory
+		 */
+	}
+
+	return (0);
+}
+
+/*
+ * Unmount this filesystem and any children inheriting the mountpoint property.
+ * To do this, just act like we're changing the mountpoint property, but don't
+ * remount the filesystems afterwards.
+ */
+int
+zfs_unmountall(zfs_handle_t *zhp, int flags)
+{
+	prop_changelist_t *clp;
+	int ret;
+
+	clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, flags);
+	if (clp == NULL)
+		return (-1);
+
+	ret = changelist_prefix(clp);
+	changelist_free(clp);
+
+	return (ret);
+}
+
+/*
+ * Check to see if the filesystem is currently shared.
+ */
+int
+zfs_is_shared(zfs_handle_t *zhp, char **where)
+{
+	char *mountpoint;
+
+	if (!zfs_is_mounted(zhp, &mountpoint))
+		return (FALSE);
+
+	if (is_shared(mountpoint)) {
+		if (where != NULL)
+			*where = mountpoint;
+		else
+			free(mountpoint);
+		return (TRUE);
+	} else {
+		free(mountpoint);
+		return (FALSE);
+	}
+}
+
+/*
+ * Share the given filesystem according to the options in 'sharenfs'.  We rely
+ * on share(1M) to do the dirty work for us.
+ */
+int
+zfs_share(zfs_handle_t *zhp)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char shareopts[ZFS_MAXPROPLEN];
+	char buf[MAXPATHLEN];
+	FILE *fp;
+
+	/* ignore non-filesystems */
+	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM)
+		return (0);
+
+	/* return success if there is no mountpoint set */
+	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+	    mountpoint, sizeof (mountpoint), NULL, NULL, 0, FALSE) != 0 ||
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) == 0 ||
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0)
+		return (0);
+
+	/* return success if there are no share options */
+	if (zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts),
+	    NULL, NULL, 0, FALSE) != 0 ||
+	    strcmp(shareopts, "off") == 0)
+		return (0);
+
+	/*
+	 * If the 'zoned' property is set, simply return success since:
+	 * 1. in a global zone, a dataset should not be shared if it's
+	 *    managed in a local zone.
+	 * 2. in a local zone, NFS server is not available.
+	 */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
+		return (0);
+	}
+
+	/*
+	 * Invoke the share(1M) command.  We always do this, even if it's
+	 * currently shared, as the options may have changed.
+	 */
+	if (strcmp(shareopts, "on") == 0)
+		(void) snprintf(buf, sizeof (buf), "/usr/sbin/share "
+		    "-F nfs \"%s\" 2>&1", mountpoint);
+	else
+		(void) snprintf(buf, sizeof (buf), "/usr/sbin/share "
+		    "-F nfs -o \"%s\" \"%s\" 2>&1", shareopts,
+		    mountpoint);
+
+	if ((fp = popen(buf, "r")) == NULL) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot share '%s': "
+		    "share(1M) failed"), zfs_get_name(zhp));
+		return (-1);
+	}
+
+	/*
+	 * share(1M) should only produce output if there is some kind
+	 * of error.  All output begins with "share_nfs: ", so we trim
+	 * this off to get to the real error.
+	 */
+	if (fgets(buf, sizeof (buf), fp) != NULL) {
+		char *colon = strchr(buf, ':');
+
+		/* strip the trailing newline; safe even if buf is empty */
+		buf[strcspn(buf, "\n")] = '\0';
+
+		if (colon == NULL || colon[1] == '\0')
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot share "
+			    "'%s': share(1M) failed"),
+			    zfs_get_name(zhp));
+		else
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot share "
+			    "'%s': %s"), zfs_get_name(zhp),
+			    colon + 2);
+
+		verify(pclose(fp) != 0);
+		return (-1);
+	}
+
+	verify(pclose(fp) == 0);
+
+	return (0);
+}
+
+/*
+ * Unshare the given filesystem.
+ */
+int
+zfs_unshare(zfs_handle_t *zhp, const char *mountpoint)
+{
+	char buf[MAXPATHLEN];
+	struct mnttab search = { 0 }, entry;
+
+	/* check to see if need to unshare the filesystem */
+	search.mnt_special = (char *)zfs_get_name(zhp);
+	rewind(mnttab_file);
+	if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) &&
+	    getmntany(mnttab_file, &entry, &search) == 0)) {
+
+		if (mountpoint == NULL)
+			mountpoint = entry.mnt_mountp;
+
+		if (is_shared(mountpoint)) {
+			FILE *fp;
+
+			(void) snprintf(buf, sizeof (buf),
+			    "/usr/sbin/unshare  \"%s\" 2>&1",
+			    mountpoint);
+
+			if ((fp = popen(buf, "r")) == NULL) {
+				zfs_error(dgettext(TEXT_DOMAIN, "cannot "
+				    "unshare '%s': unshare(1M) failed"),
+				    zfs_get_name(zhp));
+				return (-1);
+			}
+
+			/*
+			 * unshare(1M) should only produce output if there is
+			 * some kind of error.  All output begins with "unshare
+			 * nfs: ", so we trim this off to get to the real error.
+			 */
+			if (fgets(buf, sizeof (buf), fp) != NULL) {
+				char *colon = strchr(buf, ':');
+
+				/* strip newline; safe if empty */
+				buf[strcspn(buf, "\n")] = '\0';
+
+				if (colon == NULL || colon[1] == '\0')
+					zfs_error(dgettext(TEXT_DOMAIN,
+					    "cannot unshare '%s': unshare(1M) "
+					    "failed"), zfs_get_name(zhp));
+				else
+					zfs_error(dgettext(TEXT_DOMAIN,
+					    "cannot unshare '%s': %s"),
+					    zfs_get_name(zhp), colon + 2);
+
+				verify(pclose(fp) != 0);
+				return (-1);
+			}
+
+			verify(pclose(fp) == 0);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Same as zfs_unmountall(), but for unshares.
+ */
+int
+zfs_unshareall(zfs_handle_t *zhp)
+{
+	prop_changelist_t *clp;
+	int ret;
+
+	clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0);
+	if (clp == NULL)
+		return (-1);
+
+	ret = changelist_unshare(clp);
+	changelist_free(clp);
+
+	return (ret);
+}
+
+/*
+ * Remove the mountpoint associated with the current dataset, if necessary.
+ * We only remove the underlying directory if:
+ *
+ *	- The mountpoint is not 'none' or 'legacy'
+ *	- The mountpoint is non-empty
+ *	- The mountpoint is the default or inherited
+ *	- The 'zoned' property is not set, or we're in a local zone
+ *
+ * Any other directories we leave alone.
+ */
+void
+remove_mountpoint(zfs_handle_t *zhp)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t sourcetype;
+	char zonename[ZONENAME_MAX];
+
+	/* ignore non-filesystems */
+	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), &sourcetype, source, sizeof (source),
+	    FALSE) != 0)
+		return;
+
+	if (getzonenamebyid(getzoneid(), zonename, sizeof (zonename)) < 0)
+		zfs_fatal(dgettext(TEXT_DOMAIN, "internal error: "
+		    "cannot determine current zone"));
+
+	if (strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0 &&
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
+	    (sourcetype == ZFS_SRC_DEFAULT ||
+	    sourcetype == ZFS_SRC_INHERITED) &&
+	    (!zfs_prop_get_int(zhp, ZFS_PROP_ZONED) ||
+	    strcmp(zonename, "global") != 0)) {
+
+		/*
+		 * Try to remove the directory, silently ignoring any errors.
+		 * The filesystem may have since been removed or moved around,
+		 * and this isn't really useful to the administrator in any
+		 * way.
+		 */
+		(void) rmdir(mountpoint);
+	}
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
new file mode 100644
index 000000000000..6b6f381bb160
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -0,0 +1,1154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <devid.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/zfs_ioctl.h>
+
+#include "zfs_namecheck.h"
+#include "libzfs_impl.h"
+
+/*
+ * Validate the given pool name, optionally putting an extended error message in
+ * 'buf'.
+ */
+static int
+zpool_name_valid(const char *pool, char *buf, size_t buflen)
+{
+	namecheck_err_t why;
+	char what;
+
+	if (strlen(pool) >= ZPOOL_MAXNAMELEN) {
+		if (buf)
+			(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+			    "name is too long"), buflen);
+		return (FALSE);
+	}
+
+	if (pool_namecheck(pool, &why, &what) != 0) {
+		if (buf != NULL) {
+			switch (why) {
+			case NAME_ERR_INVALCHAR:
+				(void) snprintf(buf, buflen,
+				    dgettext(TEXT_DOMAIN, "invalid character "
+				    "'%c' in pool name"), what);
+				break;
+
+			case NAME_ERR_NOLETTER:
+				(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+				    "name must begin with a letter"), buflen);
+				break;
+
+			case NAME_ERR_RESERVED:
+				(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+				    "name is reserved\n"
+				    "pool name may have been omitted"), buflen);
+				break;
+
+			case NAME_ERR_DISKLIKE:
+				(void) strlcpy(buf, dgettext(TEXT_DOMAIN,
+				    "pool name is reserved\n"
+				    "pool name may have been omitted"), buflen);
+				break;
+			}
+		}
+		return (FALSE);
+	}
+
+	return (TRUE);
+}
+
+/*
+ * Set the pool-wide health based on the vdev state of the root vdev.
+ */
+void
+set_pool_health(nvlist_t *config)
+{
+	nvlist_t *nvroot;
+	vdev_stat_t *vs;
+	uint_t vsc;
+	char *health;
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
+	switch (vs->vs_state) {
+
+	case VDEV_STATE_CLOSED:
+	case VDEV_STATE_CANT_OPEN:
+	case VDEV_STATE_OFFLINE:
+		health = dgettext(TEXT_DOMAIN, "FAULTED");
+		break;
+
+	case VDEV_STATE_DEGRADED:
+		health = dgettext(TEXT_DOMAIN, "DEGRADED");
+		break;
+
+	case VDEV_STATE_HEALTHY:
+		health = dgettext(TEXT_DOMAIN, "ONLINE");
+		break;
+
+	default:
+		zfs_baderror(vs->vs_state);
+	}
+
+	verify(nvlist_add_string(config, ZPOOL_CONFIG_POOL_HEALTH,
+	    health) == 0);
+}
+
+/*
+ * Open a handle to the given pool, even if the pool is currently in the FAULTED
+ * state.
+ */
+zpool_handle_t *
+zpool_open_canfail(const char *pool)
+{
+	zpool_handle_t *zhp;
+	nvlist_t *newconfig;
+	int error;
+
+	/*
+	 * Make sure the pool name is valid.
+	 */
+	if (!zpool_name_valid(pool, NULL, 0)) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': invalid "
+		    "pool name"), pool);
+		return (NULL);
+	}
+
+	zhp = zfs_malloc(sizeof (zpool_handle_t));
+
+	(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
+
+	if ((error = zpool_refresh_stats(zhp, NULL, &newconfig)) != 0) {
+		if (error == ENOENT || error == EINVAL) {
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': no "
+			    "such pool"), pool);
+			free(zhp);
+			return (NULL);
+		} else {
+			zhp->zpool_state = POOL_STATE_UNAVAIL;
+		}
+	} else {
+		zhp->zpool_state = POOL_STATE_ACTIVE;
+	}
+
+	return (zhp);
+}
+
+/*
+ * Like the above, but silent on error.  Used when iterating over pools (because
+ * the configuration cache may be out of date).
+ */
+zpool_handle_t *
+zpool_open_silent(const char *pool)
+{
+	zpool_handle_t *zhp;
+	nvlist_t *newconfig;
+	int error;
+
+	zhp = zfs_malloc(sizeof (zpool_handle_t));
+
+	(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
+
+	if ((error = zpool_refresh_stats(zhp, NULL, &newconfig)) != 0) {
+		if (error == ENOENT || error == EINVAL) {
+			free(zhp);
+			return (NULL);
+		} else {
+			zhp->zpool_state = POOL_STATE_UNAVAIL;
+		}
+	} else {
+		zhp->zpool_state = POOL_STATE_ACTIVE;
+	}
+
+	return (zhp);
+}
+
+/*
+ * Similar to zpool_open_canfail(), but refuses to open pools in the faulted
+ * state.
+ */
+zpool_handle_t *
+zpool_open(const char *pool)
+{
+	zpool_handle_t *zhp;
+
+	if ((zhp = zpool_open_canfail(pool)) == NULL)
+		return (NULL);
+
+	if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot open '%s': pool is "
+		    "currently unavailable\n"), zhp->zpool_name);
+		zfs_error(dgettext(TEXT_DOMAIN, "run 'zpool status -v %s' for "
+		    "detailed information\n"), zhp->zpool_name);
+		zpool_close(zhp);
+		return (NULL);
+	}
+
+	return (zhp);
+}
+
+/*
+ * Close the handle.  Simply frees the memory associated with the handle.
+ */
+void
+zpool_close(zpool_handle_t *zhp)
+{
+	if (zhp->zpool_config)
+		nvlist_free(zhp->zpool_config);
+	free(zhp);
+}
+
+/*
+ * Return the name of the pool.
+ */
+const char *
+zpool_get_name(zpool_handle_t *zhp)
+{
+	return (zhp->zpool_name);
+}
+
+/*
+ * Return the GUID of the pool.
+ */
+uint64_t
+zpool_get_guid(zpool_handle_t *zhp)
+{
+	uint64_t guid;
+
+	verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID,
+	    &guid) == 0);
+	return (guid);
+}
+
+/*
+ * Return the amount of space currently consumed by the pool.
+ */
+uint64_t
+zpool_get_space_used(zpool_handle_t *zhp)
+{
+	nvlist_t *nvroot;
+	vdev_stat_t *vs;
+	uint_t vsc;
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
+	return (vs->vs_alloc);
+}
+
+/*
+ * Return the total space in the pool.
+ */
+uint64_t
+zpool_get_space_total(zpool_handle_t *zhp)
+{
+	nvlist_t *nvroot;
+	vdev_stat_t *vs;
+	uint_t vsc;
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
+	return (vs->vs_space);
+}
+
+/*
+ * Return the alternate root for this pool, if any.
+ */
+int
+zpool_get_root(zpool_handle_t *zhp, char *buf, size_t buflen)
+{
+	zfs_cmd_t zc = { 0 };
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (ioctl(zfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0 ||
+	    zc.zc_objset_stats.dds_altroot[0] == '\0')
+		return (-1);
+
+	(void) strlcpy(buf, zc.zc_objset_stats.dds_altroot, buflen);
+
+	return (0);
+}
+
+/*
+ * Return the state of the pool (ACTIVE or UNAVAILABLE)
+ */
+int
+zpool_get_state(zpool_handle_t *zhp)
+{
+	return (zhp->zpool_state);
+}
+
+/*
+ * Create the named pool, using the provided vdev list.  It is assumed
+ * that the consumer has already validated the contents of the nvlist, so we
+ * don't have to worry about error semantics.
+ */
+int
+zpool_create(const char *pool, nvlist_t *nvroot, const char *altroot)
+{
+	zfs_cmd_t zc = { 0 };
+	char *packed;
+	size_t len;
+	int err;
+	char reason[64];
+
+	if (!zpool_name_valid(pool, reason, sizeof (reason))) {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': %s"),
+		    pool, reason);
+		return (-1);
+	}
+
+	if (altroot != NULL && altroot[0] != '/') {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': alternate "
+		    "root '%s' must be a complete path"), pool, altroot);
+		return (-1);
+	}
+
+	if ((err = nvlist_size(nvroot, &len, NV_ENCODE_NATIVE)) != 0)
+		zfs_baderror(err);
+
+	packed = zfs_malloc(len);
+
+	if ((err = nvlist_pack(nvroot, &packed, &len,
+	    NV_ENCODE_NATIVE, 0)) != 0)
+		zfs_baderror(err);
+
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_config_src = (uint64_t)(uintptr_t)packed;
+	zc.zc_config_src_size = len;
+
+	if (altroot != NULL)
+		(void) strlcpy(zc.zc_root, altroot, sizeof (zc.zc_root));
+
+	if (ioctl(zfs_fd, ZFS_IOC_POOL_CREATE, &zc) != 0) {
+		switch (errno) {
+		case EEXIST:
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "pool exists"), pool);
+			break;
+
+		case EPERM:
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "permission denied"), pool);
+			break;
+
+		case EBUSY:
+			/*
+			 * This can happen if the user has specified the same
+			 * device multiple times.  We can't reliably detect this
+			 * until we try to add it and see we already have a
+			 * label.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "one or more vdevs refer to the same device"),
+			    pool);
+			break;
+
+		case EOVERFLOW:
+			/*
+			 * This occurs when one of the devices is below
+			 * SPA_MINDEVSIZE.  Unfortunately, we can't detect which
+			 * device was the problem device since there's no
+			 * reliable way to determine device size from userland.
+			 */
+			{
+				char buf[64];
+
+				zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
+
+				zfs_error(dgettext(TEXT_DOMAIN, "cannot "
+				    "create '%s': one or more devices is less "
+				    "than the minimum size (%s)"), pool,
+				    buf);
+			}
+			break;
+
+		case ENAMETOOLONG:
+			/*
+			 * One of the vdevs has exceeded VDEV_SPEC_MAX length in
+			 * its plaintext representation.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "too many devices in a single vdev"), pool);
+			break;
+
+		case EIO:
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "I/O error on one or more devices"), pool);
+			break;
+
+		case ENXIO:
+			/*
+			 * This is unlikely to happen since we've verified that
+			 * all the devices can be opened from userland, but it's
+			 * still possible in some circumstances.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "one or more devices is unavailable"), pool);
+			break;
+
+		case ENOSPC:
+			/*
+			 * This can occur if we were incapable of writing to a
+			 * file vdev because the underlying filesystem is out of
+			 * space.  This is very similar to EOVERFLOW, but we'll
+			 * produce a slightly different message.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot create '%s': "
+			    "one or more devices is out of space"), pool);
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+		free(packed);
+		return (-1);
+	}
+
+	free(packed);
+
+	/*
+	 * If this is an alternate root pool, then we automatically set the
+	 * mountpoint of the root dataset to be '/'.
+	 */
+	if (altroot != NULL) {
+		zfs_handle_t *zhp;
+
+		verify((zhp = zfs_open(pool, ZFS_TYPE_ANY)) != NULL);
+		verify(zfs_prop_set(zhp, ZFS_PROP_MOUNTPOINT, "/") == 0);
+
+		zfs_close(zhp);
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy the given pool.  It is up to the caller to ensure that there are no
+ * datasets left in the pool.
+ *
+ * If the pool is active, its top-level dataset is opened up front so that
+ * remove_mountpoint() can be called on it once the pool has been destroyed.
+ *
+ * Returns 0 on success and -1 on failure.  A failed ioctl is reported through
+ * zfs_error(), or zfs_baderror() for an errno we don't expect.
+ */
+int
+zpool_destroy(zpool_handle_t *zhp)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *zfp = NULL;
+
+	if (zhp->zpool_state == POOL_STATE_ACTIVE &&
+	    (zfp = zfs_open(zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (-1);
+
+	/*
+	 * zpool_remove_zvol_links() returns an int, so test it against 0 (not
+	 * NULL), and release the dataset handle before bailing out.
+	 */
+	if (zpool_remove_zvol_links(zhp) != 0) {
+		if (zfp != NULL)
+			zfs_close(zfp);
+		return (-1);
+	}
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	if (ioctl(zfs_fd, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+		switch (errno) {
+		case EPERM:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot destroy '%s': permission denied"),
+			    zhp->zpool_name);
+			break;
+
+		case EBUSY:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot destroy '%s': pool busy"),
+			    zhp->zpool_name);
+			break;
+
+		case ENOENT:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot destroy '%s': no such pool"),
+			    zhp->zpool_name);
+			break;
+
+		case EROFS:
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "cannot destroy '%s': one or more devices is "
+			    "read only, or '/' is mounted read only"),
+			    zhp->zpool_name);
+			break;
+
+		default:
+			zfs_baderror(errno);
+		}
+
+		if (zfp != NULL)
+			zfs_close(zfp);
+		return (-1);
+	}
+
+	/* The pool is gone; clean up after its top-level dataset. */
+	if (zfp != NULL) {
+		remove_mountpoint(zfp);
+		zfs_close(zfp);
+	}
+
+	return (0);
+}
+
+/*
+ * Add the given vdevs to the pool.  The caller must have already performed the
+ * necessary verification to ensure that the vdev specification is well-formed.
+ *
+ * 'nvroot' is packed in native encoding and handed to the kernel through
+ * ZFS_IOC_VDEV_ADD.  Returns 0 on success and -1 on failure, after reporting
+ * the error through zfs_error() or zfs_baderror().
+ */
+int
+zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+	char *packed;
+	size_t len;
+	zfs_cmd_t zc = { 0 };	/* don't hand stack garbage to the ioctl */
+	int ret, err;
+
+	verify(nvlist_size(nvroot, &len, NV_ENCODE_NATIVE) == 0);
+
+	packed = zfs_malloc(len);
+
+	verify(nvlist_pack(nvroot, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_config_src = (uint64_t)(uintptr_t)packed;
+	zc.zc_config_src_size = len;
+
+	ret = ioctl(zfs_fd, ZFS_IOC_VDEV_ADD, &zc);
+	err = errno;
+
+	/*
+	 * The packed buffer is not needed once the ioctl has returned, whether
+	 * it succeeded or not; freeing it here keeps the error path from
+	 * leaking it.  errno was captured above in case free() disturbs it.
+	 */
+	free(packed);
+
+	if (ret == 0)
+		return (0);
+
+	switch (err) {
+	case EPERM:
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot add to '%s': "
+		    "permission denied"), zhp->zpool_name);
+		break;
+
+	case EBUSY:
+		/*
+		 * This can happen if the user has specified the same
+		 * device multiple times.  We can't reliably detect this
+		 * until we try to add it and see we already have a
+		 * label.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot add to '%s': "
+		    "one or more vdevs refer to the same device"),
+		    zhp->zpool_name);
+		break;
+
+	case ENAMETOOLONG:
+		/*
+		 * One of the vdevs has exceeded VDEV_SPEC_MAX length in
+		 * its plaintext representation.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot add to '%s': "
+		    "too many devices in a single vdev"),
+		    zhp->zpool_name);
+		break;
+
+	case ENXIO:
+		/*
+		 * This is unlikely to happen since we've verified that
+		 * all the devices can be opened from userland, but it's
+		 * still possible in some circumstances.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot add to '%s': "
+		    "one or more devices is unavailable"),
+		    zhp->zpool_name);
+		break;
+
+	case EOVERFLOW:
+		/*
+		 * This occurs when one of the devices is below
+		 * SPA_MINDEVSIZE.  Unfortunately, we can't detect which
+		 * device was the problem device since there's no
+		 * reliable way to determine device size from userland.
+		 */
+		{
+			char buf[64];
+
+			zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
+
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot "
+			    "add to '%s': one or more devices is less "
+			    "than the minimum size (%s)"),
+			    zhp->zpool_name, buf);
+		}
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+
+	return (-1);
+}
+
+/*
+ * Export the pool from the system.  The caller is responsible for making sure
+ * that none of the pool's datasets are still mounted.
+ *
+ * Any zvol links are removed before ZFS_IOC_POOL_EXPORT is issued.  Returns 0
+ * on success and -1 on failure.
+ */
+int
+zpool_export(zpool_handle_t *zhp)
+{
+	zfs_cmd_t cmd = { 0 };
+
+	if (zpool_remove_zvol_links(zhp) != 0)
+		return (-1);
+
+	(void) strlcpy(cmd.zc_name, zhp->zpool_name, sizeof (cmd.zc_name));
+
+	/* Nothing more to do once the kernel accepts the request. */
+	if (ioctl(zfs_fd, ZFS_IOC_POOL_EXPORT, &cmd) == 0)
+		return (0);
+
+	/* Translate the failure into a message for the user. */
+	switch (errno) {
+	case EPERM:
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot export '%s': permission denied"),
+		    zhp->zpool_name);
+		break;
+
+	case EBUSY:
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot export '%s': pool is in use"),
+		    zhp->zpool_name);
+		break;
+
+	case ENOENT:
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "cannot export '%s': no such pool"),
+		    zhp->zpool_name);
+		break;
+
+	default:
+		zfs_baderror(errno);
+	}
+
+	return (-1);
+}
+
+/*
+ * Import the given pool using the known configuration.  The configuration
+ * should have come from zpool_find_import().  The 'newname' and 'altroot'
+ * parameters control whether the pool is imported with a different name or with
+ * an alternate root, respectively.
+ *
+ * 'newname', when given, must pass zpool_name_valid(); 'altroot', when given,
+ * must begin with '/'.  After a successful import the zvol links for the pool
+ * are created.  Returns 0 on success and -1 if validation or the import
+ * itself fails; otherwise the result of zpool_create_zvol_links().
+ */
+int
+zpool_import(nvlist_t *config, const char *newname, const char *altroot)
+{
+	zfs_cmd_t zc = { 0 };	/* don't hand stack garbage to the ioctl */
+	char *packed;
+	size_t len;
+	char *thename;
+	char *origname;
+	int ret;
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &origname) == 0);
+
+	if (newname != NULL) {
+		if (!zpool_name_valid(newname, NULL, 0)) {
+			zfs_error(dgettext(TEXT_DOMAIN, "cannot import '%s': "
+			    "invalid pool name"), newname);
+			return (-1);
+		}
+		thename = (char *)newname;
+	} else {
+		thename = origname;
+	}
+
+	if (altroot != NULL && altroot[0] != '/') {
+		zfs_error(dgettext(TEXT_DOMAIN, "cannot import '%s': alternate "
+		    "root '%s' must be a complete path"), thename,
+		    altroot);
+		return (-1);
+	}
+
+	(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
+
+	/* zc is zero-filled, so zc_root stays empty without an altroot. */
+	if (altroot != NULL)
+		(void) strlcpy(zc.zc_root, altroot, sizeof (zc.zc_root));
+
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &zc.zc_pool_guid) == 0);
+
+	verify(nvlist_size(config, &len, NV_ENCODE_NATIVE) == 0);
+
+	packed = zfs_malloc(len);
+
+	verify(nvlist_pack(config, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);
+
+	zc.zc_config_src = (uint64_t)(uintptr_t)packed;
+	zc.zc_config_src_size = len;
+
+	ret = 0;
+	if (ioctl(zfs_fd, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
+		/*
+		 * Capture errno before formatting the description; the
+		 * library calls below are not guaranteed to preserve it.
+		 */
+		int err = errno;
+		char desc[1024];
+
+		if (newname == NULL)
+			(void) snprintf(desc, sizeof (desc),
+			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
+			    thename);
+		else
+			(void) snprintf(desc, sizeof (desc),
+			    dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
+			    origname, thename);
+
+		switch (err) {
+		case EEXIST:
+			/*
+			 * A pool with that name already exists.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "%s: pool exists"),
+			    desc);
+			break;
+
+		case EPERM:
+			/*
+			 * The user doesn't have permission to create pools.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "%s: permission "
+			    "denied"), desc);
+			break;
+
+		case ENXIO:
+		case EDOM:
+			/*
+			 * Device is unavailable, or vdev sum didn't match.
+			 */
+			zfs_error(dgettext(TEXT_DOMAIN, "%s: one or more "
+			    "devices is unavailable"),
+			    desc);
+			break;
+
+		default:
+			zfs_baderror(err);
+		}
+
+		ret = -1;
+	} else {
+		zpool_handle_t *zhp;
+		/*
+		 * This should never fail, but play it safe anyway.
+		 */
+		if ((zhp = zpool_open_silent(thename)) != NULL) {
+			ret = zpool_create_zvol_links(zhp);
+			zpool_close(zhp);
+		}
+	}
+
+	free(packed);
+	return (ret);
+}
+
+/*
+ * Scrub the pool.
+ *
+ * 'type' is passed to the kernel in zc_cookie.  Returns 0 on success and -1
+ * on failure, after reporting the error through zfs_error() or
+ * zfs_baderror().
+ */
+int
+zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	int err;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_cookie = type;
+
+	if (ioctl(zfs_fd, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+		return (0);
+
+	/*
+	 * Remember why the ioctl failed before calling anything else; the
+	 * message formatting below is not guaranteed to leave errno alone.
+	 */
+	err = errno;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+
+	switch (err) {
+	case EPERM:
+		/*
+		 * No permission to scrub this pool.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+		break;
+
+	case EBUSY:
+		/*
+		 * Resilver in progress.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: currently resilvering"),
+		    msg);
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+	return (-1);
+}
+
+/*
+ * Bring the specified vdev online.
+ *
+ * A 'path' that does not begin with '/' is taken to be relative to /dev/dsk/.
+ * Returns 0 on success and -1 on failure, after reporting the error through
+ * zfs_error() or zfs_baderror().
+ */
+int
+zpool_vdev_online(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	int err;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
+	    "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_ONLINE, &zc) == 0)
+		return (0);
+
+	/*
+	 * Remember why the ioctl failed before calling anything else; the
+	 * message formatting below is not guaranteed to leave errno alone.
+	 */
+	err = errno;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot online %s"), zc.zc_prop_value);
+
+	switch (err) {
+	case ENODEV:
+		/*
+		 * Device doesn't exist
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: device not in pool"), msg);
+		break;
+
+	case EPERM:
+		/*
+		 * No permission to bring this vdev online.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+	return (-1);
+}
+
+/*
+ * Take the specified vdev offline.
+ *
+ * A 'path' that does not begin with '/' is taken to be relative to /dev/dsk/.
+ * Returns 0 on success and -1 on failure, after reporting the error through
+ * zfs_error() or zfs_baderror().
+ */
+int
+zpool_vdev_offline(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	int err;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
+	    "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_OFFLINE, &zc) == 0)
+		return (0);
+
+	/*
+	 * Remember why the ioctl failed before calling anything else; the
+	 * message formatting below is not guaranteed to leave errno alone.
+	 */
+	err = errno;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot offline %s"), zc.zc_prop_value);
+
+	switch (err) {
+	case ENODEV:
+		/*
+		 * Device doesn't exist
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: device not in pool"), msg);
+		break;
+
+	case EPERM:
+		/*
+		 * No permission to take this vdev offline.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+		break;
+
+	case EBUSY:
+		/*
+		 * There are no other replicas of this device.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: no valid replicas"), msg);
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+	return (-1);
+}
+
+/*
+ * Attach new_disk (fully described by nvroot) to old_disk.
+ * If 'replacing' is specified, the new disk will replace the old one.
+ *
+ * An 'old_disk' that does not begin with '/' is taken to be relative to
+ * /dev/dsk/.  'new_disk' is only used when building error messages.
+ *
+ * Returns 0 on success.  Note that failure is reported by returning 1, not
+ * -1, after the error has been passed to zfs_error() or zfs_baderror().
+ */
+int
+zpool_vdev_attach(zpool_handle_t *zhp,
+    const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	char *packed;
+	int ret, err;
+	size_t len;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
+	    "%s%s", old_disk[0] == '/' ? "" : "/dev/dsk/", old_disk);
+	zc.zc_cookie = replacing;
+
+	verify(nvlist_size(nvroot, &len, NV_ENCODE_NATIVE) == 0);
+
+	packed = zfs_malloc(len);
+
+	verify(nvlist_pack(nvroot, &packed, &len, NV_ENCODE_NATIVE, 0) == 0);
+
+	zc.zc_config_src = (uint64_t)(uintptr_t)packed;
+	zc.zc_config_src_size = len;
+
+	ret = ioctl(zfs_fd, ZFS_IOC_VDEV_ATTACH, &zc);
+
+	/*
+	 * Capture errno before free() and the message formatting below get a
+	 * chance to overwrite it.
+	 */
+	err = errno;
+
+	free(packed);
+
+	if (ret == 0)
+		return (0);
+
+	if (replacing)
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot replace %s with %s"), old_disk, new_disk);
+	else
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot attach %s to %s"), new_disk, old_disk);
+
+	switch (err) {
+	case EPERM:
+		/*
+		 * No permission to mess with the config.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+		break;
+
+	case ENODEV:
+		/*
+		 * Device doesn't exist.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: %s not in pool"),
+		    msg, old_disk);
+		break;
+
+	case ENOTSUP:
+		/*
+		 * Can't attach to or replace this type of vdev.
+		 */
+		if (replacing)
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "%s: cannot replace a replacing device"), msg);
+		else
+			zfs_error(dgettext(TEXT_DOMAIN,
+			    "%s: attach is only applicable to mirrors"), msg);
+		break;
+
+	case EINVAL:
+		/*
+		 * The new device must be a single disk.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "%s: <new_device> must be a single disk"), msg);
+		break;
+
+	case ENXIO:
+		/*
+		 * This is unlikely to happen since we've verified that
+		 * all the devices can be opened from userland, but it's
+		 * still possible in some circumstances.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: %s is unavailable"),
+		    msg, new_disk);
+		break;
+
+	case EBUSY:
+		/*
+		 * The new device is in use.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: %s busy"), msg, new_disk);
+		break;
+
+	case EOVERFLOW:
+		/*
+		 * The new device is too small.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: %s is too small"),
+		    msg, new_disk);
+		break;
+
+	case EDOM:
+		/*
+		 * The new device has a different alignment requirement.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "%s: devices have different sector alignment"), msg);
+		break;
+
+	case ENAMETOOLONG:
+		/*
+		 * The resulting top-level vdev spec won't fit in the label.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "%s: too many devices in a single vdev"), msg);
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+
+	return (1);
+}
+
+/*
+ * Detach the specified device.
+ *
+ * A 'path' that does not begin with '/' is taken to be relative to /dev/dsk/.
+ *
+ * Returns 0 on success.  Note that failure is reported by returning 1, not
+ * -1, after the error has been passed to zfs_error() or zfs_baderror().
+ */
+int
+zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	int err;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) snprintf(zc.zc_prop_value, sizeof (zc.zc_prop_value),
+	    "%s%s", path[0] == '/' ? "" : "/dev/dsk/", path);
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_DETACH, &zc) == 0)
+		return (0);
+
+	/*
+	 * Remember why the ioctl failed before calling anything else; the
+	 * message formatting below is not guaranteed to leave errno alone.
+	 */
+	err = errno;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot detach %s"), zc.zc_prop_value);
+
+	switch (err) {
+	case EPERM:
+		/*
+		 * No permission to mess with the config.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: permission denied"), msg);
+		break;
+
+	case ENODEV:
+		/*
+		 * Device doesn't exist.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: device not in pool"), msg);
+		break;
+
+	case ENOTSUP:
+		/*
+		 * Can't detach from this type of vdev.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN,
+		    "%s: only applicable to mirror and replacing vdevs"), msg);
+		break;
+
+	case EBUSY:
+		/*
+		 * There are no other replicas of this device.
+		 */
+		zfs_error(dgettext(TEXT_DOMAIN, "%s: no valid replicas"), msg);
+		break;
+
+	default:
+		zfs_baderror(err);
+	}
+
+	return (1);
+}
+
+/*
+ * zfs_iter_children() callback used to create or remove zvol links.  'data'
+ * carries the link type: non-zero creates a link, zero removes it.  The handle
+ * is closed before returning, and the return value is that of the recursive
+ * walk over the dataset's children.
+ */
+static int
+do_zvol(zfs_handle_t *zhp, void *data)
+{
+	int create = (int)(uintptr_t)data;
+	int err;
+
+	/*
+	 * Test volblocksize rather than ZFS_TYPE_VOLUME so that snapshots of
+	 * volumes are handled as well.  The outcome of the link operation does
+	 * not influence the value returned to the caller.
+	 */
+	if (zhp->zfs_volblocksize != 0) {
+		if (create)
+			(void) zvol_create_link(zhp->zfs_name);
+		else
+			(void) zvol_remove_link(zhp->zfs_name);
+	}
+
+	err = zfs_iter_children(zhp, do_zvol, data);
+	zfs_close(zhp);
+
+	return (err);
+}
+
+/*
+ * Walk every dataset in the pool and create the minor nodes needed by any
+ * zvols found.  Returns the result of the walk, or 0 when no dataset handle
+ * can be made for the pool.
+ */
+int
+zpool_create_zvol_links(zpool_handle_t *zhp)
+{
+	zfs_handle_t *root = make_dataset_handle(zhp->zpool_name);
+	int err;
+
+	/* An unavailable pool is not an error; there is nothing to do. */
+	if (root == NULL)
+		return (0);
+
+	err = zfs_iter_children(root, do_zvol, (void *)TRUE);
+	zfs_close(root);
+
+	return (err);
+}
+
+/*
+ * Walk every dataset in the pool and remove the minor nodes of any zvols
+ * found.  Returns the result of the walk, or 0 when no dataset handle can be
+ * made for the pool.
+ */
+int
+zpool_remove_zvol_links(zpool_handle_t *zhp)
+{
+	zfs_handle_t *root = make_dataset_handle(zhp->zpool_name);
+	int err;
+
+	/* An unavailable pool is not an error; there is nothing to do. */
+	if (root == NULL)
+		return (0);
+
+	err = zfs_iter_children(root, do_zvol, (void *)FALSE);
+	zfs_close(root);
+
+	return (err);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c
new file mode 100644
index 000000000000..27a86d0c3c9a
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_status.c
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * This file contains the functions which analyze the status of a pool.  This
+ * includes both the status of an active pool and the status of exported
+ * pools.  Returns one of the ZPOOL_STATUS_* defines describing the status of
+ * the pool.  This status is independent (to a certain degree) from the state of
+ * the pool.  A pool's state describes only whether or not it is capable of
+ * providing the necessary fault tolerance for data.  The status describes the
+ * overall status of devices.  A pool that is online can still have a device
+ * that is experiencing errors.
+ *
+ * Only a subset of the possible faults can be detected using 'zpool status',
+ * and not all possible errors correspond to a FMA message ID.  The explanation
+ * is left up to the caller, depending on whether it is a live pool or an
+ * import.
+ */
+
+#include <libzfs.h>
+#include <string.h>
+#include "libzfs_impl.h"
+
+/*
+ * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
+ * in libzfs.h.  Note that there are some status results which go past the end
+ * of this table, and hence have no associated message ID.
+ *
+ * The table is indexed directly by zpool_status_t value; the lookup routines
+ * in this file compare the status against NMSGID before indexing.
+ */
+static char *msgid_table[] = {
+	"ZFS-8000-14",
+	"ZFS-8000-2Q",
+	"ZFS-8000-3C",
+	"ZFS-8000-4J",
+	"ZFS-8000-5E",
+	"ZFS-8000-6X",
+	"ZFS-8000-72",
+	"ZFS-8000-8A",
+	"ZFS-8000-9P",
+	"ZFS-8000-A5"
+};
+
+/* Number of entries in msgid_table. */
+#define	NMSGID	(sizeof (msgid_table) / sizeof (msgid_table[0]))
+
+/*
+ * find_vdev_problem() predicate: true when the vdev could not be opened and
+ * the auxiliary state says that the open itself failed.  'errs' is unused.
+ */
+/* ARGSUSED */
+static int
+vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
+{
+	if (state != VDEV_STATE_CANT_OPEN)
+		return (0);
+
+	return (aux == VDEV_AUX_OPEN_FAILED);
+}
+
+/*
+ * find_vdev_problem() predicate: true when the vdev has a non-zero error
+ * count.  'state' and 'aux' are unused.
+ */
+/* ARGSUSED */
+static int
+vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
+{
+	if (errs == 0)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * find_vdev_problem() predicate: true when the vdev is in the CANT_OPEN state,
+ * whatever the auxiliary reason.  'aux' and 'errs' are unused.
+ */
+/* ARGSUSED */
+static int
+vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
+{
+	return (state == VDEV_STATE_CANT_OPEN);
+}
+
+/*
+ * find_vdev_problem() predicate: true when the vdev is in the OFFLINE state.
+ * 'aux' and 'errs' are unused.
+ */
+/* ARGSUSED */
+static int
+vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
+{
+	return (state == VDEV_STATE_OFFLINE);
+}
+
+/*
+ * Detect whether any leaf device under 'vdev' satisfies the predicate 'func'.
+ *
+ * 'func' is called with the leaf's state, its auxiliary state, and the sum of
+ * its read, write and checksum error counts.  Returns TRUE as soon as one
+ * leaf matches and FALSE otherwise.
+ */
+static int
+find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
+{
+	nvlist_t **kids;
+	vdev_stat_t *stats;
+	uint_t i, nkids, nstats;
+	uint64_t total_errs;
+	char *vtype;
+
+	/*
+	 * Problems inside a 'replacing' vdev are deliberately skipped: they
+	 * are presumably being repaired right now, and the resilver itself is
+	 * picked up later.
+	 */
+	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &vtype) == 0);
+	if (strcmp(vtype, VDEV_TYPE_REPLACING) == 0)
+		return (FALSE);
+
+	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &kids,
+	    &nkids) != 0) {
+		/* No children array: this is a leaf, so test its stats. */
+		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
+		    (uint64_t **)&stats, &nstats) == 0);
+
+		total_errs = stats->vs_read_errors +
+		    stats->vs_write_errors +
+		    stats->vs_checksum_errors;
+
+		if (func(stats->vs_state, stats->vs_aux, total_errs))
+			return (TRUE);
+
+		return (FALSE);
+	}
+
+	/* Interior vdev: recurse until some descendant matches. */
+	for (i = 0; i < nkids; i++) {
+		if (find_vdev_problem(kids[i], func))
+			return (TRUE);
+	}
+
+	return (FALSE);
+}
+
+/*
+ * Pool health status, for both active pools and import candidates.
+ *
+ * To determine the status for a pool, we make several passes over the config
+ * and report the first problem found.  The checks are made in this order:
+ *
+ *	- Root vdev can't be opened because of a bad GUID sum
+ *	- Any missing devices
+ *	- Any other devices that can't be opened (corrupted labels)
+ *	- Any devices showing errors (skipped when 'isimport' is set)
+ *	- Any offlined devices
+ *	- An unfinished resilver
+ *
+ * There can obviously be multiple errors within a single pool, so this routine
+ * only reports the first one in the list above.
+ */
+static zpool_status_t
+check_status(nvlist_t *config, int isimport)
+{
+	nvlist_t *root;
+	vdev_stat_t *rootstat;
+	uint_t nstat;
+	int cant_open;
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &root) == 0);
+	verify(nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&rootstat, &nstat) == 0);
+
+	/* Whether the root vdev itself is unopenable picks the _NR variants. */
+	cant_open = (rootstat->vs_state == VDEV_STATE_CANT_OPEN);
+
+	/*
+	 * Check that the config is complete.
+	 */
+	if (cant_open && rootstat->vs_aux == VDEV_AUX_BAD_GUID_SUM)
+		return (ZPOOL_STATUS_BAD_GUID_SUM);
+
+	/*
+	 * Missing devices
+	 */
+	if (find_vdev_problem(root, vdev_missing)) {
+		return (cant_open ? ZPOOL_STATUS_MISSING_DEV_NR :
+		    ZPOOL_STATUS_MISSING_DEV_R);
+	}
+
+	/*
+	 * Devices with corrupted labels: any remaining leaf that is in the
+	 * CANT_OPEN state.
+	 */
+	if (find_vdev_problem(root, vdev_broken)) {
+		return (cant_open ? ZPOOL_STATUS_CORRUPT_LABEL_NR :
+		    ZPOOL_STATUS_CORRUPT_LABEL_R);
+	}
+
+	/*
+	 * Devices with errors; not checked for a pool being imported.
+	 */
+	if (!isimport && find_vdev_problem(root, vdev_errors))
+		return (ZPOOL_STATUS_FAILING_DEV);
+
+	/*
+	 * Offlined devices
+	 */
+	if (find_vdev_problem(root, vdev_offlined))
+		return (ZPOOL_STATUS_OFFLINE_DEV);
+
+	/*
+	 * Currently resilvering
+	 */
+	if (!rootstat->vs_scrub_complete &&
+	    rootstat->vs_scrub_type == POOL_SCRUB_RESILVER)
+		return (ZPOOL_STATUS_RESILVERING);
+
+	/*
+	 * We currently have no way to detect the following errors:
+	 *
+	 * 	CORRUPT_CACHE
+	 * 	VERSION_MISMATCH
+	 * 	CORRUPT_POOL
+	 * 	CORRUPT_DATA
+	 */
+
+	return (ZPOOL_STATUS_OK);
+}
+
+/*
+ * Return the status of an active pool, computed from the config held in the
+ * handle.  '*msgid' is set to the matching entry of msgid_table, or to NULL
+ * when the status lies beyond the end of the table.
+ */
+zpool_status_t
+zpool_get_status(zpool_handle_t *zhp, char **msgid)
+{
+	zpool_status_t status = check_status(zhp->zpool_config, FALSE);
+
+	*msgid = (status >= NMSGID) ? NULL : msgid_table[status];
+
+	return (status);
+}
+
+/*
+ * Return the status of a pool that is a candidate for import, computed from
+ * the given config.  '*msgid' is set to the matching entry of msgid_table, or
+ * to NULL when the status lies beyond the end of the table.
+ */
+zpool_status_t
+zpool_import_status(nvlist_t *config, char **msgid)
+{
+	zpool_status_t status = check_status(config, TRUE);
+
+	*msgid = (status >= NMSGID) ? NULL : msgid_table[status];
+
+	return (status);
+}
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
new file mode 100644
index 000000000000..2f5c538212af
--- /dev/null
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Internal utility routines for the ZFS library.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <sys/mnttab.h>
+
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+
+/*
+ * File descriptor for the ZFS device (ZFS_DEV).  Opened by zfs_init() and
+ * closed by zfs_fini().
+ */
+int zfs_fd;
+
+/*
+ * Optional error handler installed through zfs_set_error_handler().  While it
+ * is NULL, zfs_error() and zfs_fatal() print to stderr instead.
+ */
+void (*error_func)(const char *, va_list);
+
+/*
+ * Report an error.
+ *
+ * All error handling is kept within libzfs where we have the most information
+ * immediately available.  While this may not be suitable for a general purpose
+ * library, it greatly simplifies our commands.
+ *
+ * If a handler has been installed with zfs_set_error_handler(), the format
+ * string and arguments are handed to it.  Otherwise the formatted message is
+ * written to stderr, followed by a newline.
+ */
+void
+zfs_error(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (error_func == NULL) {
+		(void) vfprintf(stderr, fmt, args);
+		(void) fprintf(stderr, "\n");
+	} else {
+		error_func(fmt, args);
+	}
+
+	va_end(args);
+}
+
+/*
+ * An internal error is something that we cannot recover from, and should never
+ * happen (such as running out of memory).  It should only be used in
+ * exceptional circumstances.
+ *
+ * The message is reported the same way zfs_error() does it (through the
+ * installed handler, or on stderr followed by a newline), after which the
+ * process exits with status 1.
+ */
+void
+zfs_fatal(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (error_func == NULL) {
+		(void) vfprintf(stderr, fmt, args);
+		(void) fprintf(stderr, "\n");
+	} else {
+		error_func(fmt, args);
+	}
+
+	va_end(args);
+
+	exit(1);
+}
+
+/*
+ * Consumers (such as the JNI interface) that need to capture error output can
+ * override the default error handler using this function.
+ *
+ * 'func' receives the format string and argument list that zfs_error() or
+ * zfs_fatal() was called with.  Passing NULL restores the default behavior of
+ * printing to stderr.
+ */
+void
+zfs_set_error_handler(void (*func)(const char *, va_list))
+{
+	error_func = func;
+}
+
+/*
+ * Display an out of memory error message and terminate the current program
+ * through zfs_fatal().
+ *
+ * NOTE(review): zfs_fatal() already appends a newline when no error handler
+ * is installed, so the trailing "\n" in this message yields a blank line.
+ */
+void
+no_memory(void)
+{
+	/*
+	 * malloc(3C) documents both ENOMEM and EAGAIN as failures caused by
+	 * a lack of memory, so either one is a legitimate reason to be here.
+	 */
+	assert(errno == ENOMEM || errno == EAGAIN);
+	zfs_fatal(dgettext(TEXT_DOMAIN, "internal error: out of memory\n"));
+}
+
+/*
+ * A safe form of malloc() which will die if the allocation fails.  The
+ * returned memory is zero-filled, since it is obtained with calloc().
+ */
+void *
+zfs_malloc(size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf == NULL)
+		no_memory();
+
+	return (buf);
+}
+
+/*
+ * A safe form of strdup() which will die if the allocation fails.  The copy
+ * comes from strdup() and is the caller's to release.
+ */
+char *
+zfs_strdup(const char *str)
+{
+	char *copy = strdup(str);
+
+	if (copy == NULL)
+		no_memory();
+
+	return (copy);
+}
+
+/*
+ * Initialize the library.  Registered with '#pragma init'.
+ *
+ * Opens the ZFS device (ZFS_DEV) read-write and the mount table (MNTTAB) for
+ * reading, and dies through zfs_fatal() if either cannot be opened.  The
+ * share table is opened as well, but a failure there is tolerated and leaves
+ * sharetab_file set to NULL.
+ */
+#pragma init(zfs_init)
+void
+zfs_init(void)
+{
+	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0)
+		zfs_fatal(dgettext(TEXT_DOMAIN,
+		    "internal error: cannot open zfs device"));
+
+	if ((mnttab_file = fopen(MNTTAB, "r")) == NULL)
+		zfs_fatal(dgettext(TEXT_DOMAIN, "internal error: unable to "
+		    "open %s\n"), MNTTAB);
+
+	/* Not fatal: the result is stored unchecked and may be NULL. */
+	sharetab_file = fopen("/etc/dfs/sharetab", "r");
+}
+
+/*
+ * Cleanup function for library.  Releases everything that was opened as part
+ * of zfs_init(): the ZFS device descriptor and the mnttab and sharetab
+ * streams.
+ */
+#pragma fini(zfs_fini)
+void
+zfs_fini(void)
+{
+	(void) close(zfs_fd);
+
+	/*
+	 * Either stream may be NULL: sharetab is optional, and zfs_init() can
+	 * exit before mnttab has been opened.
+	 */
+	if (mnttab_file != NULL) {
+		(void) fclose(mnttab_file);
+		mnttab_file = NULL;
+	}
+
+	if (sharetab_file != NULL) {
+		(void) fclose(sharetab_file);
+		sharetab_file = NULL;
+	}
+}
+
+/*
+ * Convert a number to an appropriately human-readable output.
+ *
+ * 'num' is scaled down by powers of 1024 (rounding to nearest at each step)
+ * until it is below 1024, and the matching suffix from " KMGTPE" is appended.
+ * Values below 1024 are printed as-is, without a suffix.  When the scaled
+ * value is below 10 (or below 100) and 'num' is not an exact power of two, two
+ * decimal places (or one) are shown.  The result is written to 'buf' with
+ * snprintf(), bounded by 'buflen'.
+ */
+void
+zfs_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	uint64_t n = num;
+	int index = 0;
+	char u;
+
+	while (n >= 1024) {
+		/*
+		 * Round to nearest.  This is (n + 512) / 1024, written so
+		 * that it cannot wrap around for values near UINT64_MAX.
+		 */
+		n = n / 1024 + ((n % 1024) >= 512 ? 1 : 0);
+		index++;
+	}
+
+	u = " KMGTPE"[index];
+
+	/* uint64_t need not be 'unsigned long long', so cast for %llu. */
+	if (index == 0)
+		(void) snprintf(buf, buflen, "%llu", (unsigned long long)n);
+	else if (n < 10 && (num & (num - 1)) != 0)
+		(void) snprintf(buf, buflen, "%.2f%c",
+		    (double)num / (1ULL << 10 * index), u);
+	else if (n < 100 && (num & (num - 1)) != 0)
+		(void) snprintf(buf, buflen, "%.1f%c",
+		    (double)num / (1ULL << 10 * index), u);
+	else
+		(void) snprintf(buf, buflen, "%llu%c",
+		    (unsigned long long)n, u);
+}
diff --git a/usr/src/lib/libzfs/common/llib-lzfs b/usr/src/lib/libzfs/common/llib-lzfs
new file mode 100644
index 000000000000..83ac1841a89d
--- /dev/null
+++ b/usr/src/lib/libzfs/common/llib-lzfs
@@ -0,0 +1,32 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*LINTLIBRARY*/
+/*PROTOLIB1*/
+
+#include <libzfs.h>
diff --git a/usr/src/lib/libzfs/i386/Makefile b/usr/src/lib/libzfs/i386/Makefile
new file mode 100644
index 000000000000..cd02883abf56
--- /dev/null
+++ b/usr/src/lib/libzfs/i386/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libzfs/inc.flg b/usr/src/lib/libzfs/inc.flg
new file mode 100644
index 000000000000..94a1191086e4
--- /dev/null
+++ b/usr/src/lib/libzfs/inc.flg
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+find_files "s.*" usr/src/common/zfs
+find_files "s.*" usr/src/uts/common/fs/zfs/sys
+echo_file usr/src/uts/common/sys/fs/zfs.h
diff --git a/usr/src/lib/libzfs/sparc/Makefile b/usr/src/lib/libzfs/sparc/Makefile
new file mode 100644
index 000000000000..cd02883abf56
--- /dev/null
+++ b/usr/src/lib/libzfs/sparc/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libzfs/sparcv9/Makefile b/usr/src/lib/libzfs/sparcv9/Makefile
new file mode 100644
index 000000000000..44075ed1bddf
--- /dev/null
+++ b/usr/src/lib/libzfs/sparcv9/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libzfs/spec/Makefile b/usr/src/lib/libzfs/spec/Makefile
new file mode 100644
index 000000000000..2cb984bfc990
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/Makefile
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include $(SRC)/lib/Makefile.spec.arch
diff --git a/usr/src/lib/libzfs/spec/Makefile.targ b/usr/src/lib/libzfs/spec/Makefile.targ
new file mode 100644
index 000000000000..5af8faa76729
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/Makefile.targ
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+LIBRARY=	libzfs.a
+VERS=		.1
+
+OBJECTS=	libzfs.o
diff --git a/usr/src/lib/libzfs/spec/amd64/Makefile b/usr/src/lib/libzfs/spec/amd64/Makefile
new file mode 100644
index 000000000000..98db1f927173
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/amd64/Makefile
@@ -0,0 +1,35 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libzfs/spec/i386/Makefile b/usr/src/lib/libzfs/spec/i386/Makefile
new file mode 100644
index 000000000000..6256c68c81d7
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/i386/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libzfs/spec/libzfs.spec b/usr/src/lib/libzfs/spec/libzfs.spec
new file mode 100644
index 000000000000..a8949c963649
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/libzfs.spec
@@ -0,0 +1,341 @@
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+function zfs_backup
+version SUNWprivate_1.1
+end
+
+function zfs_clone
+version SUNWprivate_1.1
+end
+
+function zfs_close
+version SUNWprivate_1.1
+end
+
+function zfs_create
+version SUNWprivate_1.1
+end
+
+function zfs_destroy
+version SUNWprivate_1.1
+end
+
+function zfs_get_name
+version SUNWprivate_1.1
+end
+
+function zfs_get_type
+version SUNWprivate_1.1
+end
+
+function zfs_init
+version SUNWprivate_1.1
+end
+
+function zfs_is_mounted
+version SUNWprivate_1.1
+end
+
+function zfs_is_shared
+version SUNWprivate_1.1
+end
+
+function zfs_iter_children
+version SUNWprivate_1.1
+end
+
+function zfs_iter_dependents
+version SUNWprivate_1.1
+end
+
+function zfs_iter_root
+version SUNWprivate_1.1
+end
+
+function zfs_mount
+version SUNWprivate_1.1
+end
+
+function zfs_name_to_prop
+version SUNWprivate_1.1
+end
+
+function zfs_name_valid
+version SUNWprivate_1.1
+end
+
+function zfs_nicenum
+version SUNWprivate_1.1
+end
+
+function zfs_nicestrtonum
+version SUNWprivate_1.1
+end
+
+function zfs_open
+version SUNWprivate_1.1
+end
+
+function zfs_prop_column_name
+version SUNWprivate_1.1
+end
+
+function zfs_prop_column_format
+version SUNWprivate_1.1
+end
+
+function zfs_prop_column_subopts
+version SUNWprivate_1.1
+end
+
+function zfs_prop_column_short_subopts
+version SUNWprivate_1.1
+end
+
+function zfs_prop_default_numeric
+version SUNWprivate_1.1
+end
+
+function zfs_prop_default_string
+version SUNWprivate_1.1
+end
+
+function zfs_prop_get
+version SUNWprivate_1.1
+end
+
+function zfs_prop_get_int
+version SUNWprivate_1.1
+end
+
+function zfs_prop_get_numeric
+version SUNWprivate_1.1
+end
+
+function zfs_prop_inherit
+version SUNWprivate_1.1
+end
+
+function zfs_prop_inheritable
+version SUNWprivate_1.1
+end
+
+function zfs_prop_is_string
+version SUNWprivate_1.1
+end
+
+function zfs_prop_readonly
+version SUNWprivate_1.1
+end
+
+function zfs_prop_set
+version SUNWprivate_1.1
+end
+
+function zfs_prop_valid_for_type
+version SUNWprivate_1.1
+end
+
+function zfs_prop_validate
+version SUNWprivate_1.1
+end
+
+function zfs_prop_values
+version SUNWprivate_1.1
+end
+
+function zfs_prop_to_name
+version SUNWprivate_1.1
+end
+
+function zfs_refresh_properties
+version SUNWprivate_1.1
+end
+
+function zfs_rename
+version SUNWprivate_1.1
+end
+
+function zfs_restore
+version SUNWprivate_1.1
+end
+
+function zfs_rollback
+version SUNWprivate_1.1
+end
+
+function zfs_set_error_handler
+version SUNWprivate_1.1
+end
+
+function zfs_share
+version SUNWprivate_1.1
+end
+
+function zfs_snapshot
+version SUNWprivate_1.1
+end
+
+function zfs_type_to_name
+version SUNWprivate_1.1
+end
+
+function zfs_unmount
+version SUNWprivate_1.1
+end
+
+function zfs_unmountall
+version SUNWprivate_1.1
+end
+
+function zfs_unshare
+version SUNWprivate_1.1
+end
+
+function zfs_unshareall
+version SUNWprivate_1.1
+end
+
+function zpool_add
+version SUNWprivate_1.1
+end
+
+function zpool_close
+version SUNWprivate_1.1
+end
+
+function zpool_create
+version SUNWprivate_1.1
+end
+
+function zpool_create_zvol_links
+version SUNWprivate_1.1
+end
+
+function zpool_destroy
+version SUNWprivate_1.1
+end
+
+function zpool_export
+version SUNWprivate_1.1
+end
+
+function zpool_find_import
+version SUNWprivate_1.1
+end
+
+function zpool_get_config
+version SUNWprivate_1.1
+end
+
+function zpool_get_guid
+version SUNWprivate_1.1
+end
+
+function zpool_get_name
+version SUNWprivate_1.1
+end
+
+function zpool_get_root
+version SUNWprivate_1.1
+end
+
+function zpool_get_space_total
+version SUNWprivate_1.1
+end
+
+function zpool_get_space_used
+version SUNWprivate_1.1
+end
+
+function zpool_get_state
+version SUNWprivate_1.1
+end
+
+function zpool_get_status
+version SUNWprivate_1.1
+end
+
+function zpool_import
+version SUNWprivate_1.1
+end
+
+function zpool_import_status
+version SUNWprivate_1.1
+end
+
+function zpool_scrub
+version SUNWprivate_1.1
+end
+
+function zpool_in_use
+version SUNWprivate_1.1
+end
+
+function zpool_iter
+version SUNWprivate_1.1
+end
+
+function zpool_open
+version SUNWprivate_1.1
+end
+
+function zpool_open_canfail
+version SUNWprivate_1.1
+end
+
+function zpool_read_label
+version SUNWprivate_1.1
+end
+
+function zpool_refresh_stats
+version SUNWprivate_1.1
+end
+
+function zpool_remove_zvol_links
+version SUNWprivate_1.1
+end
+
+function zpool_vdev_online
+version SUNWprivate_1.1
+end
+
+function zpool_vdev_offline
+version SUNWprivate_1.1
+end
+
+function zpool_vdev_attach
+version SUNWprivate_1.1
+end
+
+function zpool_vdev_detach
+version SUNWprivate_1.1
+end
diff --git a/usr/src/lib/libzfs/spec/sparc/Makefile b/usr/src/lib/libzfs/spec/sparc/Makefile
new file mode 100644
index 000000000000..6256c68c81d7
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/sparc/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libzfs/spec/sparcv9/Makefile b/usr/src/lib/libzfs/spec/sparcv9/Makefile
new file mode 100644
index 000000000000..98db1f927173
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/sparcv9/Makefile
@@ -0,0 +1,35 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libzfs/spec/versions b/usr/src/lib/libzfs/spec/versions
new file mode 100644
index 000000000000..0cbdf7e792e6
--- /dev/null
+++ b/usr/src/lib/libzfs/spec/versions
@@ -0,0 +1,40 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+sparc {
+	SUNWprivate_1.1;
+}
+sparcv9 {
+	SUNWprivate_1.1;
+}
+i386 {
+	SUNWprivate_1.1;
+}
+amd64 {
+	SUNWprivate_1.1;
+}
diff --git a/usr/src/lib/libzfs_jni/Makefile b/usr/src/lib/libzfs_jni/Makefile
new file mode 100644
index 000000000000..2e46af841e00
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include		../Makefile.lib
+
+HDRS=		libzfs_jni_dataset.h \
+		libzfs_jni_disk.h \
+		libzfs_jni_diskmgt.h \
+		libzfs_jni_main.h \
+		libzfs_jni_pool.h \
+		libzfs_jni_property.h \
+		libzfs_jni_util.h
+
+HDRDIR=		common
+
+SUBDIRS=	$(MACH)
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all :=		TARGET= all
+clean :=	TARGET= clean
+clobber :=	TARGET= clobber
+install :=	TARGET= install
+lint :=		TARGET= lint
+
+MSGFILES =
+
+POFILE =
+
+.KEEP_STATE:
+
+all clean clobber install: spec .WAIT $(SUBDIRS)
+
+$(POFILE):	pofile_MSGFILES
+
+lint: $(SUBDIRS)
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+_msg: $(MSGDOMAINPOFILE)
+
+$(SUBDIRS) spec: FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
+include ../../Makefile.msg.targ
diff --git a/usr/src/lib/libzfs_jni/Makefile.com b/usr/src/lib/libzfs_jni/Makefile.com
new file mode 100644
index 000000000000..b9282caa8494
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/Makefile.com
@@ -0,0 +1,69 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+LIBRARY= libzfs_jni.a
+VERS= .1
+
+OBJS_COMMON=	libzfs_jni_dataset.o \
+		libzfs_jni_disk.o \
+		libzfs_jni_diskmgt.o \
+		libzfs_jni_main.o \
+		libzfs_jni_pool.o \
+		libzfs_jni_property.o \
+		libzfs_jni_util.o
+OBJECTS= $(OBJS_COMMON)
+
+include ../../Makefile.lib
+
+LIBS=	$(DYNLIB) $(LINTLIB)
+
+INCS += -I$(SRCDIR) \
+	-I../../../common/zfsj \
+	-I$(JAVA_ROOT)/include \
+	-I$(JAVA_ROOT)/include/solaris
+
+LDLIBS +=	-lc -lnvpair -ldiskmgt -lzfs
+CPPFLAGS +=	$(INCS)
+
+SRCS=	$(OBJS_COMMON:%.o=$(SRCDIR)/%.c)
+$(LINTLIB) := SRCS=	$(SRCDIR)/$(LINTSRC)
+
+SRCDIR=		../common
+MAPDIR=		../spec/$(TRANSMACH)
+SPECMAPFILE=	$(MAPDIR)/mapfile
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: lintcheck
+
+pics/%.o: ../../../common/zfsj/%.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
+include ../../Makefile.targ
diff --git a/usr/src/lib/libzfs_jni/amd64/Makefile b/usr/src/lib/libzfs_jni/amd64/Makefile
new file mode 100644
index 000000000000..44075ed1bddf
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/amd64/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.c
new file mode 100644
index 000000000000..124e52d8e184
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.c
@@ -0,0 +1,710 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_dataset.h"
+#include "libzfs_jni_property.h"
+#include <strings.h>
+
+#define	REGEX_ZFS_NAME "^((([^/]*)(/.+)?)[/@])?([^/]+)/*"
+#define	REGEX_ZFS_NAME_NGROUPS	6	/* group 0 (whole match) + 5 */
+#define	REGEX_ZFS_NAME_POOL_GROUP 3	/* leading run of non-'/' chars */
+#define	REGEX_ZFS_NAME_PARENT_GROUP 2	/* pool plus optional "/..." path */
+#define	REGEX_ZFS_NAME_BASE_GROUP 5	/* ([^/]+) after the [/@] separator */
+
+/*
+ * Types
+ */
+
+typedef struct DatasetBean {
+	zjni_Object_t super;	/* keep first: beans are cast to this type */
+
+	jmethodID method_setPoolName;	/* setPoolName(String) */
+	jmethodID method_setParentName;	/* setParentName(String) */
+	jmethodID method_setBaseName;	/* setBaseName(String) */
+	jmethodID method_setProperties;	/* setProperties(Property[]) */
+	jmethodID method_addProperty;	/* addProperty(Property) */
+} DatasetBean_t;
+
+typedef struct FileSystemBean {
+	DatasetBean_t super;	/* base type; cast target, keep first */
+} FileSystemBean_t;
+
+typedef struct PoolBean {
+	FileSystemBean_t super;	/* base type; cast target, keep first */
+} PoolBean_t;
+
+typedef struct VolumeBean {
+	DatasetBean_t super;	/* base type; cast target, keep first */
+} VolumeBean_t;
+
+typedef struct SnapshotBean {
+	DatasetBean_t super;	/* base type; cast target, keep first */
+} SnapshotBean_t;
+
+typedef struct FileSystemSnapshotBean {
+	SnapshotBean_t super;	/* direct parent, as cast by the new_/populate_ */
+} FileSystemSnapshotBean_t;
+
+typedef struct VolumeSnapshotBean {
+	SnapshotBean_t super;	/* direct parent, as cast by the new_/populate_ */
+} VolumeSnapshotBean_t;
+
+/*
+ * Function prototypes
+ */
+
+static void new_DatasetBean(JNIEnv *, DatasetBean_t *);
+static void new_PoolBean(JNIEnv *, PoolBean_t *);
+static void new_FileSystemBean(JNIEnv *, FileSystemBean_t *);
+static void new_VolumeBean(JNIEnv *, VolumeBean_t *);
+static void new_SnapshotBean(JNIEnv *, SnapshotBean_t *);
+static void new_FileSystemSnapshotBean(JNIEnv *, FileSystemSnapshotBean_t *);
+static void new_VolumeSnapshotBean(JNIEnv *, VolumeSnapshotBean_t *);
+static int populate_DatasetBean(JNIEnv *, zfs_handle_t *, DatasetBean_t *);
+static int populate_PoolBean(JNIEnv *, zfs_handle_t *, PoolBean_t *);
+static int populate_FileSystemBean(
+    JNIEnv *, zfs_handle_t *, FileSystemBean_t *);
+static int populate_VolumeBean(
+    JNIEnv *, zfs_handle_t *, VolumeBean_t *);
+static int populate_SnapshotBean(JNIEnv *, zfs_handle_t *, SnapshotBean_t *);
+static int populate_FileSystemSnapshotBean(
+    JNIEnv *, zfs_handle_t *, FileSystemSnapshotBean_t *);
+static int populate_VolumeSnapshotBean(
+    JNIEnv *, zfs_handle_t *, VolumeSnapshotBean_t *);
+static jobject create_PoolBean(JNIEnv *, zfs_handle_t *);
+static jobject create_FileSystemBean(JNIEnv *, zfs_handle_t *);
+static jobject create_VolumeBean(JNIEnv *, zfs_handle_t *);
+static jobject create_FileSystemSnapshotBean(JNIEnv *, zfs_handle_t *);
+static jobject create_VolumeSnapshotBean(JNIEnv *, zfs_handle_t *);
+static jobject create_DatasetBean(JNIEnv *, zfs_handle_t *);
+static int is_fs_snapshot(zfs_handle_t *);
+static int is_pool(zfs_handle_t *);
+static zfs_handle_t *open_device(JNIEnv *, jstring, zfs_type_t);
+
+/*
+ * Static functions
+ */
+
+/*
+ * Bind a DatasetBean: create the Java object unless a more-derived
+ * new_*() routine already did, then look up the IDs of the DatasetBean
+ * setters so the populate_*() routines can call them.
+ */
+static void
+new_DatasetBean(JNIEnv *env, DatasetBean_t *bean)
+{
+	zjni_Object_t *base = &bean->super;
+	const char *string_setter = "(Ljava/lang/String;)V";
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "DatasetBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	/* Setters taking a single java.lang.String */
+	bean->method_setPoolName = (*env)->GetMethodID(
+	    env, base->class, "setPoolName", string_setter);
+	bean->method_setParentName = (*env)->GetMethodID(
+	    env, base->class, "setParentName", string_setter);
+	bean->method_setBaseName = (*env)->GetMethodID(
+	    env, base->class, "setBaseName", string_setter);
+
+	/* Setters taking a Property array and a single Property */
+	bean->method_setProperties = (*env)->GetMethodID(env, base->class,
+	    "setProperties", "([L" ZFSJNI_PACKAGE_DATA "Property;)V");
+	bean->method_addProperty = (*env)->GetMethodID(env, base->class,
+	    "addProperty", "(L" ZFSJNI_PACKAGE_DATA "Property;)V");
+}
+
+/*
+ * Bind a PoolBean: create the Java object unless it already exists,
+ * then let new_FileSystemBean() resolve the inherited method IDs.
+ */
+static void
+new_PoolBean(JNIEnv *env, PoolBean_t *bean)
+{
+	zjni_Object_t *base = &bean->super.super.super;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "PoolBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_FileSystemBean(env, &bean->super);
+}
+
+/*
+ * Bind a FileSystemBean: create the Java object unless a more-derived
+ * new_*() routine already did, then resolve the inherited method IDs.
+ */
+static void
+new_FileSystemBean(JNIEnv *env, FileSystemBean_t *bean)
+{
+	zjni_Object_t *base = &bean->super.super;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "FileSystemBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_DatasetBean(env, &bean->super);
+}
+
+/*
+ * Bind a VolumeBean: create the Java object unless it already exists,
+ * then let new_DatasetBean() resolve the inherited method IDs.
+ */
+static void
+new_VolumeBean(JNIEnv *env, VolumeBean_t *bean)
+{
+	zjni_Object_t *base = &bean->super.super;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "VolumeBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_DatasetBean(env, &bean->super);
+}
+
+/*
+ * Bind a SnapshotBean: create the Java object unless a more-derived
+ * new_*() routine already did, then resolve the inherited method IDs.
+ */
+static void
+new_SnapshotBean(JNIEnv *env, SnapshotBean_t *bean)
+{
+	zjni_Object_t *base = &bean->super.super;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "SnapshotBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_DatasetBean(env, &bean->super);
+}
+
+/*
+ * Bind a FileSystemSnapshotBean: create the Java object unless it already
+ * exists, then let new_SnapshotBean() resolve the inherited method IDs.
+ */
+static void
+new_FileSystemSnapshotBean(JNIEnv *env, FileSystemSnapshotBean_t *bean)
+{
+	zjni_Object_t *base = (zjni_Object_t *)bean;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "FileSystemSnapshotBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_SnapshotBean(env, (SnapshotBean_t *)bean);
+}
+
+/*
+ * Bind a VolumeSnapshotBean: create the Java object unless it already
+ * exists, then let new_SnapshotBean() resolve the inherited method IDs.
+ */
+static void
+new_VolumeSnapshotBean(JNIEnv *env, VolumeSnapshotBean_t *bean)
+{
+	zjni_Object_t *base = (zjni_Object_t *)bean;
+
+	if (base->object == NULL) {
+		base->class = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "VolumeSnapshotBean");
+		base->constructor =
+		    (*env)->GetMethodID(env, base->class, "<init>", "()V");
+		base->object =
+		    (*env)->NewObject(env, base->class, base->constructor);
+	}
+
+	new_SnapshotBean(env, (SnapshotBean_t *)bean);
+}
+
+/*
+ * Set the names and properties of an already-constructed bean from zhp,
+ * whose name has the form <pool>[[/<container...>]/<dataset>[@<snapshot>]].
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+populate_DatasetBean(JNIEnv *env, zfs_handle_t *zhp, DatasetBean_t *bean)
+{
+	jstring poolUTF, parentUTF, baseUTF;
+	jobjectArray properties;
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+	regex_t re;
+	regmatch_t matches[REGEX_ZFS_NAME_NGROUPS];
+	char *name = (char *)zfs_get_name(zhp);
+	int matched;
+
+	/* Only regfree() after a successful regcomp(); else re is undefined */
+	matched = (regcomp(&re, REGEX_ZFS_NAME, REG_EXTENDED) == 0);
+	if (matched) {
+		matched = (regexec(&re, name, REGEX_ZFS_NAME_NGROUPS,
+		    matches, 0) == 0);
+		regfree(&re);
+	}
+	if (!matched) {
+		zjni_throw_exception(env, "invalid name: %s", name);
+		return (-1);
+	}
+
+	/* Set names */
+	poolUTF = zjni_get_matched_string(
+	    env, name, matches + REGEX_ZFS_NAME_POOL_GROUP);
+	parentUTF = zjni_get_matched_string(
+	    env, name, matches + REGEX_ZFS_NAME_PARENT_GROUP);
+	baseUTF = zjni_get_matched_string(
+	    env, name, matches + REGEX_ZFS_NAME_BASE_GROUP);
+	if (poolUTF == NULL) {
+		poolUTF = baseUTF;
+	}
+
+	(*env)->CallVoidMethod(
+	    env, object->object, bean->method_setPoolName, poolUTF);
+	(*env)->CallVoidMethod(
+	    env, object->object, bean->method_setBaseName, baseUTF);
+
+	if (parentUTF != NULL) {
+		(*env)->CallVoidMethod(
+		    env, object->object, bean->method_setParentName, parentUTF);
+	}
+
+	properties = zjni_get_Dataset_properties(env, zhp);
+	if (properties == NULL) {
+		/* Must not call any more Java methods to preserve exception */
+		return (-1);
+	}
+
+	(*env)->CallVoidMethod(
+	    env, object->object, bean->method_setProperties, properties);
+
+	return (0);
+}
+
+/* A PoolBean is populated the same way as a FileSystemBean. */
+static int
+populate_PoolBean(JNIEnv *env, zfs_handle_t *zhp, PoolBean_t *bean)
+{
+	FileSystemBean_t *fs_bean = (FileSystemBean_t *)bean;
+
+	return (populate_FileSystemBean(env, zhp, fs_bean));
+}
+
+/* A FileSystemBean is populated through populate_DatasetBean(). */
+static int
+populate_FileSystemBean(JNIEnv *env, zfs_handle_t *zhp, FileSystemBean_t *bean)
+{
+	DatasetBean_t *dataset_bean = (DatasetBean_t *)bean;
+
+	return (populate_DatasetBean(env, zhp, dataset_bean));
+}
+
+/* A VolumeBean is populated through populate_DatasetBean(). */
+static int
+populate_VolumeBean(JNIEnv *env, zfs_handle_t *zhp, VolumeBean_t *bean)
+{
+	DatasetBean_t *dataset_bean = (DatasetBean_t *)bean;
+
+	return (populate_DatasetBean(env, zhp, dataset_bean));
+}
+
+/* A SnapshotBean is populated through populate_DatasetBean(). */
+static int
+populate_SnapshotBean(JNIEnv *env, zfs_handle_t *zhp, SnapshotBean_t *bean)
+{
+	DatasetBean_t *dataset_bean = (DatasetBean_t *)bean;
+
+	return (populate_DatasetBean(env, zhp, dataset_bean));
+}
+
+/* A FileSystemSnapshotBean is populated through populate_SnapshotBean(). */
+static int
+populate_FileSystemSnapshotBean(JNIEnv *env, zfs_handle_t *zhp,
+    FileSystemSnapshotBean_t *bean)
+{
+	SnapshotBean_t *snapshot_bean = (SnapshotBean_t *)bean;
+
+	return (populate_SnapshotBean(env, zhp, snapshot_bean));
+}
+
+/* A VolumeSnapshotBean is populated through populate_SnapshotBean(). */
+static int
+populate_VolumeSnapshotBean(JNIEnv *env, zfs_handle_t *zhp,
+    VolumeSnapshotBean_t *bean)
+{
+	SnapshotBean_t *snapshot_bean = (SnapshotBean_t *)bean;
+
+	return (populate_SnapshotBean(env, zhp, snapshot_bean));
+}
+
+/*
+ * Constructs a PoolBean and populates it from zhp.  Returns the Java
+ * object, or NULL if populate_PoolBean() reports failure.
+ */
+static jobject
+create_PoolBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	PoolBean_t pool = {0};
+
+	/* Construct PoolBean */
+	new_PoolBean(env, &pool);
+
+	if (populate_PoolBean(env, zhp, &pool) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&pool)->object);
+}
+
+/*
+ * Constructs a FileSystemBean and populates it from zhp.  Returns the
+ * Java object, or NULL if populate_FileSystemBean() reports failure.
+ */
+static jobject
+create_FileSystemBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	FileSystemBean_t fs = {0};
+
+	/* Construct FileSystemBean */
+	new_FileSystemBean(env, &fs);
+
+	if (populate_FileSystemBean(env, zhp, &fs) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&fs)->object);
+}
+
+/*
+ * Constructs a VolumeBean and populates it from zhp.  Returns the
+ * Java object, or NULL if populate_VolumeBean() reports failure.
+ */
+static jobject
+create_VolumeBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	VolumeBean_t volume = {0};
+
+	/* Construct VolumeBean */
+	new_VolumeBean(env, &volume);
+
+	if (populate_VolumeBean(env, zhp, &volume) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&volume)->object);
+}
+
+/*
+ * Constructs a FileSystemSnapshotBean and populates it from zhp.
+ * Returns the Java object, or NULL if
+ * populate_FileSystemSnapshotBean() reports failure.
+ */
+static jobject
+create_FileSystemSnapshotBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	FileSystemSnapshotBean_t snapshot = {0};
+
+	/* Construct FileSystemSnapshotBean */
+	new_FileSystemSnapshotBean(env, &snapshot);
+
+	if (populate_FileSystemSnapshotBean(env, zhp, &snapshot) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&snapshot)->object);
+}
+
+/*
+ * Constructs a VolumeSnapshotBean and populates it from zhp.  Returns
+ * the Java object, or NULL if populate_VolumeSnapshotBean() reports
+ * failure.
+ */
+static jobject
+create_VolumeSnapshotBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	VolumeSnapshotBean_t snapshot = {0};
+
+	/* Construct VolumeSnapshotBean */
+	new_VolumeSnapshotBean(env, &snapshot);
+
+	if (populate_VolumeSnapshotBean(env, zhp, &snapshot) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&snapshot)->object);
+}
+
+/*
+ * Creates the bean matching the type of the given dataset: a PoolBean
+ * or FileSystemBean for file systems, a VolumeBean for volumes, and a
+ * FileSystemSnapshotBean or VolumeSnapshotBean for snapshots.
+ *
+ * Returns NULL if the bean could not be created.  If the kind of a
+ * snapshot cannot be determined (is_fs_snapshot() returns -1),
+ * zjni_throw_exception() is called and NULL is returned, rather than
+ * treating the error value as "file system snapshot".
+ */
+static jobject
+create_DatasetBean(JNIEnv *env, zfs_handle_t *zhp)
+{
+	jobject object = NULL;
+
+	switch (zfs_get_type(zhp)) {
+	case ZFS_TYPE_FILESYSTEM:
+		object = is_pool(zhp) ?
+		    create_PoolBean(env, zhp) :
+		    create_FileSystemBean(env, zhp);
+		break;
+
+	case ZFS_TYPE_VOLUME:
+		object = create_VolumeBean(env, zhp);
+		break;
+
+	case ZFS_TYPE_SNAPSHOT:
+		/* is_fs_snapshot() is tri-state: 1, 0, or -1 on error */
+		switch (is_fs_snapshot(zhp)) {
+		case 1:
+			object = create_FileSystemSnapshotBean(env, zhp);
+			break;
+
+		case 0:
+			object = create_VolumeSnapshotBean(env, zhp);
+			break;
+
+		default:
+			zjni_throw_exception(env,
+			    "could not determine type of snapshot: %s",
+			    zfs_get_name(zhp));
+			break;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return (object);
+}
+
+/*
+ * Reports whether a snapshot was taken of a file system or of a
+ * volume, by opening the dataset the snapshot belongs to and checking
+ * its type.
+ *
+ * Returns:
+ *
+ *	1 if it is a file system snapshot
+ *	0 if it is a volume snapshot
+ *	-1 on error (zhp is not a snapshot, or its dataset could not
+ *	be opened)
+ */
+static int
+is_fs_snapshot(zfs_handle_t *zhp)
+{
+	char dataset[ZFS_MAXNAMELEN];
+	zfs_handle_t *dataset_zhp;
+	int result = -1;
+
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		zjni_get_dataset_from_snapshot(
+		    zfs_get_name(zhp), dataset, sizeof (dataset));
+
+		dataset_zhp = zfs_open(dataset, ZFS_TYPE_ANY);
+		if (dataset_zhp != NULL) {
+			result = (zfs_get_type(dataset_zhp) ==
+			    ZFS_TYPE_FILESYSTEM);
+			zfs_close(dataset_zhp);
+		}
+	}
+
+	return (result);
+}
+
+/*
+ * Returns non-zero if zhp is a file system whose name contains no '/'
+ * separator, zero otherwise.
+ */
+static int
+is_pool(zfs_handle_t *zhp)
+{
+	if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
+		return (0);
+	}
+
+	return (strchr(zfs_get_name(zhp), '/') == NULL);
+}
+
+/*
+ * Opens the dataset named by the given Java string with zfs_open().
+ *
+ * Returns the handle on success.  Returns NULL if nameUTF is NULL, if
+ * the string's characters could not be obtained, or if zfs_open()
+ * fails; in the last case zjni_throw_exception() is called with the
+ * offending name.
+ */
+static zfs_handle_t *
+open_device(JNIEnv *env, jstring nameUTF, zfs_type_t typemask)
+{
+	zfs_handle_t *zhp = NULL;
+
+	if (nameUTF != NULL) {
+		const char *name =
+		    (*env)->GetStringUTFChars(env, nameUTF, NULL);
+
+		/*
+		 * GetStringUTFChars() returns NULL when it fails; the
+		 * name must not be handed to zfs_open() in that case.
+		 */
+		if (name == NULL) {
+			return (NULL);
+		}
+
+		zhp = zfs_open(name, typemask);
+		if (zhp == NULL) {
+			zjni_throw_exception(env, "invalid device name: %s",
+			    name);
+		}
+
+		(*env)->ReleaseStringUTFChars(env, nameUTF, name);
+	}
+
+	return (zhp);
+}
+
+/*
+ * Package-private functions
+ */
+
+/*
+ * Callback function for zfs_iter_children().  Creates the appropriate
+ * Dataset and adds it to the given zjni_ArrayList.  Per the contract
+ * with zfs_iter_children(), calls zfs_close() on the given
+ * zfs_handle_t on every path, including when the dataset's type is
+ * filtered out by the typemask.
+ *
+ * data must point to a zjni_DatasetArrayCallbackData_t.
+ *
+ * Returns 0 if the dataset was added or skipped, -1 if the bean could
+ * not be created.
+ */
+int
+zjni_create_add_Dataset(zfs_handle_t *zhp, void *data)
+{
+	JNIEnv *env = ((zjni_ArrayCallbackData_t *)data)->env;
+	zjni_Collection_t *list = ((zjni_ArrayCallbackData_t *)data)->list;
+	zfs_type_t typemask =
+	    ((zjni_DatasetArrayCallbackData_t *)data)->typemask;
+	jobject bean;
+
+	/* Only add allowed types */
+	if (!(zfs_get_type(zhp) & typemask)) {
+		/* The handle is still ours to close */
+		zfs_close(zhp);
+		return (0);
+	}
+
+	bean = create_DatasetBean(env, zhp);
+	zfs_close(zhp);
+
+	if (bean == NULL) {
+		/*
+		 * Must not call any more Java methods to preserve
+		 * exception
+		 */
+		return (-1);
+	}
+
+	/* Add pool to zjni_ArrayList */
+	(*env)->CallBooleanMethod(env, ((zjni_Object_t *)list)->object,
+	    ((zjni_Collection_t *)list)->method_add, bean);
+
+	return (0);
+}
+
+/*
+ * Collects the children of the named parent dataset whose types match
+ * child_typemask and returns them as a Java array of arrayClass.
+ *
+ * Returns NULL if a Java exception is pending once the children have
+ * been gathered; zjni_throw_exception() is called here when the
+ * parent's type does not match parent_typemask.
+ */
+jobjectArray
+zjni_get_Datasets_below(JNIEnv *env, jstring parentUTF,
+    zfs_type_t parent_typemask, zfs_type_t child_typemask, char *arrayClass)
+{
+	zjni_DatasetSet_t set = {0};
+	zfs_handle_t *parent;
+
+	/* Create a set to hold the children */
+	zjni_new_DatasetSet(env, &set);
+
+	/* Retrieve parent */
+	parent = open_device(env, parentUTF, parent_typemask);
+	if (parent != NULL) {
+
+		if (zfs_get_type(parent) & parent_typemask) {
+			zjni_DatasetArrayCallbackData_t cb = {0};
+
+			cb.data.env = env;
+			cb.data.list = (zjni_Collection_t *)&set;
+			cb.typemask = child_typemask;
+
+			(void) zfs_iter_children(parent,
+			    zjni_create_add_Dataset, &cb);
+		} else {
+			zjni_throw_exception(env, "wrong type: %s",
+			    zfs_get_name(parent));
+		}
+
+		zfs_close(parent);
+	}
+
+	if ((*env)->ExceptionOccurred(env) != NULL) {
+		return (NULL);
+	}
+
+	return (zjni_Collection_to_array(
+	    env, (zjni_Collection_t *)&set, arrayClass));
+}
+
+/*
+ * For each path in the given array that can be opened, collects the
+ * dataset itself together with everything zfs_iter_dependents()
+ * reports for it, and returns the result as a Java Dataset array.
+ * Paths that cannot be opened are skipped.
+ *
+ * Returns NULL if a Java exception is pending at the end.
+ */
+jobjectArray
+zjni_get_Datasets_dependents(JNIEnv *env, jobjectArray paths)
+{
+	zjni_DatasetArrayCallbackData_t cb = {0};
+	zjni_DatasetSet_t set = {0};
+	jint count;
+	jint idx;
+
+	/* Create a set to hold the datasets */
+	zjni_new_DatasetSet(env, &set);
+
+	cb.data.env = env;
+	cb.data.list = (zjni_Collection_t *)&set;
+	cb.typemask = ZFS_TYPE_ANY;
+
+	count = (*env)->GetArrayLength(env, paths);
+	for (idx = 0; idx < count; idx++) {
+
+		jstring pathUTF = (jstring)
+		    ((*env)->GetObjectArrayElement(env, paths, idx));
+
+		zfs_handle_t *zhp = open_device(env, pathUTF, ZFS_TYPE_ANY);
+		if (zhp == NULL) {
+			/* Clear the exception */
+			(*env)->ExceptionClear(env);
+			continue;
+		}
+
+		/* Add all dependents of this Dataset to the list */
+		(void) zfs_iter_dependents(zhp, zjni_create_add_Dataset, &cb);
+
+		/* Add this Dataset to the list (and close zhp) */
+		(void) zjni_create_add_Dataset(zhp, &cb);
+	}
+
+	if ((*env)->ExceptionOccurred(env) != NULL) {
+		return (NULL);
+	}
+
+	return (zjni_Collection_to_array(env, (zjni_Collection_t *)&set,
+	    ZFSJNI_PACKAGE_DATA "Dataset"));
+}
+
+/*
+ * Looks up the Dataset with the given name.  Returns the bean created
+ * for it, or NULL if the dataset cannot be opened or its type is not
+ * in typemask.
+ */
+jobject
+zjni_get_Dataset(JNIEnv *env, jstring nameUTF, zfs_type_t typemask)
+{
+	jobject bean = NULL;
+	zfs_handle_t *zhp = open_device(env, nameUTF, typemask);
+
+	if (zhp == NULL) {
+		/*
+		 * Clear the exception -- this function returns NULL
+		 * on invalid device
+		 */
+		(*env)->ExceptionClear(env);
+		return (NULL);
+	}
+
+	/* Only build a bean when the device is of an expected type */
+	if (zfs_get_type(zhp) & typemask) {
+		bean = create_DatasetBean(env, zhp);
+	}
+
+	zfs_close(zhp);
+
+	return (bean);
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.h
new file mode 100644
index 000000000000..4476e0ed1a07
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_dataset.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_DATASET_H
+#define	_LIBZFS_JNI_DATASET_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <libzfs_jni_util.h>
+#include <libzfs.h>
+
+/*
+ * Types
+ */
+
+/*
+ * Callback data handed to zjni_create_add_Dataset().  The generic
+ * array callback data comes first, so a pointer to this structure can
+ * also be read as a zjni_ArrayCallbackData_t; typemask selects which
+ * dataset types the callback adds to the list.
+ */
+typedef struct zjni_DatasetArrayCallbackData {
+	zjni_ArrayCallbackData_t data;
+	zfs_type_t typemask;
+} zjni_DatasetArrayCallbackData_t;
+
+/*
+ * Function prototypes
+ */
+
+jobjectArray zjni_get_Datasets_below(JNIEnv *, jstring,
+    zfs_type_t, zfs_type_t, char *);
+jobjectArray zjni_get_Datasets_dependents(JNIEnv *, jobjectArray);
+jobject zjni_get_Dataset(JNIEnv *, jstring, zfs_type_t);
+int zjni_create_add_Dataset(zfs_handle_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_DATASET_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.c
new file mode 100644
index 000000000000..218f3d8226f0
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.c
@@ -0,0 +1,198 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_disk.h"
+
+/*
+ * Function prototypes
+ */
+
+static jobject create_DiskDeviceBean(JNIEnv *, dmgt_disk_t *);
+static jobject get_SliceUsage_Use(JNIEnv *, char *);
+static jobject create_SliceUsage(JNIEnv *env, dmgt_slice_t *sp);
+static jobject create_SliceDeviceBean(JNIEnv *env, dmgt_slice_t *sp);
+static jobjectArray create_SliceDeviceBean_array(JNIEnv *, dmgt_slice_t **);
+
+/*
+ * Static functions
+ */
+
+/*
+ * Builds a Java DiskDeviceBean from the given disk: its size, name,
+ * aliases and slices.  Returns NULL if either the alias array or the
+ * slice array could not be created.
+ */
+static jobject
+create_DiskDeviceBean(JNIEnv *env, dmgt_disk_t *dp)
+{
+	jobjectArray alias_array;
+	jobjectArray slice_array;
+	jstring nameUTF;
+	jclass bean_class;
+	jmethodID ctor;
+
+	int naliases = zjni_count_elements((void **)dp->aliases);
+
+	alias_array = zjni_string_array_to_String_array(
+	    env, dp->aliases, naliases);
+	if (alias_array == NULL) {
+		return (NULL);
+	}
+
+	slice_array = create_SliceDeviceBean_array(env, dp->slices);
+	if (slice_array == NULL) {
+		return (NULL);
+	}
+
+	nameUTF = (*env)->NewStringUTF(env, dp->name);
+
+	bean_class = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "DiskDeviceBean");
+
+	ctor = (*env)->GetMethodID(env, bean_class, "<init>",
+	    "(JLjava/lang/String;[Ljava/lang/String;[L"
+	    ZFSJNI_PACKAGE_DATA "SliceDeviceBean;)V");
+
+	return ((*env)->NewObject(env, bean_class, ctor,
+	    dp->size, nameUTF, alias_array, slice_array));
+}
+
+/*
+ * Maps a usage name to the SliceUsage$Use enum constant of the same
+ * name by looking it up as a static field of that class.
+ *
+ * Returns the enum value, or NULL if dm_usage is NULL or names no
+ * such constant.  An unknown usage is not treated as an error: the
+ * exception raised by the failed field lookup is cleared so that the
+ * caller can keep making JNI calls.
+ */
+static jobject
+get_SliceUsage_Use(JNIEnv *env, char *dm_usage)
+{
+	jobject enumVal = NULL;
+
+	if (dm_usage != NULL) {
+		jclass class_SliceUsage_Use = (*env)->FindClass(
+		    env, ZFSJNI_PACKAGE_DATA "SliceUsage$Use");
+
+		jfieldID id = (*env)->GetStaticFieldID(env,
+		    class_SliceUsage_Use,
+		    dm_usage, "L" ZFSJNI_PACKAGE_DATA "SliceUsage$Use;");
+
+		if (id != NULL) {
+			/* Retrieve the proper SliceUsage$Use enum value */
+			enumVal = (*env)->GetStaticObjectField(
+			    env, class_SliceUsage_Use, id);
+		} else {
+			/*
+			 * A failed GetStaticFieldID() leaves an exception
+			 * pending; discard it before returning.
+			 */
+			(*env)->ExceptionClear(env);
+#ifdef	DEBUG
+			(void) fprintf(stderr, "Unknown slice usage: %s\n",
+			    dm_usage);
+#endif /* DEBUG */
+		}
+	}
+
+	return (enumVal);
+}
+
+/*
+ * Builds a Java SliceUsage describing how the given slice is used.
+ * Returns NULL if the slice has no used_name or if that name does not
+ * map to a SliceUsage$Use value.
+ */
+static jobject
+create_SliceUsage(JNIEnv *env, dmgt_slice_t *sp)
+{
+	jobject use;
+	jstring usedByUTF;
+	jclass usage_class;
+	jmethodID ctor;
+
+	if (sp->used_name == NULL) {
+		return (NULL);
+	}
+
+	use = get_SliceUsage_Use(env, sp->used_name);
+	if (use == NULL) {
+		return (NULL);
+	}
+
+	usedByUTF = (*env)->NewStringUTF(env, sp->used_by);
+
+	usage_class = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "SliceUsage");
+
+	ctor = (*env)->GetMethodID(env, usage_class, "<init>",
+	    "(L" ZFSJNI_PACKAGE_DATA
+	    "SliceUsage$Use;Ljava/lang/String;)V");
+
+	return ((*env)->NewObject(env, usage_class, ctor, use, usedByUTF));
+}
+
+/*
+ * Builds a Java SliceDeviceBean from the given slice: its size, name,
+ * start offset and usage.
+ */
+static jobject
+create_SliceDeviceBean(JNIEnv *env, dmgt_slice_t *sp)
+{
+	/* May be NULL if unused */
+	jobject usage = create_SliceUsage(env, sp);
+
+	jstring nameUTF = (*env)->NewStringUTF(env, sp->name);
+
+	jclass bean_class = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "SliceDeviceBean");
+
+	jmethodID ctor = (*env)->GetMethodID(env, bean_class, "<init>",
+	    "(JLjava/lang/String;JL" ZFSJNI_PACKAGE_DATA "SliceUsage;)V");
+
+	return ((*env)->NewObject(env, bean_class, ctor,
+	    sp->size, nameUTF, sp->start, usage));
+}
+
+/*
+ * Converts a NULL-terminated array of slices (which may itself be
+ * NULL) into a Java SliceDeviceBean array.  Slices for which no bean
+ * could be created are left out.
+ */
+static jobjectArray
+create_SliceDeviceBean_array(JNIEnv *env, dmgt_slice_t **slices)
+{
+	zjni_ArrayList_t list = {0};
+	dmgt_slice_t **cursor;
+
+	/* Create an array list */
+	zjni_new_ArrayList(env, &list);
+
+	for (cursor = slices; cursor != NULL && *cursor != NULL; cursor++) {
+		jobject bean = create_SliceDeviceBean(env, *cursor);
+
+		if (bean != NULL) {
+			(*env)->CallBooleanMethod(env,
+			    ((zjni_Object_t *)&list)->object,
+			    ((zjni_Collection_t *)&list)->method_add, bean);
+		}
+	}
+
+	return (zjni_Collection_to_array(
+	    env, (zjni_Collection_t *)&list,
+	    ZFSJNI_PACKAGE_DATA "SliceDeviceBean"));
+}
+
+/*
+ * Package-private functions
+ */
+
+/*
+ * Creates a DiskDeviceBean for the given disk and adds it to the
+ * collection carried in data, which must point to a
+ * zjni_ArrayCallbackData_t.
+ *
+ * Returns 0 on success, or -1 if the bean could not be created.  In
+ * the failure case nothing is added and no further Java methods are
+ * called, in line with zjni_create_add_Dataset().
+ */
+int
+zjni_create_add_DiskDevice(dmgt_disk_t *dp, void *data)
+{
+	JNIEnv *env = ((zjni_ArrayCallbackData_t *)data)->env;
+	zjni_Collection_t *list = ((zjni_ArrayCallbackData_t *)data)->list;
+	jobject disk = create_DiskDeviceBean(env, dp);
+
+	if (disk == NULL) {
+		/* Must not call any more Java methods to preserve exception */
+		return (-1);
+	}
+
+	/* Add disk to zjni_ArrayList */
+	(*env)->CallBooleanMethod(env, ((zjni_Object_t *)list)->object,
+	    ((zjni_Collection_t *)list)->method_add, disk);
+
+	return (0);
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.h
new file mode 100644
index 000000000000..a05efc13b841
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_disk.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_DISK_H
+#define	_LIBZFS_JNI_DISK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <libzfs_jni_util.h>
+#include <libzfs_jni_diskmgt.h>
+
+/*
+ * Function prototypes
+ */
+
+int zjni_create_add_DiskDevice(dmgt_disk_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_DISK_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.c
new file mode 100644
index 000000000000..ded27aaf3992
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.c
@@ -0,0 +1,764 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_diskmgt.h"
+#include "libzfs_jni_util.h"
+#include <strings.h>
+#include <libzfs.h>
+#include <sys/mnttab.h>
+
+/*
+ * Constants
+ */
+
+#define	DISK_IN_USE	12345
+
+/*
+ * Function prototypes
+ */
+
+static void free_slice_array(dmgt_slice_t **slices);
+static char *get_device_name(dm_descriptor_t device, int *error);
+static dmgt_disk_t *get_disk(dm_descriptor_t disk, int *error);
+static char **get_disk_aliases(dm_descriptor_t disk, char *name, int *error);
+static int get_disk_online(dm_descriptor_t disk, int *error);
+static void remove_slice_from_list(dmgt_slice_t **slices, int index);
+static dmgt_slice_t **get_disk_slices(dm_descriptor_t media,
+    const char *name, uint32_t blocksize, int *error);
+static dmgt_slice_t **get_disk_usable_slices(dm_descriptor_t media,
+    const char *name, uint32_t blocksize, int *in_use, int *error);
+static void get_disk_size(dm_descriptor_t media, char *name,
+    uint64_t *size, uint32_t *blocksize, int *error);
+static void get_slice_use(dm_descriptor_t slice, char *name,
+    char **used_name, char **used_by, int *error);
+static dmgt_slice_t *get_slice(
+    dm_descriptor_t slice, uint32_t blocksize, int *error);
+static void handle_error(const char *format, ...);
+static int slice_in_use(dmgt_slice_t *slice);
+static int slice_too_small(dmgt_slice_t *slice);
+
+/*
+ * Static data
+ */
+
+static void (*error_func)(const char *, va_list);
+
+/*
+ * Static functions
+ */
+
+/*
+ * Frees every slice in a NULL-terminated array, then the array
+ * itself.  A NULL array is ignored.
+ */
+static void
+free_slice_array(dmgt_slice_t **slices)
+{
+	dmgt_slice_t **cursor;
+
+	if (slices == NULL) {
+		return;
+	}
+
+	for (cursor = slices; *cursor != NULL; cursor++) {
+		dmgt_free_slice(*cursor);
+	}
+
+	free(slices);
+}
+
+/*
+ * Returns a newly allocated copy of the name of the given device,
+ * which the caller must free.
+ *
+ * On success *error is 0.  On failure NULL is returned and *error is
+ * non-zero: either the value set by dm_get_name(), or -1 if the copy
+ * could not be allocated.
+ */
+static char *
+get_device_name(dm_descriptor_t device, int *error)
+{
+	/* Must start as NULL: the dm_get_name() failure path never sets it */
+	char *dup = NULL;
+	char *name;
+
+	*error = 0;
+	name = dm_get_name(device, error);
+	if (*error) {
+		handle_error("could not determine name of device");
+	} else {
+		dup = strdup(name);
+		if (dup == NULL) {
+			handle_error("out of memory");
+			*error = -1;
+		}
+
+		dm_free_name(name);
+	}
+
+	return (dup);
+}
+
+/*
+ * Gets a dmgt_disk_t for the given disk dm_descriptor_t.
+ *
+ * Results:
+ *
+ *  1. Success: error is set to 0 and a dmgt_disk_t is returned
+ *
+ *  2. Failure: error is set to -1 and NULL is returned
+ *
+ *  3. In use: error is set to DISK_IN_USE and NULL is returned if
+ *     get_disk_usable_slices() flags the disk as in use
+ */
+static dmgt_disk_t *
+get_disk(dm_descriptor_t disk, int *error)
+{
+	dmgt_disk_t *dp;
+	*error = 0;
+
+	dp = (dmgt_disk_t *)calloc(1, sizeof (dmgt_disk_t));
+	if (dp == NULL) {
+		handle_error("out of memory");
+		*error = -1;
+	} else {
+
+		/* Get name */
+		dp->name = get_device_name(disk, error);
+		if (!*error) {
+
+			/* Get aliases */
+			dp->aliases = get_disk_aliases(disk, dp->name, error);
+			if (!*error) {
+
+				/* Get media */
+				dm_descriptor_t *media =
+				    dm_get_associated_descriptors(disk,
+					DM_MEDIA, error);
+				if (*error != 0 || media == NULL ||
+				    *media == NULL) {
+					handle_error(
+					    "could not get media from disk %s",
+					    dp->name);
+					*error = -1;
+				} else {
+					/* Get size */
+					get_disk_size(media[0], dp->name,
+					    &(dp->size), &(dp->blocksize),
+					    error);
+					if (!*error) {
+						/* Get free slices */
+						dp->slices =
+						    get_disk_usable_slices(
+							media[0], dp->name,
+							dp->blocksize,
+							&(dp->in_use), error);
+
+						/*
+						 * If this disk is flagged
+						 * as in use...
+						 */
+						if (dp->in_use) {
+							*error = DISK_IN_USE;
+						}
+					}
+				}
+
+				/*
+				 * Release the descriptor list on every
+				 * path, including the one where the list
+				 * was returned but is empty.
+				 */
+				if (media != NULL) {
+					dm_free_descriptors(media);
+				}
+			}
+		}
+	}
+
+	if (*error) {
+		if (*error != DISK_IN_USE) {
+			/* Normalize error */
+			*error = -1;
+		}
+
+		if (dp != NULL) {
+			dmgt_free_disk(dp);
+			dp = NULL;
+		}
+	}
+
+	return (dp);
+}
+
+/*
+ * Gets the aliases of the given disk as a NULL-terminated array of
+ * newly allocated strings.  name is used only in error messages.
+ *
+ * On success *error is 0 and the array is returned.  On failure
+ * *error is non-zero, anything allocated here is freed, and NULL is
+ * returned, so the caller never holds a pointer to freed memory.
+ */
+static char **
+get_disk_aliases(dm_descriptor_t disk, char *name, int *error)
+{
+	char **names = NULL;
+	dm_descriptor_t *aliases;
+
+	*error = 0;
+	aliases = dm_get_associated_descriptors(disk, DM_ALIAS, error);
+	if (*error || aliases == NULL) {
+		*error = -1;
+		handle_error("could not get aliases for disk %s", name);
+	} else {
+
+		int j;
+
+		/* Count aliases */
+		for (j = 0; aliases[j] != NULL; j++)
+			continue;
+
+		/* One extra, zeroed entry acts as the terminator */
+		names = (char **)calloc(j + 1, sizeof (char *));
+		if (names == NULL) {
+			*error = -1;
+			handle_error("out of memory");
+		} else {
+
+			/* For each alias... */
+			for (j = 0; *error == 0 && aliases[j] != NULL; j++) {
+
+				dm_descriptor_t alias = aliases[j];
+				char *aname = dm_get_name(alias, error);
+				if (*error) {
+					handle_error("could not get alias %d "
+					    "for disk %s", (j + 1), name);
+				} else {
+					names[j] = strdup(aname);
+					if (names[j] == NULL) {
+						*error = -1;
+						handle_error("out of memory");
+					}
+
+					dm_free_name(aname);
+				}
+			}
+		}
+
+		dm_free_descriptors(aliases);
+	}
+
+	if (*error && names != NULL) {
+		int i;
+		/* Free previously-allocated names */
+		for (i = 0; names[i] != NULL; i++) {
+			free(names[i]);
+		}
+		free(names);
+
+		/* Do not hand the freed array back to the caller */
+		names = NULL;
+	}
+
+	return (names);
+}
+
+/*
+ * Reads the DM_STATUS attribute of the given disk and returns
+ * non-zero if it is non-zero.  *error is set to 0 on success and to
+ * a non-zero value if the attributes or the status are unavailable.
+ */
+static int
+get_disk_online(dm_descriptor_t disk, int *error)
+{
+	uint32_t status = 0;
+	nvlist_t *attrs;
+	nvpair_t *match;
+
+	*error = 0;
+	attrs = dm_get_attributes(disk, error);
+	if (*error) {
+		handle_error("could not get disk attributes for disk");
+		return (0);
+	}
+
+	/* Try to get the status */
+	match = zjni_nvlist_walk_nvpair(
+	    attrs, DM_STATUS, DATA_TYPE_UINT32, NULL);
+
+	if (match == NULL || nvpair_value_uint32(match, &status)) {
+		handle_error("could not get status of disk");
+		*error = 1;
+	}
+
+	nvlist_free(attrs);
+
+	return (status != 0);
+}
+
+/*
+ * Gets the slices for the given disk as a NULL-terminated array.
+ *
+ * Results:
+ *
+ *  1. Success: error is set to 0 and slices are returned (NULL if
+ *     the media has no slices)
+ *
+ *  2. Failure: error is set to -1, everything collected so far is
+ *     freed, and NULL is returned
+ */
+static dmgt_slice_t **
+get_disk_slices(dm_descriptor_t media, const char *name, uint32_t blocksize,
+    int *error)
+{
+	dm_descriptor_t *slices;
+	dmgt_slice_t **sap = NULL;
+
+	*error = 0;
+	slices = dm_get_associated_descriptors(media, DM_SLICE, error);
+	if (*error != 0) {
+		handle_error("could not get slices of disk %s", name);
+	} else {
+		int j;
+		int nslices = 0;
+
+		/* For each slice... */
+		for (j = 0; *error == 0 &&
+		    slices != NULL && slices[j] != NULL; j++) {
+
+			/* Get slice */
+			dmgt_slice_t *slice =
+			    get_slice(slices[j], blocksize, error);
+			if (!*error) {
+
+				/*
+				 * Grow through a temporary so that the
+				 * existing array survives a failed
+				 * realloc() and can still be freed below.
+				 */
+				dmgt_slice_t **grown =
+				    (dmgt_slice_t **)realloc(sap,
+				    (nslices + 2) * sizeof (dmgt_slice_t *));
+				if (grown == NULL) {
+					handle_error("out of memory");
+					*error = -1;
+
+					/* Not in the array yet */
+					dmgt_free_slice(slice);
+				} else {
+					sap = grown;
+
+					/* NULL-terminated array */
+					sap[nslices] = slice;
+					sap[nslices + 1] = NULL;
+
+					nslices++;
+				}
+			}
+		}
+
+		dm_free_descriptors(slices);
+	}
+
+	if (*error) {
+		/* Normalize error */
+		*error = -1;
+	}
+
+	if (*error && sap != NULL) {
+		free_slice_array(sap);
+		sap = NULL;
+	}
+
+	return (sap);
+}
+
+/*
+ * Removes the entry at the given index from a NULL-terminated slice
+ * array by shifting every later entry, terminator included, down one
+ * position.  The removed slice itself is not freed.
+ */
+static void
+remove_slice_from_list(dmgt_slice_t **slices, int index)
+{
+	int pos = index;
+
+	while (slices[pos] != NULL) {
+		slices[pos] = slices[pos + 1];
+		pos++;
+	}
+}
+
+/*
+ * Returns non-zero if the byte ranges covered by the two slices
+ * intersect.  Each range is [start, start + size - 1], with both
+ * ends inclusive.
+ */
+static int
+slices_overlap(dmgt_slice_t *slice1, dmgt_slice_t *slice2)
+{
+
+	uint64_t start1 = slice1->start;
+	uint64_t end1 = start1 + slice1->size - 1;
+	uint64_t start2 = slice2->start;
+	/* Last byte of slice2, computed the same way as end1 */
+	uint64_t end2 = start2 + slice2->size - 1;
+
+	int overlap = (start2 <= end1 && start1 <= end2);
+
+#ifdef DEBUG
+	if (overlap) {
+		(void) fprintf(stderr, "can't use %s: overlaps with %s\n",
+		    slice2->name, slice1->name);
+		(void) fprintf(stderr, "  1: start: %llu - %llu\n",
+		    (unsigned long long)start1, (unsigned long long)end1);
+		(void) fprintf(stderr, "  2: start: %llu - %llu\n",
+		    (unsigned long long)start2, (unsigned long long)end2);
+	}
+#endif
+
+	return (overlap);
+}
+
+/*
+ * Gets the slices of the given disk, then prunes those that cannot be
+ * used: slices that are in use, slices overlapping an in-use slice,
+ * and slices that are too small.  Pruned slices are freed.
+ *
+ * *in_use is set to 1 if at least one slice was found to be in use,
+ * 0 otherwise.
+ *
+ * Results:
+ *
+ *  1. Success: error is set to 0 and the remaining slices are
+ *     returned as a NULL-terminated array
+ *
+ *  2. Failure: error is set to -1 and NULL is returned
+ */
+static dmgt_slice_t **
+get_disk_usable_slices(dm_descriptor_t media, const char *name,
+    uint32_t blocksize, int *in_use, int *error)
+{
+	dmgt_slice_t **slices = get_disk_slices(media, name, blocksize, error);
+
+	*in_use = 0;
+
+	if (!*error && slices != NULL) {
+		int i, nslices;
+
+		for (nslices = 0; slices[nslices] != NULL; nslices++)
+			continue;
+
+		/*
+		 * Prune slices based on use.  Entries past the current
+		 * terminator are NULL after a removal, hence the NULL
+		 * checks while walking the original index range.
+		 */
+		for (i = nslices - 1; i >= 0; i--) {
+			dmgt_slice_t *slice = slices[i];
+			if (slice == NULL) {
+				continue;
+			}
+
+			if (slice_in_use(slice)) {
+				int j;
+				remove_slice_from_list(slices, i);
+
+				*in_use = 1;
+
+				/*
+				 * Remove any slice that overlaps with this
+				 * in-use slice
+				 */
+				for (j = nslices - 1; j >= 0; j--) {
+					dmgt_slice_t *other = slices[j];
+					if (other == NULL) {
+						continue;
+					}
+					if (slices_overlap(slice, other)) {
+						remove_slice_from_list(slices,
+						    j);
+						dmgt_free_slice(other);
+					}
+				}
+
+				/* No longer needed for comparisons */
+				dmgt_free_slice(slice);
+			} else if (slice_too_small(slice)) {
+				remove_slice_from_list(slices, i);
+				dmgt_free_slice(slice);
+			}
+		}
+	}
+
+	return (slices);
+}
+
+/*
+ * Determines the size in bytes and the block size of the given media.
+ * The block count comes from DM_NACCESSIBLE, falling back to DM_SIZE,
+ * and is multiplied by DM_BLOCKSIZE.  name is used only in error
+ * messages.
+ *
+ * *error is set to 0 on success and to a non-zero value on failure.
+ */
+static void
+get_disk_size(dm_descriptor_t media, char *name, uint64_t *size,
+    uint32_t *blocksize, int *error)
+{
+	nvlist_t *media_attrs;
+	nvpair_t *pair;
+
+	*size = 0;
+	*error = 0;
+
+	media_attrs = dm_get_attributes(media, error);
+	if (*error) {
+		handle_error("could not get media attributes from disk: %s",
+		    name);
+		return;
+	}
+
+	/* Try to get the number of accessible blocks */
+	pair = zjni_nvlist_walk_nvpair(
+	    media_attrs, DM_NACCESSIBLE, DATA_TYPE_UINT64, NULL);
+	if (pair == NULL || nvpair_value_uint64(pair, size)) {
+
+		/* Disk is probably not labeled, get raw size instead */
+		pair = zjni_nvlist_walk_nvpair(
+		    media_attrs, DM_SIZE, DATA_TYPE_UINT64, NULL);
+		if (pair == NULL || nvpair_value_uint64(pair, size)) {
+			handle_error("could not get size of disk: %s", name);
+			*error = 1;
+		}
+	}
+
+	if (*error == 0) {
+		pair = zjni_nvlist_walk_nvpair(
+		    media_attrs, DM_BLOCKSIZE, DATA_TYPE_UINT32, NULL);
+		if (pair != NULL && nvpair_value_uint32(pair, blocksize) == 0) {
+			/* Convert the block count to bytes */
+			*size *= *blocksize;
+		} else {
+			handle_error("could not get "
+			    "block size of disk: %s", name);
+			*error = 1;
+		}
+	}
+
+	nvlist_free(media_attrs);
+}
+
+/*
+ * Looks up how the given slice is used.  When the usage statistics
+ * were obtained, *used_name receives a copy of the DM_USED_BY value
+ * and *used_by a copy of the DM_USED_NAME value, or NULL where the
+ * value is absent.  name is used only in error messages.
+ *
+ * *error is set to a non-zero value if the statistics cannot be
+ * obtained or a copy cannot be allocated; it is not reset here.
+ */
+static void
+get_slice_use(dm_descriptor_t slice, char *name, char **used_name,
+    char **used_by, int *error)
+{
+	char *value;
+	nvpair_t *pair;
+
+	/* Get slice use statistics */
+	nvlist_t *stats = dm_get_stats(slice, DM_SLICE_STAT_USE, error);
+	if (*error != 0) {
+		handle_error("could not get stats of slice %s", name);
+		return;
+	}
+
+	*used_name = NULL;
+	*used_by = NULL;
+
+	if (stats == NULL) {
+		return;
+	}
+
+	/* Get the type of usage for this slice */
+	pair = zjni_nvlist_walk_nvpair(
+	    stats, DM_USED_BY, DATA_TYPE_STRING, NULL);
+
+	if (pair != NULL && nvpair_value_string(pair, &value) == 0) {
+
+		*used_name = strdup(value);
+		if (*used_name == NULL) {
+			*error = -1;
+			handle_error("out of memory");
+		} else {
+
+			/* Get the object using this slice */
+			pair = zjni_nvlist_walk_nvpair(stats,
+			    DM_USED_NAME, DATA_TYPE_STRING, NULL);
+
+			if (pair != NULL &&
+			    nvpair_value_string(pair, &value) == 0) {
+				*used_by = strdup(value);
+				if (*used_by == NULL) {
+					*error = -1;
+					handle_error("out of memory");
+				}
+			}
+		}
+	}
+
+	nvlist_free(stats);
+}
+
+/*
+ * Gets a dmgt_slice_t for the given slice descriptor: its name, its
+ * size and start offset converted from blocks to bytes using
+ * blocksize, and its usage.
+ *
+ * On success *error is 0 and the slice is returned.  On failure
+ * *error is non-zero, the partially built slice is freed, and NULL is
+ * returned.
+ */
+static dmgt_slice_t *
+get_slice(dm_descriptor_t slice, uint32_t blocksize, int *error)
+{
+	dmgt_slice_t *sp;
+	*error = 0;
+	sp = (dmgt_slice_t *)calloc(1, sizeof (dmgt_slice_t));
+	if (sp == NULL) {
+		*error = -1;
+		handle_error("out of memory");
+	} else {
+
+		/* Get name */
+		sp->name = get_device_name(slice, error);
+		if (!*error) {
+
+			nvlist_t *attrs = dm_get_attributes(slice, error);
+			if (*error) {
+				handle_error("could not get "
+				    "attributes from slice: %s", sp->name);
+			} else {
+				/* Get the size in blocks */
+				nvpair_t *match = zjni_nvlist_walk_nvpair(
+				    attrs, DM_SIZE, DATA_TYPE_UINT64, NULL);
+				uint64_t size_blocks;
+
+				sp->size = 0;
+
+				if (match == NULL ||
+				    nvpair_value_uint64(match, &size_blocks)) {
+					handle_error("could not get "
+					    "size of slice: %s", sp->name);
+					*error = 1;
+				} else {
+					uint64_t start_blocks;
+
+					/* Convert to bytes */
+					sp->size = blocksize * size_blocks;
+
+					/* Get the starting block */
+					match = zjni_nvlist_walk_nvpair(
+					    attrs, DM_START, DATA_TYPE_UINT64,
+					    NULL);
+
+					if (match == NULL ||
+					    nvpair_value_uint64(match,
+					    &start_blocks)) {
+						handle_error(
+						    "could not get "
+						    "start block of slice: %s",
+						    sp->name);
+						*error = 1;
+					} else {
+						/* Convert to bytes */
+						sp->start =
+						    blocksize * start_blocks;
+
+						/* Set slice use */
+						get_slice_use(slice, sp->name,
+						    &(sp->used_name),
+						    &(sp->used_by), error);
+					}
+				}
+
+				/*
+				 * Release the attribute list, as the other
+				 * dm_get_attributes() callers in this file
+				 * do.
+				 */
+				nvlist_free(attrs);
+			}
+		}
+	}
+
+	if (*error && sp != NULL) {
+		dmgt_free_slice(sp);
+
+		/* Do not hand the freed slice back to the caller */
+		sp = NULL;
+	}
+
+	return (sp);
+}
+
+/* Passes the format and its arguments to the registered handler, if any */
+static void
+handle_error(const char *format, ...)
+{
+	va_list args;
+	if (error_func == NULL) {
+		return;
+	}
+	va_start(args, format);
+	error_func(format, args);
+	va_end(args);
+}
+
+/* Is the slice smaller than SPA_MINDEVSIZE?  Remove once 6285992 is fixed */
+static int
+slice_too_small(dmgt_slice_t *slice)
+{
+	int too_small = (slice->size < SPA_MINDEVSIZE);
+
+#ifdef DEBUG
+	if (too_small) {
+		(void) fprintf(stderr, "can't use %s: slice too small: %llu\n",
+		    slice->name, (unsigned long long)slice->size);
+	}
+#endif
+
+	return (too_small);
+}
+
+/*
+ * Returns nonzero if the slice's recorded use rules it out for ZFS.
+ * used_by can be NULL even when used_name is set, so it is checked before
+ * being compared or printed.  Should go away once 6285992 is fixed.
+ */
+static int
+slice_in_use(dmgt_slice_t *slice)
+{
+	const char *use = slice->used_name;
+	const char *user = slice->used_by;
+	int in_use;
+
+	if (use == NULL) {
+		return (0);
+	}
+	if (strcmp(DM_USE_FS, use) == 0) {
+		/*
+		 * An unmounted file system: allow only if it is known not
+		 * to be ZFS.  An unidentified file system stays in use.
+		 */
+		in_use = (user == NULL || strcmp(user, "zfs") == 0);
+	} else {
+		/* Uses that don't preclude slice from use by ZFS */
+		in_use = !(strcmp(DM_USE_SVM, use) == 0 ||
+		    strcmp(DM_USE_VXVM, use) == 0 ||
+		    strcmp(DM_USE_LU, use) == 0);
+	}
+
+#ifdef DEBUG
+	if (in_use) {
+		(void) fprintf(stderr,
+		    "can't use %s: used name: %s: used by: %s\n",
+		    slice->name, use, user != NULL ? user : "(unknown)");
+	}
+#endif
+
+	return (in_use);
+}
+
+/*
+ * Extern functions
+ */
+
+/*
+ * Walks the fixed disks known to libdiskmgt.  Each disk that is online
+ * and not reported as DISK_IN_USE is passed to func along with data, then
+ * freed.  Stops at the first error; a nonzero return from func becomes -1.
+ * Returns 0 on success, nonzero on error.
+ */
+int
+dmgt_avail_disk_iter(dmgt_disk_iter_f func, void *data)
+{
+	int filter[] = { DM_DT_FIXED, -1 };
+	int error = 0;
+	int idx = 0;
+	dm_descriptor_t *disks;
+
+	/* Search for fixed disks */
+	disks = dm_get_descriptors(DM_DRIVE, filter, &error);
+	if (error) {
+		handle_error("unable to communicate with libdiskmgt");
+		return (error);
+	}
+
+	while (disks != NULL && error == 0 && disks[idx] != NULL) {
+		dm_descriptor_t disk = (dm_descriptor_t)disks[idx++];
+		dmgt_disk_t *dp;
+
+		/* Is this disk online? */
+		if (!get_disk_online(disk, &error) || error) {
+			continue;
+		}
+
+		dp = get_disk(disk, &error);
+		if (error == DISK_IN_USE) {
+			error = 0;	/* skipped, not a failure */
+		} else if (!error) {
+			/* Run the given function */
+			error = func(dp, data) ? -1 : 0;
+			dmgt_free_disk(dp);
+		}
+	}
+
+	dm_free_descriptors(disks);
+	return (error);
+}
+
+/* Frees a disk and everything hanging off it; a NULL disk is ignored */
+void
+dmgt_free_disk(dmgt_disk_t *disk)
+{
+	char **alias;
+
+	if (disk == NULL) {
+		return;
+	}
+	/* aliases, when present, is a NULL-terminated array of strings */
+	for (alias = disk->aliases; alias != NULL && *alias != NULL; alias++) {
+		free(*alias);
+	}
+	free(disk->aliases);
+	free(disk->name);
+	free_slice_array(disk->slices);
+	free(disk);
+}
+
+void
+dmgt_free_slice(dmgt_slice_t *slice)
+{
+	if (slice != NULL) {	/* a NULL slice is accepted and ignored */
+		free(slice->name);
+		free(slice->used_name);	/* may be NULL; free() allows it */
+		free(slice->used_by);	/* likewise */
+		free(slice);
+	}
+}
+
+/*
+ * Installs func as the sink for this file's error messages; NULL mutes them.
+ */
+void
+dmgt_set_error_handler(void (*func)(const char *, va_list))
+{
+	error_func = func;
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.h
new file mode 100644
index 000000000000..0d10909eaa6c
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_diskmgt.h
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_DISKMGT_H
+#define	_LIBZFS_JNI_DISKMGT_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <libdiskmgt.h>
+#include <sys/varargs.h>
+
+/*
+ * Types
+ */
+
+typedef struct dmgt_slice {
+	char *name;		/* slice device name */
+	uint64_t start;		/* offset of slice start, in bytes */
+	uint64_t size;		/* length of slice, in bytes */
+	char *used_name;	/* copy of the DM_USED_BY value, or NULL */
+	char *used_by;		/* copy of the DM_USED_NAME value, or NULL */
+} dmgt_slice_t;
+
+typedef struct dmgt_disk {
+	char *name;		/* released by dmgt_free_disk() */
+	uint64_t size;		/* presumably in bytes -- TODO confirm */
+	uint32_t blocksize;	/* presumably bytes per block -- TODO confirm */
+	int in_use;		/* presumably nonzero if in use -- TODO confirm */
+
+	/* NULL-terminated array; each string is freed by dmgt_free_disk() */
+	char **aliases;
+
+	/* NULL-terminated array; released through free_slice_array() */
+	dmgt_slice_t **slices;
+} dmgt_disk_t;
+
+/* Disk iteration callback; a nonzero return stops dmgt_avail_disk_iter() */
+typedef int (*dmgt_disk_iter_f)(dmgt_disk_t *, void *);
+
+/*
+ * Function prototypes
+ */
+
+extern int dmgt_avail_disk_iter(dmgt_disk_iter_f func, void *data);
+extern void dmgt_free_disk(dmgt_disk_t *);
+extern void dmgt_free_slice(dmgt_slice_t *);
+extern void dmgt_set_error_handler(void (*)(const char *, va_list));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_DISKMGT_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_main.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_main.c
new file mode 100644
index 000000000000..b6c78644a280
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_main.c
@@ -0,0 +1,485 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_main.h"
+#include "libzfs_jni_util.h"
+#include "libzfs_jni_dataset.h"
+#include "libzfs_jni_property.h"
+#include "libzfs_jni_pool.h"
+#include "libzfs_jni_diskmgt.h"
+#include "libzfs_jni_disk.h"
+
+/*
+ * Function prototypes
+ */
+
+static void handle_error(const char *, va_list);
+static void init();
+
+/*
+ * Static functions
+ */
+
+char libzfs_err[1024];	/* text of the most recently reported error */
+static void
+handle_error(const char *fmt, va_list ap)
+{
+	/* Save the message (truncated to fit) in case it's needed later */
+	(void) vsnprintf(libzfs_err, sizeof (libzfs_err), fmt, ap);
+#ifdef	DEBUG
+	(void) fprintf(stderr, "caught error: %s\n", libzfs_err);
+#endif
+}
+
+/*
+ * Library initializer (#pragma init): resets libzfs_err, installs handlers.
+ */
+#pragma init(init)
+static void
+init()
+{
+	libzfs_err[0] = '\0';
+
+	/* Route libzfs errors through handle_error() */
+	zfs_set_error_handler(handle_error);
+
+	/* Route the diskmgt code's errors through handle_error() as well */
+	dmgt_set_error_handler(handle_error);
+}
+
+/*
+ * JNI functions
+ */
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPools
+ * Signature: ()[Lcom/sun/zfs/common/model/Pool;
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPools(JNIEnv *env, jobject obj)
+{
+	zjni_ArrayList_t pools = {0};
+	zjni_Collection_t *collection = (zjni_Collection_t *)&pools;
+	zjni_DatasetArrayCallbackData_t cb = {0};
+
+	/* Create the array list handed to the iteration callback */
+	zjni_new_ArrayList(env, &pools);
+
+	cb.typemask = ZFS_TYPE_FILESYSTEM;
+	cb.data.list = collection;
+	cb.data.env = env;
+
+	/* Iterate over the roots; give up only on a pending Java exception */
+	if (zfs_iter_root(zjni_create_add_Dataset, &cb) != 0 &&
+	    (*env)->ExceptionOccurred(env) != NULL) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (zjni_Collection_to_array(env, collection,
+	    ZFSJNI_PACKAGE_DATA "Pool"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPool
+ * Signature: (Ljava/lang/String;)Lcom/sun/zfs/common/model/Pool;
+ * Looks poolUTF up as a ZFS_TYPE_FILESYSTEM dataset via zjni_get_Dataset().
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPool(JNIEnv *env,
+    jobject obj, jstring poolUTF)
+{
+	return (zjni_get_Dataset(env, poolUTF, ZFS_TYPE_FILESYSTEM));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getFileSystems
+ * Signature: (Ljava/lang/String;)[Lcom/sun/zfs/common/model/FileSystem;
+ * Presumably lists the file systems below containerUTF -- TODO confirm.
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getFileSystems(JNIEnv *env,
+    jobject obj, jstring containerUTF)
+{
+	return (zjni_get_Datasets_below(env, containerUTF,
+	    ZFS_TYPE_FILESYSTEM, ZFS_TYPE_FILESYSTEM,
+	    ZFSJNI_PACKAGE_DATA "FileSystem"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getFileSystem
+ * Signature: (Ljava/lang/String;)Lcom/sun/zfs/common/model/FileSystem;
+ * Looks nameUTF up as a ZFS_TYPE_FILESYSTEM dataset via zjni_get_Dataset().
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getFileSystem(JNIEnv *env,
+    jobject obj, jstring nameUTF)
+{
+	return (zjni_get_Dataset(env, nameUTF, ZFS_TYPE_FILESYSTEM));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVolumes
+ * Signature: (Ljava/lang/String;)[Lcom/sun/zfs/common/model/Volume;
+ * Presumably lists the volumes below containerUTF -- TODO confirm.
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVolumes(JNIEnv *env,
+    jobject obj, jstring containerUTF)
+{
+	return (zjni_get_Datasets_below(env, containerUTF,
+	    ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
+	    ZFSJNI_PACKAGE_DATA "Volume"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVolume
+ * Signature: (Ljava/lang/String;)Lcom/sun/zfs/common/model/Volume;
+ * Looks nameUTF up as a ZFS_TYPE_VOLUME dataset via zjni_get_Dataset().
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVolume(JNIEnv *env,
+    jobject obj, jstring nameUTF)
+{
+	return (zjni_get_Dataset(env, nameUTF, ZFS_TYPE_VOLUME));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getSnapshots
+ * Signature: (Ljava/lang/String;)[Lcom/sun/zfs/common/model/Snapshot;
+ * Presumably lists the snapshots of datasetUTF -- TODO confirm.
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getSnapshots(JNIEnv *env,
+    jobject obj, jstring datasetUTF)
+{
+	return (zjni_get_Datasets_below(env, datasetUTF,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT,
+	    ZFSJNI_PACKAGE_DATA "Snapshot"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getSnapshot
+ * Signature: (Ljava/lang/String;)Lcom/sun/zfs/common/model/Snapshot;
+ * Looks nameUTF up as a ZFS_TYPE_SNAPSHOT dataset via zjni_get_Dataset().
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getSnapshot(JNIEnv *env,
+    jobject obj, jstring nameUTF)
+{
+	return (zjni_get_Dataset(env, nameUTF, ZFS_TYPE_SNAPSHOT));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDatasets
+ * Signature: (Ljava/lang/String;)[Lcom/sun/zfs/common/model/Dataset;
+ * A NULL containerUTF yields the pools, exactly as getPools() would.
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDatasets(JNIEnv *env,
+    jobject obj, jstring containerUTF)
+{
+	if (containerUTF == NULL) {
+		return (Java_com_sun_zfs_common_model_SystemDataModel_getPools(
+		    env, obj));
+	}
+
+	return (zjni_get_Datasets_below(env, containerUTF,
+	    ZFS_TYPE_FILESYSTEM, ZFS_TYPE_ANY,
+	    ZFSJNI_PACKAGE_DATA "Dataset"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDataset
+ * Signature: (Ljava/lang/String;)Lcom/sun/zfs/common/model/Dataset;
+ * Looks nameUTF up as a dataset of any type (ZFS_TYPE_ANY).
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDataset(JNIEnv *env,
+    jobject obj, jstring nameUTF)
+{
+	return (zjni_get_Dataset(env, nameUTF, ZFS_TYPE_ANY));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevice
+ * Signature: (Ljava/lang/String;J)Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevice(JNIEnv *env,
+    jobject obj, jstring poolUTF, jlong index)
+{
+	const char *pool;
+	zpool_handle_t *zhp;
+	nvlist_t *vdev_cfg;
+	jobject vdev = NULL;
+
+	if (poolUTF == NULL ||
+	    (pool = (*env)->GetStringUTFChars(env, poolUTF, NULL)) == NULL) {
+		return (NULL);	/* no name given, or JNI could not copy it */
+	}
+	zhp = zpool_open(pool);
+	(*env)->ReleaseStringUTFChars(env, poolUTF, pool);
+	if (zhp == NULL) {
+		return (NULL);	/* the pool could not be opened */
+	}
+	if ((vdev_cfg = zjni_get_vdev(zhp, NULL, index)) != NULL) {
+		vdev = zjni_get_VirtualDevice_from_vdev(env, zhp, vdev_cfg);
+	}
+	zpool_close(zhp);
+	return (vdev);
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevices
+ * Signature: (Ljava/lang/String;J)
+ *            [Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+/* CSTYLED */
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2J(
+    JNIEnv *env, jobject obj, jstring poolUTF, jlong index)
+{
+	const char *pool;
+	zpool_handle_t *zhp;
+	nvlist_t *vdev_cfg;
+	jobjectArray vdevs = NULL;
+
+	if (poolUTF == NULL ||
+	    (pool = (*env)->GetStringUTFChars(env, poolUTF, NULL)) == NULL) {
+		return (NULL);	/* no name given, or JNI could not copy it */
+	}
+	zhp = zpool_open(pool);
+	(*env)->ReleaseStringUTFChars(env, poolUTF, pool);
+	/* Is the pool valid? */
+	if (zhp == NULL) {
+		return (NULL);
+	}
+	if ((vdev_cfg = zjni_get_vdev(zhp, NULL, index)) != NULL) {
+		vdevs = zjni_get_VirtualDevices_from_vdev(env, zhp, vdev_cfg);
+	}
+	zpool_close(zhp);
+	return (vdevs);
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevices
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+/* CSTYLED */
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2(
+    JNIEnv *env, jobject obj, jstring poolUTF)
+{
+	const char *pool;
+	zpool_handle_t *zhp;
+	jobjectArray vdevs = NULL;
+
+	if (poolUTF == NULL ||
+	    (pool = (*env)->GetStringUTFChars(env, poolUTF, NULL)) == NULL) {
+		return (NULL);	/* no name given, or JNI could not copy it */
+	}
+	zhp = zpool_open(pool);
+	(*env)->ReleaseStringUTFChars(env, poolUTF, pool);
+
+	/* Is the pool valid? */
+	if (zhp != NULL) {
+		vdevs = zjni_get_VirtualDevices_from_vdev(env, zhp, NULL);
+		zpool_close(zhp);
+	}
+	return (vdevs);
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getAvailableDisks
+ * Signature: ()[Lcom/sun/zfs/common/model/DiskDevice;
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getAvailableDisks(JNIEnv *env,
+    jobject obj)
+{
+	zjni_ArrayList_t disks = {0};
+	zjni_Collection_t *collection = (zjni_Collection_t *)&disks;
+	zjni_ArrayCallbackData_t cb = {0};
+
+	/* Create the array list handed to the iteration callback */
+	zjni_new_ArrayList(env, &disks);
+
+	cb.list = collection;
+	cb.env = env;
+
+	if (dmgt_avail_disk_iter(zjni_create_add_DiskDevice, &cb) != 0) {
+		/*
+		 * Report the message most recently saved in libzfs_err
+		 * to Java as an exception.
+		 */
+		zjni_throw_exception(env, "%s", libzfs_err);
+		return (NULL);
+	}
+
+	/* Convert the collected list into a DiskDevice array */
+	return (zjni_Collection_to_array(env, collection,
+	    ZFSJNI_PACKAGE_DATA "DiskDevice"));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDependents
+ * Signature: ([Ljava/lang/String;)[Lcom/sun/zfs/common/model/Dataset;
+ * Thin wrapper: the work is done by zjni_get_Datasets_dependents().
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDependents(JNIEnv *env,
+    jobject obj, jobjectArray paths)
+{
+	return (zjni_get_Datasets_dependents(env, paths));
+}
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPropertyDefault
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Property;
+ */
+/* ARGSUSED */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPropertyDefault(JNIEnv *env,
+    jobject obj, jstring nameUTF)
+{
+	zfs_prop_t prop;
+	const char *name;
+
+	if (nameUTF == NULL ||
+	    (name = (*env)->GetStringUTFChars(env, nameUTF, NULL)) == NULL) {
+		return (NULL);	/* no name given, or JNI could not copy it */
+	}
+	prop = zjni_get_property_from_name(name);
+	(*env)->ReleaseStringUTFChars(env, nameUTF, name);
+	return (prop == ZFS_PROP_INVAL ? NULL :
+	    zjni_get_default_property(env, prop));
+}
+
+typedef struct zjni_class_type_map {
+	char *class;		/* class name in the form FindClass() takes */
+	zfs_type_t type;	/* dataset type paired with that class */
+} zjni_class_type_map_t;
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getValidPropertyNames
+ * Signature: (Ljava/lang/Class;)
+ *            [Ljava/lang/String;
+ */
+/* ARGSUSED */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getValidPropertyNames(JNIEnv *env,
+    jobject obj, jclass class)
+{
+	/* Mappings of class names to zfs_type_t */
+	static zjni_class_type_map_t mappings[] = {
+		{ ZFSJNI_PACKAGE_DATA "FileSystem", ZFS_TYPE_FILESYSTEM },
+		{ ZFSJNI_PACKAGE_DATA "Volume", ZFS_TYPE_VOLUME },
+		{ ZFSJNI_PACKAGE_DATA "Snapshot", ZFS_TYPE_SNAPSHOT },
+	};
+	int nmappings = sizeof (mappings) / sizeof (zjni_class_type_map_t);
+	zjni_ArrayList_t list_obj = {0};
+	zjni_Collection_t *list = (zjni_Collection_t *)&list_obj;
+	jclass class_Class, typeClass;
+	jmethodID isAssignableFrom;
+	jboolean isInstance = JNI_FALSE;
+	jstring propName;
+	zfs_prop_t prop;
+	int i;
+
+	/* A failed lookup leaves an exception pending: make no more calls */
+	class_Class = (*env)->FindClass(env, "java/lang/Class");
+	isAssignableFrom = (class_Class == NULL) ? NULL : (*env)->GetMethodID(
+	    env, class_Class, "isAssignableFrom", "(Ljava/lang/Class;)Z");
+	if (isAssignableFrom == NULL) {
+		return (NULL);
+	}
+	/* Create an array list for the property names */
+	zjni_new_ArrayList(env, &list_obj);
+
+	/* Find the first matching mapping; on a match it is left at i - 1 */
+	for (i = 0; i < nmappings && isInstance != JNI_TRUE; i++) {
+		typeClass = (*env)->FindClass(env, mappings[i].class);
+		if (typeClass == NULL) {
+			return (NULL);
+		}
+		isInstance = (*env)->CallBooleanMethod(
+		    env, typeClass, isAssignableFrom, class);
+		(*env)->DeleteLocalRef(env, typeClass);
+		if ((*env)->ExceptionOccurred(env) != NULL) {
+			return (NULL);
+		}
+	}
+	for (prop = 0; isInstance == JNI_TRUE && prop < ZFS_NPROP_VISIBLE;
+	    prop++) {
+		if (!zfs_prop_valid_for_type(prop, mappings[i - 1].type)) {
+			continue;
+		}
+		propName = (*env)->NewStringUTF(env, zfs_prop_to_name(prop));
+		(*env)->CallBooleanMethod(env, ((zjni_Object_t *)list)->object,
+		    list->method_add, propName);
+		/* Each name holds a local reference; don't let them pile up */
+		(*env)->DeleteLocalRef(env, propName);
+	}
+
+	return (zjni_Collection_to_array(env, list, "java/lang/String"));
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_main.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_main.h
new file mode 100644
index 000000000000..c64d49953d6c
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_main.h
@@ -0,0 +1,215 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZLIBZFS_JNI_MAIN_H
+#define	_ZLIBZFS_JNI_MAIN_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <jni.h>
+/* Header for class com_sun_zfs_common_model_SystemDataModel */
+
+#ifndef _Included_com_sun_zfs_common_model_SystemDataModel
+#define	_Included_com_sun_zfs_common_model_SystemDataModel
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPools
+ * Signature: ()[Lcom/sun/zfs/common/model/Pool;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPools(
+    JNIEnv *, jobject);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPool
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Pool;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPool(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getFileSystems
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/FileSystem;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getFileSystems(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getFileSystem
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/FileSystem;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getFileSystem(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVolumes
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/Volume;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVolumes(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVolume
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Volume;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVolume(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getSnapshots
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/Snapshot;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getSnapshots(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getSnapshot
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Snapshot;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getSnapshot(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDatasets
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/Dataset;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDatasets(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDataset
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Dataset;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDataset(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevice
+ * Signature: (Ljava/lang/String;J)Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevice(
+    JNIEnv *, jobject, jstring, jlong);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevices
+ * Signature: (Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+JNIEXPORT jobjectArray JNICALL
+/* CSTYLED */
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getVirtualDevices
+ * Signature: (Ljava/lang/String;J)[Lcom/sun/zfs/common/model/VirtualDevice;
+ */
+JNIEXPORT jobjectArray JNICALL
+/* CSTYLED */
+Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2J(
+    JNIEnv *, jobject, jstring, jlong);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getAvailableDisks
+ * Signature: ()[Lcom/sun/zfs/common/model/DiskDevice;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getAvailableDisks(
+    JNIEnv *, jobject);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getDependents
+ * Signature: ([Ljava/lang/String;)
+ *            [Lcom/sun/zfs/common/model/Dataset;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getDependents(
+    JNIEnv *, jobject, jobjectArray);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getPropertyDefault
+ * Signature: (Ljava/lang/String;)
+ *            Lcom/sun/zfs/common/model/Property;
+ */
+JNIEXPORT jobject JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getPropertyDefault(
+    JNIEnv *, jobject, jstring);
+
+/*
+ * Class:     com_sun_zfs_common_model_SystemDataModel
+ * Method:    getValidPropertyNames
+ * Signature: (Ljava/lang/Class;)
+ *            [Ljava/lang/String;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_com_sun_zfs_common_model_SystemDataModel_getValidPropertyNames(
+    JNIEnv *, jobject, jclass);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#endif /* _ZLIBZFS_JNI_MAIN_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c
new file mode 100644
index 000000000000..5e50313ebdd2
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c
@@ -0,0 +1,575 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_pool.h"
+#include "libzfs_jni_util.h"
+#include <strings.h>
+
+/*
+ * Types
+ */
+
+typedef struct VirtualDeviceBean {
+	zjni_Object_t super;	/* first member: beans are cast to this */
+	/* Method IDs of the Java setters, looked up by new_VirtualDevice() */
+	jmethodID method_setPoolName;
+	jmethodID method_setIndex;
+	jmethodID method_setSize;
+	jmethodID method_setUsed;
+} VirtualDeviceBean_t;
+
+typedef struct DiskVirtualDeviceBean {
+	VirtualDeviceBean_t super;	/* first member: cast to base bean */
+
+	jmethodID method_setDiskName;
+} DiskVirtualDeviceBean_t;
+
+typedef struct FileVirtualDeviceBean {
+	VirtualDeviceBean_t super;	/* first member: cast to base bean */
+
+	jmethodID method_setFileName;
+} FileVirtualDeviceBean_t;
+
+typedef struct RAIDVirtualDeviceBean {
+	VirtualDeviceBean_t super;	/* first member: cast to base bean */
+} RAIDVirtualDeviceBean_t;
+
+typedef struct MirrorVirtualDeviceBean {
+	VirtualDeviceBean_t super;	/* first member: cast to base bean */
+} MirrorVirtualDeviceBean_t;
+
+/*
+ * Function prototypes
+ */
+
+static void new_VirtualDevice(JNIEnv *, VirtualDeviceBean_t *);
+static void new_DiskVirtualDeviceBean(JNIEnv *, DiskVirtualDeviceBean_t *);
+static void new_FileVirtualDeviceBean(JNIEnv *, FileVirtualDeviceBean_t *);
+static void new_RAIDVirtualDeviceBean(JNIEnv *, RAIDVirtualDeviceBean_t *);
+static void new_MirrorVirtualDeviceBean(JNIEnv *, MirrorVirtualDeviceBean_t *);
+static int populate_VirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *, VirtualDeviceBean_t *);
+static int populate_DiskVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *, DiskVirtualDeviceBean_t *);
+static int populate_FileVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *, FileVirtualDeviceBean_t *);
+static int populate_RAIDVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *, RAIDVirtualDeviceBean_t *);
+static int populate_MirrorVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *, MirrorVirtualDeviceBean_t *);
+static jobject create_DiskVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+static jobject create_FileVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+static jobject create_RAIDVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+static jobject create_MirrorVirtualDeviceBean(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+
+/*
+ * Static functions
+ */
+
+/* Create (if object is unset) a VirtualDeviceBean and cache its setter IDs */
+static void
+new_VirtualDevice(JNIEnv *env, VirtualDeviceBean_t *bean)
+{
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	if (object->object == NULL) {
+		object->class =
+		    (*env)->FindClass(env,
+			ZFSJNI_PACKAGE_DATA "VirtualDeviceBean");
+		/* NOTE(review): class is not checked for NULL before use */
+		object->constructor =
+		    (*env)->GetMethodID(env, object->class, "<init>", "()V");
+
+		object->object =
+		    (*env)->NewObject(env, object->class, object->constructor);
+	}
+	/* IDs are looked up on object->class, which callers may have preset */
+	bean->method_setPoolName = (*env)->GetMethodID(
+	    env, object->class, "setPoolName", "(Ljava/lang/String;)V");
+
+	bean->method_setIndex = (*env)->GetMethodID(
+	    env, object->class, "setIndex", "(J)V");
+
+	bean->method_setSize = (*env)->GetMethodID(
+	    env, object->class, "setSize", "(J)V");
+
+	bean->method_setUsed = (*env)->GetMethodID(
+	    env, object->class, "setUsed", "(J)V");
+}
+
+/* Create a DiskVirtualDeviceBean (unless preset) and cache its setter IDs */
+static void
+new_DiskVirtualDeviceBean(JNIEnv *env, DiskVirtualDeviceBean_t *bean)
+{
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	if (object->object == NULL) {
+		object->class = (*env)->FindClass(
+		    env, ZFSJNI_PACKAGE_DATA "DiskVirtualDeviceBean");
+		/* NOTE(review): class is not checked for NULL before use */
+		object->constructor =
+		    (*env)->GetMethodID(env, object->class, "<init>", "()V");
+
+		object->object =
+		    (*env)->NewObject(env, object->class, object->constructor);
+	}
+
+	new_VirtualDevice(env, (VirtualDeviceBean_t *)bean);
+
+	bean->method_setDiskName = (*env)->GetMethodID(
+	    env, object->class, "setDiskName", "(Ljava/lang/String;)V");
+
+}
+
+/* Create a FileVirtualDeviceBean (unless preset) and cache its setter IDs */
+static void
+new_FileVirtualDeviceBean(JNIEnv *env, FileVirtualDeviceBean_t *bean)
+{
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	if (object->object == NULL) {
+		object->class = (*env)->FindClass(
+		    env, ZFSJNI_PACKAGE_DATA "FileVirtualDeviceBean");
+		/* NOTE(review): class is not checked for NULL before use */
+		object->constructor =
+		    (*env)->GetMethodID(env, object->class, "<init>", "()V");
+
+		object->object =
+		    (*env)->NewObject(env, object->class, object->constructor);
+	}
+
+	new_VirtualDevice(env, (VirtualDeviceBean_t *)bean);
+
+	bean->method_setFileName = (*env)->GetMethodID(
+	    env, object->class, "setFileName", "(Ljava/lang/String;)V");
+}
+
+/* Create a RAIDVirtualDeviceBean (unless preset) and cache base setter IDs */
+static void
+new_RAIDVirtualDeviceBean(JNIEnv *env, RAIDVirtualDeviceBean_t *bean)
+{
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	if (object->object == NULL) {
+
+		object->class = (*env)->FindClass(
+		    env, ZFSJNI_PACKAGE_DATA "RAIDVirtualDeviceBean");
+		/* NOTE(review): class is not checked for NULL before use */
+		object->constructor =
+		    (*env)->GetMethodID(env, object->class, "<init>", "()V");
+
+		object->object =
+		    (*env)->NewObject(env, object->class, object->constructor);
+	}
+
+	new_VirtualDevice(env, (VirtualDeviceBean_t *)bean);
+}
+
+/* Create a MirrorVirtualDeviceBean (unless preset) and cache base setter IDs */
+static void
+new_MirrorVirtualDeviceBean(JNIEnv *env, MirrorVirtualDeviceBean_t *bean)
+{
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	if (object->object == NULL) {
+		object->class = (*env)->FindClass(
+		    env, ZFSJNI_PACKAGE_DATA "MirrorVirtualDeviceBean");
+		/* NOTE(review): class is not checked for NULL before use */
+		object->constructor =
+		    (*env)->GetMethodID(env, object->class, "<init>", "()V");
+
+		object->object =
+		    (*env)->NewObject(env, object->class, object->constructor);
+	}
+
+	new_VirtualDevice(env, (VirtualDeviceBean_t *)bean);
+}
+
+static int
+populate_VirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev, VirtualDeviceBean_t *bean)
+{
+	int result;
+	uint64_t vdev_id;
+	zjni_Object_t *object = (zjni_Object_t *)bean;
+
+	/* Set pool name */
+	jstring poolUTF = (*env)->NewStringUTF(env, zpool_get_name(zhp));
+	(*env)->CallVoidMethod(
+	    env, object->object, bean->method_setPoolName, poolUTF);
+
+	/* The vdev's GUID serves as the bean's index */
+	result = nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &vdev_id);
+	if (result != 0) {
+		zjni_throw_exception(env,
+		    "could not retrieve virtual device ID (pool %s)",
+		    zpool_get_name(zhp));
+	} else {
+		/* NOTE(review): used/total come from the pool, not this vdev */
+		uint64_t used;
+		uint64_t total;
+
+		(*env)->CallVoidMethod(
+		    env, object->object, bean->method_setIndex, (jlong)vdev_id);
+
+		/* Set used space */
+		used = zpool_get_space_used(zhp);
+
+		(*env)->CallVoidMethod(
+		    env, object->object, bean->method_setUsed, (jlong)used);
+
+		/* Set size to the pool's total space */
+		total = zpool_get_space_total(zhp);
+
+		(*env)->CallVoidMethod(
+		    env, object->object, bean->method_setSize, (jlong)total);
+	}
+	/* Nonzero (1) tells the caller the GUID lookup failed */
+	return (result != 0);
+}
+
+/* Fill in a DiskVirtualDeviceBean from a disk vdev; nonzero on failure */
+static int
+populate_DiskVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev, DiskVirtualDeviceBean_t *bean)
+{
+	char *path;
+	int result = populate_VirtualDeviceBean(
+	    env, zhp, vdev, (VirtualDeviceBean_t *)bean);
+
+	if (result) {
+		/* Must not call any more Java methods to preserve exception */
+		return (-1);
+	}
+
+	/* Set path */
+	result = nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path);
+	if (result != 0) {
+		zjni_throw_exception(env,
+		    "could not retrieve path from disk virtual device "
+		    "(pool %s)", zpool_get_name(zhp));
+	} else {
+		jstring pathUTF = (*env)->NewStringUTF(env, path);
+		(*env)->CallVoidMethod(env, ((zjni_Object_t *)bean)->object,
+		    bean->method_setDiskName, pathUTF);
+	}
+
+	return (result != 0);
+}
+
+/* Fill in a FileVirtualDeviceBean from a file vdev; nonzero on failure */
+static int
+populate_FileVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev, FileVirtualDeviceBean_t *bean)
+{
+	char *path;
+	int result = populate_VirtualDeviceBean(
+	    env, zhp, vdev, (VirtualDeviceBean_t *)bean);
+
+	if (result) {
+		/* Must not call any more Java methods to preserve exception */
+		return (-1);
+	}
+
+	/* Set path */
+	result = nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path);
+	if (result != 0) {
+		zjni_throw_exception(env,
+		    "could not retrieve path from file virtual device "
+		    "(pool %s)", zpool_get_name(zhp));
+	} else {
+		jstring pathUTF = (*env)->NewStringUTF(env, path);
+		(*env)->CallVoidMethod(env, ((zjni_Object_t *)bean)->object,
+		    bean->method_setFileName, pathUTF);
+	}
+
+	return (result != 0);
+}
+
+static int
+populate_RAIDVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev, RAIDVirtualDeviceBean_t *bean)
+{
+	/* A RAID bean adds no fields of its own; fill in the base bean */
+	return (populate_VirtualDeviceBean(env, zhp, vdev, &bean->super));
+}
+
+static int
+populate_MirrorVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev, MirrorVirtualDeviceBean_t *bean)
+{
+	/* A mirror bean adds no fields of its own; fill in the base bean */
+	return (populate_VirtualDeviceBean(env, zhp, vdev, &bean->super));
+}
+
+/*
+ * Builds a DiskVirtualDeviceBean for the vdev; NULL if populating it fails.
+ */
+static jobject
+create_DiskVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	DiskVirtualDeviceBean_t disk = {0};
+
+	/* Construct DiskVirtualDeviceBean */
+	new_DiskVirtualDeviceBean(env, &disk);
+
+	if (populate_DiskVirtualDeviceBean(env, zhp, vdev, &disk) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&disk)->object);
+}
+
+/*
+ * Builds a FileVirtualDeviceBean for the vdev; NULL if populating it fails.
+ */
+static jobject
+create_FileVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	FileVirtualDeviceBean_t file = {0};
+
+	/* Construct FileVirtualDeviceBean */
+	new_FileVirtualDeviceBean(env, &file);
+
+	if (populate_FileVirtualDeviceBean(env, zhp, vdev, &file) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&file)->object);
+}
+
+/*
+ * Builds a RAIDVirtualDeviceBean for the vdev; NULL if populating it fails.
+ * The {0} initializer already leaves the embedded object pointer NULL, so
+ * new_RAIDVirtualDeviceBean() constructs a fresh Java object.
+ */
+static jobject
+create_RAIDVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	RAIDVirtualDeviceBean_t raid = {0};
+
+	/* Construct RAIDVirtualDeviceBean */
+	new_RAIDVirtualDeviceBean(env, &raid);
+
+	if (populate_RAIDVirtualDeviceBean(env, zhp, vdev, &raid) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&raid)->object);
+}
+
+/*
+ * Builds a MirrorVirtualDeviceBean for the vdev; NULL if populating it fails.
+ */
+static jobject
+create_MirrorVirtualDeviceBean(JNIEnv *env, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	MirrorVirtualDeviceBean_t mirror = {0};
+
+	/* Construct MirrorVirtualDeviceBean */
+	new_MirrorVirtualDeviceBean(env, &mirror);
+
+	if (populate_MirrorVirtualDeviceBean(env, zhp, vdev, &mirror) != 0) {
+		/* Must not call any more Java methods to preserve exception */
+		return (NULL);
+	}
+
+	return (((zjni_Object_t *)&mirror)->object);
+}
+
+/*
+ * Package-private functions
+ */
+
+/*
+ * Gets the root vdev (an nvlist_t *) for the given pool.  Returns NULL if
+ * zhp is NULL or if the pool's config or its vdev tree cannot be obtained.
+ */
+nvlist_t *
+zjni_get_root_vdev(zpool_handle_t *zhp)
+{
+	nvlist_t *config;
+	nvlist_t *root = NULL;
+
+	if (zhp == NULL) {
+		return (NULL);
+	}
+	config = zpool_get_config(zhp);
+	if (config == NULL) {
+		return (NULL);
+	}
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &root) != 0) {
+		return (NULL);
+	}
+
+	return (root);
+}
+
+/*
+ * Gets the vdev (an nvlist_t *) whose GUID is vdev_id_to_find, searching
+ * the given vdev and its descendants depth-first.  If the given vdev is
+ * NULL, the descendants of the pool's root vdev are searched instead.
+ * Returns NULL if no vdev matches.
+ */
+nvlist_t *
+zjni_get_vdev(zpool_handle_t *zhp, nvlist_t *vdev_parent,
+    uint64_t vdev_id_to_find)
+{
+	int result;
+
+	/* Was a vdev specified? */
+	if (vdev_parent == NULL) {
+		/* No -- retrieve the top-level pool vdev */
+		vdev_parent = zjni_get_root_vdev(zhp);
+	} else {
+		/* Get GUID of this vdev and compare with vdev_id_to_find */
+		uint64_t id;
+		result = nvlist_lookup_uint64(
+		    vdev_parent, ZPOOL_CONFIG_GUID, &id);
+		if (result == 0 && id == vdev_id_to_find) {
+			return (vdev_parent);
+		}
+	}
+
+	if (vdev_parent != NULL) {
+		nvlist_t **children;
+		uint_t nelem = 0;
+
+		/* Get the vdevs under this vdev */
+		result = nvlist_lookup_nvlist_array(
+		    vdev_parent, ZPOOL_CONFIG_CHILDREN, &children, &nelem);
+
+		if (result == 0) {
+
+			uint_t i;	/* same type as nelem */
+			nvlist_t *child;
+
+			/* For each vdev child... */
+			for (i = 0; i < nelem; i++) {
+				child = zjni_get_vdev(zhp, children[i],
+				    vdev_id_to_find);
+				if (child != NULL) {
+					return (child);
+				}
+			}
+		}
+	}
+
+	return (NULL);
+}
+
+/* Maps a vdev nvlist to a Java bean; NULL on failure or unhandled type */
+jobject
+zjni_get_VirtualDevice_from_vdev(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev)
+{
+	char *type = NULL;
+
+	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) != 0) {
+		return (NULL);
+	}
+
+	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+		return (create_DiskVirtualDeviceBean(env, zhp, vdev));
+	} else if (strcmp(type, VDEV_TYPE_FILE) == 0) {
+		return (create_FileVirtualDeviceBean(env, zhp, vdev));
+	} else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+		return (create_RAIDVirtualDeviceBean(env, zhp, vdev));
+	} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) {
+		return (create_MirrorVirtualDeviceBean(env, zhp, vdev));
+	}
+
+	/* Unhandled vdev type: no bean is built and nothing is thrown here */
+	return (NULL);
+}
+
+/*
+ * Collects a VirtualDevice bean for each child of the given vdev (the pool's
+ * root vdev if NULL) into an array; NULL if creating a bean threw.
+ */
+jobject
+zjni_get_VirtualDevices_from_vdev(JNIEnv *env, zpool_handle_t *zhp,
+    nvlist_t *vdev_parent)
+{
+	/* Create an array list for the vdevs */
+	zjni_ArrayList_t list_class = {0};
+	zjni_ArrayList_t *list_class_p = &list_class;
+	zjni_new_ArrayList(env, list_class_p);
+
+	/* Was a vdev specified? */
+	if (vdev_parent == NULL) {
+		/* No -- retrieve the top-level pool vdev */
+		vdev_parent = zjni_get_root_vdev(zhp);
+	}
+
+	if (vdev_parent != NULL) {
+		/* Get the vdevs under this vdev */
+		nvlist_t **children;
+		uint_t nelem = 0;
+		int result = nvlist_lookup_nvlist_array(
+		    vdev_parent, ZPOOL_CONFIG_CHILDREN, &children, &nelem);
+
+		if (result == 0) {
+			/* For each vdev child... */
+			uint_t i;
+			for (i = 0; i < nelem; i++) {
+				nvlist_t *child = children[i];
+				/* Create a Java object from this vdev */
+				jobject obj =
+				    zjni_get_VirtualDevice_from_vdev(env,
+				    zhp, child);
+
+				if ((*env)->ExceptionCheck(env)) {
+					/* Keep the pending exception intact */
+					return (NULL);
+				}
+
+				/* No bean and no exception: skip this type */
+				if (obj == NULL) {
+					continue;
+				}
+				/* Add child to child vdev list */
+				(*env)->CallBooleanMethod(env,
+				    ((zjni_Object_t *)list_class_p)->object,
+				    ((zjni_Collection_t *)list_class_p)->
+				    method_add, obj);
+			}
+		}
+	}
+
+	return (zjni_Collection_to_array(
+	    env, (zjni_Collection_t *)list_class_p,
+	    ZFSJNI_PACKAGE_DATA "VirtualDevice"));
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.h
new file mode 100644
index 000000000000..e7c08cea3623
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.h
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_POOL_H
+#define	_LIBZFS_JNI_POOL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jni.h>
+#include <libnvpair.h>
+#include <libzfs.h>
+
+/*
+ * Function prototypes
+ */
+
+nvlist_t *zjni_get_root_vdev(zpool_handle_t *);
+nvlist_t *zjni_get_vdev(zpool_handle_t *, nvlist_t *, uint64_t);
+jobject zjni_get_VirtualDevice_from_vdev(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+jobject zjni_get_VirtualDevices_from_vdev(
+    JNIEnv *, zpool_handle_t *, nvlist_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_POOL_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c
new file mode 100644
index 000000000000..d396da4e5314
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c
@@ -0,0 +1,1163 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_property.h"
+#include "libzfs_jni_util.h"
+#include <strings.h>
+
+/*
+ * Function prototypes
+ */
+
+static jobject create_BooleanProperty(JNIEnv *, zfs_handle_t *, zfs_prop_t);
+static jobject create_ChecksumProperty(JNIEnv *, zfs_handle_t *);
+static jobject create_CompressionProperty(JNIEnv *, zfs_handle_t *);
+static jobject create_DateProperty(JNIEnv *, zfs_handle_t *, zfs_prop_t);
+static jobject create_LongProperty(JNIEnv *, zfs_handle_t *, zfs_prop_t);
+static jobject create_RecordSizeProperty(JNIEnv *, zfs_handle_t *);
+static jobject create_StringProperty(JNIEnv *, zfs_handle_t *, zfs_prop_t);
+static jobject str_to_checksum(JNIEnv *, char *);
+static jobject str_to_compression(JNIEnv *, char *);
+static jobject create_default_BooleanProperty(JNIEnv *, zfs_prop_t);
+static jobject create_default_LongProperty(JNIEnv *, zfs_prop_t);
+static jobject create_default_StringProperty(JNIEnv *, zfs_prop_t);
+static jobject create_default_MountPointProperty(JNIEnv *);
+static jobject create_default_ShareNFSProperty(JNIEnv *);
+static jobject create_default_ChecksumProperty(JNIEnv *);
+static jobject create_default_CompressionProperty(JNIEnv *);
+static jobject create_default_RecordSizeProperty(JNIEnv *);
+
+/*
+ * Static data
+ */
+
+zfs_prop_t props_boolean[] = {
+	ZFS_PROP_ATIME,
+	ZFS_PROP_DEVICES,
+	ZFS_PROP_EXEC,
+	ZFS_PROP_MOUNTED,
+	ZFS_PROP_READONLY,
+	ZFS_PROP_SETUID,
+	ZFS_PROP_ZONED,
+	ZFS_PROP_INVAL
+};
+
+zfs_prop_t props_long[] = {
+	ZFS_PROP_AVAILABLE,
+	ZFS_PROP_QUOTA,
+	/*	ZFS_PROP_RATIO, */
+	ZFS_PROP_REFERENCED,
+	ZFS_PROP_RESERVATION,
+	ZFS_PROP_USED,
+	ZFS_PROP_VOLSIZE,
+	ZFS_PROP_INVAL
+};
+
+zfs_prop_t props_string[] = {
+	ZFS_PROP_ORIGIN,
+	/*	ZFS_PROP_TYPE, */
+	ZFS_PROP_INVAL
+};
+
+/*
+ * Static functions
+ */
+
+static jobject
+create_BooleanProperty(JNIEnv *env, zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	jobject propertyObject = NULL;
+	char source[ZFS_MAXNAMELEN];
+	uint64_t value;
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get_numeric(zhp, prop, &value,
+	    &srctype, source, sizeof (source));
+
+	if (result == 0) {
+		jclass class_BooleanProperty = (*env)->FindClass(env,
+		    ZFSJNI_PACKAGE_DATA "BooleanProperty");
+
+		jstring propName = (*env)->NewStringUTF(
+		    env, zfs_prop_to_name(prop));
+		jobject propValue = zjni_int_to_boolean(env, value);
+		jboolean readOnly = zfs_prop_readonly(prop) ?
+		    JNI_TRUE : JNI_FALSE;
+
+		jmethodID constructor_BooleanProperty;
+
+		if (srctype == ZFS_SRC_INHERITED) {
+
+			jstring propSource = (*env)->NewStringUTF(env, source);
+			constructor_BooleanProperty = (*env)->GetMethodID(
+			    env, class_BooleanProperty, "<init>",
+			    "(Ljava/lang/String;Ljava/lang/Boolean;ZL"
+			    "java/lang/String;)V");
+
+			propertyObject = (*env)->NewObject(
+			    env, class_BooleanProperty,
+			    constructor_BooleanProperty,
+			    propName, propValue, readOnly, propSource);
+		} else {
+			jobject lineage = zjni_get_lineage(env, srctype);
+
+			constructor_BooleanProperty = (*env)->GetMethodID(
+			    env, class_BooleanProperty, "<init>",
+			    "(Ljava/lang/String;Ljava/lang/Boolean;ZL"
+			    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+			propertyObject = (*env)->NewObject(
+			    env, class_BooleanProperty,
+			    constructor_BooleanProperty,
+			    propName, propValue, readOnly, lineage);
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_ChecksumProperty(JNIEnv *env, zfs_handle_t *zhp)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, ZFS_PROP_CHECKSUM,
+	    propbuf, sizeof (propbuf), &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+		jobject propValue = str_to_checksum(env, propbuf);
+
+		if (propValue != NULL) {
+
+			jclass class_ChecksumProperty = (*env)->FindClass(env,
+			    ZFSJNI_PACKAGE_DATA "ChecksumProperty");
+
+			jmethodID constructor_ChecksumProperty;
+
+			if (srctype == ZFS_SRC_INHERITED) {
+
+				jstring propSource = (*env)->NewStringUTF(env,
+				    source);
+				constructor_ChecksumProperty =
+				    (*env)->GetMethodID(
+					env, class_ChecksumProperty, "<init>",
+					"(L" ZFSJNI_PACKAGE_DATA
+					"ChecksumProperty$Checksum;Ljava/lang/"
+					"String;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_ChecksumProperty,
+				    constructor_ChecksumProperty,
+				    propValue, propSource);
+
+			} else {
+				jobject lineage =
+				    zjni_get_lineage(env, srctype);
+				constructor_ChecksumProperty =
+				    (*env)->GetMethodID(
+					env, class_ChecksumProperty, "<init>",
+					"(L" ZFSJNI_PACKAGE_DATA
+					"ChecksumProperty$Checksum;L"
+					ZFSJNI_PACKAGE_DATA
+					"Property$Lineage;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_ChecksumProperty,
+				    constructor_ChecksumProperty,
+				    propValue, lineage);
+			}
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_CompressionProperty(JNIEnv *env, zfs_handle_t *zhp)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, ZFS_PROP_COMPRESSION,
+	    propbuf, sizeof (propbuf), &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+		jobject propValue = str_to_compression(env, propbuf);
+
+		if (propValue != NULL) {
+
+			jclass class_CompressionProperty =
+			    (*env)->FindClass(env,
+				ZFSJNI_PACKAGE_DATA "CompressionProperty");
+
+			jmethodID constructor_CompressionProperty;
+
+			if (srctype == ZFS_SRC_INHERITED) {
+
+				jstring propSource = (*env)->NewStringUTF(env,
+				    source);
+				constructor_CompressionProperty =
+				    (*env)->GetMethodID(
+					env, class_CompressionProperty,
+					"<init>",
+					"(L" ZFSJNI_PACKAGE_DATA
+					"CompressionProperty$Compression;Ljava/"
+					"lang/String;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_CompressionProperty,
+				    constructor_CompressionProperty,
+				    propValue, propSource);
+			} else {
+				jobject lineage = zjni_get_lineage(env,
+				    srctype);
+
+				constructor_CompressionProperty =
+				    (*env)->GetMethodID(
+					env, class_CompressionProperty,
+					"<init>",
+					"(L" ZFSJNI_PACKAGE_DATA
+					"CompressionProperty$Compression;L"
+					ZFSJNI_PACKAGE_DATA
+					"Property$Lineage;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_CompressionProperty,
+				    constructor_CompressionProperty,
+				    propValue, lineage);
+			}
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_DateProperty(JNIEnv *env, zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, prop, propbuf, sizeof (propbuf),
+	    &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+
+		jobject propValue = zjni_str_to_date(env, propbuf);
+		if (propValue != NULL) {
+
+			jclass class_DateProperty = (*env)->FindClass(env,
+			    ZFSJNI_PACKAGE_DATA "DateProperty");
+
+			jstring propName = (*env)->NewStringUTF(
+			    env, zfs_prop_to_name(prop));
+			jboolean readOnly =
+			    zfs_prop_readonly(prop) ? JNI_TRUE : JNI_FALSE;
+
+			jmethodID constructor_DateProperty;
+
+			if (srctype == ZFS_SRC_INHERITED) {
+
+				jstring propSource = (*env)->NewStringUTF(env,
+				    source);
+				constructor_DateProperty = (*env)->GetMethodID(
+				    env, class_DateProperty, "<init>",
+				    "(Ljava/lang/String;Ljava/util/Date;ZL"
+				    "java/lang/String;)V");
+
+				propertyObject = (*env)->NewObject(
+				    env, class_DateProperty,
+				    constructor_DateProperty,
+				    propName, propValue, readOnly, propSource);
+			} else {
+				jobject lineage = zjni_get_lineage(env,
+				    srctype);
+
+				constructor_DateProperty = (*env)->GetMethodID(
+				    env, class_DateProperty, "<init>",
+				    "(Ljava/lang/String;Ljava/util/Date;ZL"
+				    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+				propertyObject = (*env)->NewObject(
+				    env, class_DateProperty,
+				    constructor_DateProperty,
+				    propName, propValue, readOnly, lineage);
+			}
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_LongProperty(JNIEnv *env, zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, prop, propbuf, sizeof (propbuf),
+	    &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+
+		jobject propValue = zjni_str_to_long(env, propbuf);
+		if (propValue != NULL) {
+
+			jclass class_LongProperty = (*env)->FindClass(env,
+			    ZFSJNI_PACKAGE_DATA "LongProperty");
+
+			jstring propName = (*env)->NewStringUTF(
+			    env, zfs_prop_to_name(prop));
+			jboolean readOnly =
+			    zfs_prop_readonly(prop) ? JNI_TRUE : JNI_FALSE;
+
+			jmethodID constructor_LongProperty;
+
+			if (srctype == ZFS_SRC_INHERITED) {
+
+				jstring propSource =
+				    (*env)->NewStringUTF(env, source);
+				constructor_LongProperty = (*env)->GetMethodID(
+				    env, class_LongProperty, "<init>",
+				    "(Ljava/lang/String;Ljava/lang/Long;ZL"
+				    "java/lang/String;)V");
+
+				propertyObject = (*env)->NewObject(
+				    env, class_LongProperty,
+				    constructor_LongProperty,
+				    propName, propValue, readOnly, propSource);
+			} else {
+				jobject lineage = zjni_get_lineage(env,
+				    srctype);
+
+				constructor_LongProperty = (*env)->GetMethodID(
+				    env, class_LongProperty, "<init>",
+				    "(Ljava/lang/String;Ljava/lang/Long;ZL"
+				    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+				propertyObject = (*env)->NewObject(
+				    env, class_LongProperty,
+				    constructor_LongProperty,
+				    propName, propValue, readOnly, lineage);
+			}
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_RecordSizeProperty(JNIEnv *env, zfs_handle_t *zhp)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, ZFS_PROP_RECORDSIZE,
+	    propbuf, sizeof (propbuf), &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+
+		jobject propValue = zjni_str_to_long(env, propbuf);
+		if (propValue != NULL) {
+
+			jclass class_RecordSizeProperty = (*env)->FindClass(env,
+			    ZFSJNI_PACKAGE_DATA "RecordSizeProperty");
+
+			jmethodID constructor_RecordSizeProperty;
+
+			if (srctype == ZFS_SRC_INHERITED) {
+
+				jstring propSource =
+				    (*env)->NewStringUTF(env, source);
+				constructor_RecordSizeProperty =
+				    (*env)->GetMethodID(
+					env, class_RecordSizeProperty, "<init>",
+					"(Ljava/lang/Long;Ljava/lang/"
+					"String;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_RecordSizeProperty,
+				    constructor_RecordSizeProperty,
+				    propValue, propSource);
+			} else {
+				jobject lineage =
+				    zjni_get_lineage(env, srctype);
+
+				constructor_RecordSizeProperty =
+				    (*env)->GetMethodID(
+					env, class_RecordSizeProperty, "<init>",
+					"(Ljava/lang/Long;L"
+					ZFSJNI_PACKAGE_DATA
+					"Property$Lineage;)V");
+
+				propertyObject = (*env)->NewObject(env,
+				    class_RecordSizeProperty,
+				    constructor_RecordSizeProperty,
+				    propValue, lineage);
+			}
+		}
+	}
+
+	return (propertyObject);
+}
+
+static jobject
+create_StringProperty(JNIEnv *env, zfs_handle_t *zhp, zfs_prop_t prop)
+{
+	jobject propertyObject = NULL;
+	char propbuf[ZFS_MAXPROPLEN];
+	char source[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+
+	int result = zfs_prop_get(zhp, prop, propbuf, sizeof (propbuf),
+	    &srctype, source, sizeof (source), 1);
+
+	if (result == 0) {
+		jmethodID constructor_StringProperty;
+
+		jclass class_StringProperty =
+		    (*env)->FindClass(env, ZFSJNI_PACKAGE_DATA
+			"StringProperty");
+
+		jstring propName =
+		    (*env)->NewStringUTF(env, zfs_prop_to_name(prop));
+
+		jobject propValue = (*env)->NewStringUTF(env, propbuf);
+		jboolean readOnly = zfs_prop_readonly(prop) ?
+		    JNI_TRUE : JNI_FALSE;
+
+		if (srctype == ZFS_SRC_INHERITED) {
+
+			jstring propSource = (*env)->NewStringUTF(env, source);
+			constructor_StringProperty = (*env)->GetMethodID(
+			    env, class_StringProperty, "<init>",
+			    "(Ljava/lang/String;Ljava/lang/String;ZL"
+			    "java/lang/String;)V");
+
+			propertyObject = (*env)->NewObject(
+			    env, class_StringProperty,
+			    constructor_StringProperty,
+			    propName, propValue, readOnly, propSource);
+		} else {
+			jobject lineage = zjni_get_lineage(env, srctype);
+
+			constructor_StringProperty = (*env)->GetMethodID(
+			    env, class_StringProperty, "<init>",
+			    "(Ljava/lang/String;Ljava/lang/String;ZL"
+			    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+			propertyObject = (*env)->NewObject(
+			    env, class_StringProperty,
+			    constructor_StringProperty,
+			    propName, propValue, readOnly, lineage);
+		}
+	}
+
+	return (propertyObject);
+}
+
+/*
+ * Build a Java MountPointProperty from the current value of
+ * ZFS_PROP_MOUNTPOINT on the dataset zhp.
+ *
+ * The second constructor argument describes where the value came from:
+ * the source string reported by zfs_prop_get() when the source type is
+ * ZFS_SRC_INHERITED, otherwise the Property.Lineage constant for the
+ * source type.
+ *
+ * Returns NULL if zfs_prop_get() returns non-zero.
+ */
+static jobject
+create_MountPointProperty(JNIEnv *env, zfs_handle_t *zhp)
+{
+	char value[ZFS_MAXPROPLEN];
+	char origin[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject jorigin;
+	const char *signature;
+
+	/* Nothing to build if the property cannot be read */
+	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, value, sizeof (value),
+	    &srctype, origin, sizeof (origin), 1) != 0) {
+		return (NULL);
+	}
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "MountPointProperty");
+	jvalue = (*env)->NewStringUTF(env, value);
+
+	/* Choose the constructor that matches the value's origin */
+	if (srctype == ZFS_SRC_INHERITED) {
+		jorigin = (*env)->NewStringUTF(env, origin);
+		signature = "(Ljava/lang/String;Ljava/lang/String;)V";
+	} else {
+		jorigin = zjni_get_lineage(env, srctype);
+		signature = "(Ljava/lang/String;L"
+		    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V";
+	}
+
+	ctor = (*env)->GetMethodID(env, cls, "<init>", signature);
+
+	return ((*env)->NewObject(env, cls, ctor, jvalue, jorigin));
+}
+
+/*
+ * Build a Java ShareNFSProperty from the current value of
+ * ZFS_PROP_SHARENFS on the dataset zhp.
+ *
+ * For a value whose source type is ZFS_SRC_INHERITED the constructor
+ * receives the source string reported by zfs_prop_get(); for any other
+ * source type it receives the Property.Lineage constant obtained from
+ * zjni_get_lineage().
+ *
+ * Returns NULL if zfs_prop_get() returns non-zero.
+ */
+static jobject
+create_ShareNFSProperty(JNIEnv *env, zfs_handle_t *zhp)
+{
+	char value[ZFS_MAXPROPLEN];
+	char origin[ZFS_MAXNAMELEN];
+	zfs_source_t srctype;
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject jorigin;
+	const char *signature;
+
+	/* Nothing to build if the property cannot be read */
+	if (zfs_prop_get(zhp, ZFS_PROP_SHARENFS, value, sizeof (value),
+	    &srctype, origin, sizeof (origin), 1) != 0) {
+		return (NULL);
+	}
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "ShareNFSProperty");
+	jvalue = (*env)->NewStringUTF(env, value);
+
+	/* Second argument: source string if inherited, else a Lineage */
+	if (srctype == ZFS_SRC_INHERITED) {
+		jorigin = (*env)->NewStringUTF(env, origin);
+		signature = "(Ljava/lang/String;Ljava/lang/String;)V";
+	} else {
+		jorigin = zjni_get_lineage(env, srctype);
+		signature = "(Ljava/lang/String;L"
+		    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V";
+	}
+
+	ctor = (*env)->GetMethodID(env, cls, "<init>", signature);
+
+	return ((*env)->NewObject(env, cls, ctor, jvalue, jorigin));
+}
+
+static jobject
+str_to_checksum(JNIEnv *env, char *str)
+{
+	/* Calls ChecksumProperty.Checksum.valueOf(str) on the Java side */
+	jclass enumClass;
+	jmethodID valueOf;
+	jstring jstr;
+
+	enumClass = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "ChecksumProperty$Checksum");
+	valueOf = (*env)->GetStaticMethodID(env, enumClass, "valueOf",
+	    "(Ljava/lang/String;)L"
+	    ZFSJNI_PACKAGE_DATA "ChecksumProperty$Checksum;");
+	jstr = (*env)->NewStringUTF(env, str);
+	return ((*env)->CallStaticObjectMethod(env, enumClass, valueOf, jstr));
+}
+
+static jobject
+str_to_compression(JNIEnv *env, char *str)
+{
+	/* Calls CompressionProperty.Compression.valueOf(str) in Java */
+	jclass enumClass;
+	jmethodID valueOf;
+	jstring jstr;
+
+	enumClass = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "CompressionProperty$Compression");
+	valueOf = (*env)->GetStaticMethodID(env, enumClass, "valueOf",
+	    "(Ljava/lang/String;)L"
+	    ZFSJNI_PACKAGE_DATA "CompressionProperty$Compression;");
+	jstr = (*env)->NewStringUTF(env, str);
+	return ((*env)->CallStaticObjectMethod(env, enumClass, valueOf, jstr));
+}
+
+/*
+ * Package-private functions
+ */
+jobject
+zjni_get_default_property(JNIEnv *env, zfs_prop_t prop)
+{
+	/* Returns a Java Property carrying prop's default value, or NULL */
+	int idx;
+
+	/* Boolean, long and string properties are found via their tables */
+	for (idx = 0; props_boolean[idx] != ZFS_PROP_INVAL; idx++) {
+		if (props_boolean[idx] == prop) {
+			return (create_default_BooleanProperty(env, prop));
+		}
+	}
+
+	for (idx = 0; props_long[idx] != ZFS_PROP_INVAL; idx++) {
+		if (props_long[idx] == prop) {
+			return (create_default_LongProperty(env, prop));
+		}
+	}
+
+	for (idx = 0; props_string[idx] != ZFS_PROP_INVAL; idx++) {
+		if (props_string[idx] == prop) {
+			return (create_default_StringProperty(env, prop));
+		}
+	}
+
+	/* The remaining properties each have a dedicated creator */
+	switch (prop) {
+	case ZFS_PROP_MOUNTPOINT:
+		return (create_default_MountPointProperty(env));
+	case ZFS_PROP_SHARENFS:
+		return (create_default_ShareNFSProperty(env));
+	case ZFS_PROP_CHECKSUM:
+		return (create_default_ChecksumProperty(env));
+	case ZFS_PROP_COMPRESSION:
+		return (create_default_CompressionProperty(env));
+	case ZFS_PROP_RECORDSIZE:
+		return (create_default_RecordSizeProperty(env));
+	default:
+		break;
+	}
+
+	/* prop matched neither a table entry nor a case above */
+	return (NULL);
+}
+
+zfs_prop_t
+zjni_get_property_from_name(const char *name)
+{
+	/* Case-insensitive search of the first ZFS_NPROP_VISIBLE properties */
+	zfs_prop_t candidate = 0;
+
+	while (candidate < ZFS_NPROP_VISIBLE &&
+	    strcasecmp(name, zfs_prop_to_name(candidate)) != 0) {
+		candidate++;
+	}
+	return (candidate < ZFS_NPROP_VISIBLE ? candidate : ZFS_PROP_INVAL);
+}
+
+jobject
+zjni_get_lineage(JNIEnv *env, zfs_source_t srctype)
+{
+	/*
+	 * Returns the static Property.Lineage field that corresponds to
+	 * srctype.  ZFS_SRC_INHERITED and any source type without a case
+	 * of its own map to ZFS_PROP_LINEAGE_INHERITED.
+	 */
+	char *fieldName = "ZFS_PROP_LINEAGE_INHERITED";
+	jclass lineageClass;
+	jfieldID fieldID;
+
+	switch (srctype) {
+	case ZFS_SRC_NONE:
+		fieldName = "ZFS_PROP_LINEAGE_NOTINHERITABLE";
+		break;
+	case ZFS_SRC_DEFAULT:
+		fieldName = "ZFS_PROP_LINEAGE_DEFAULT";
+		break;
+	case ZFS_SRC_LOCAL:
+		fieldName = "ZFS_PROP_LINEAGE_LOCAL";
+		break;
+	case ZFS_SRC_TEMPORARY:
+		fieldName = "ZFS_PROP_LINEAGE_TEMPORARY";
+		break;
+	case ZFS_SRC_INHERITED:
+	default:
+		/* keep the ZFS_PROP_LINEAGE_INHERITED initializer */
+		break;
+	}
+
+	lineageClass = (*env)->FindClass(
+	    env, ZFSJNI_PACKAGE_DATA "Property$Lineage");
+	fieldID = (*env)->GetStaticFieldID(env, lineageClass,
+	    fieldName, "L" ZFSJNI_PACKAGE_DATA "Property$Lineage;");
+
+	return ((*env)->GetStaticObjectField(env, lineageClass, fieldID));
+}
+
+/*
+ * Collect the properties of the dataset zhp into a Java array of Property
+ * objects.  Every entry of the props_boolean, props_long and props_string
+ * tables is tried, followed by the mountpoint, sharenfs, checksum,
+ * compression, recordsize and creation properties.
+ *
+ * A property whose create_*() helper returns NULL without a pending Java
+ * exception is skipped; if an exception is pending, NULL is returned at
+ * once and the exception is left for the caller.
+ */
+jobjectArray
+zjni_get_Dataset_properties(JNIEnv *env, zfs_handle_t *zhp)
+{
+	jobject prop;
+	int i;
+
+	/* Create an array list for the properties */
+	zjni_ArrayList_t proplist_obj = {0};
+	zjni_ArrayList_t *proplist = &proplist_obj;
+	zjni_new_ArrayList(env, proplist);
+
+	for (i = 0; props_boolean[i] != ZFS_PROP_INVAL; i++) {
+		/* Create property and add to list */
+		prop = create_BooleanProperty(env, zhp, props_boolean[i]);
+
+		/* Does this property apply to this object? */
+		if (prop != NULL) {
+			(void) (*env)->CallBooleanMethod(
+			    env, ((zjni_Object_t *)proplist)->object,
+			    ((zjni_Collection_t *)proplist)->method_add, prop);
+		} else {
+			if ((*env)->ExceptionCheck(env)) {
+				return (NULL);
+			}
+#ifdef	DEBUG
+			(void) fprintf(stderr, "Property %s is not appropriate "
+			    "for %s\n", zfs_prop_to_name(props_boolean[i]),
+			    zfs_get_name(zhp));
+#endif
+		}
+	}
+
+	for (i = 0; props_long[i] != ZFS_PROP_INVAL; i++) {
+		/* Create property and add to list */
+		prop = create_LongProperty(env, zhp, props_long[i]);
+
+		/* Does this property apply to this object? */
+		if (prop != NULL) {
+			(void) (*env)->CallBooleanMethod(
+			    env, ((zjni_Object_t *)proplist)->object,
+			    ((zjni_Collection_t *)proplist)->method_add, prop);
+		} else {
+			if ((*env)->ExceptionCheck(env)) {
+				return (NULL);
+			}
+#ifdef	DEBUG
+			(void) fprintf(stderr, "Property %s is not appropriate "
+			    "for %s\n", zfs_prop_to_name(props_long[i]),
+			    zfs_get_name(zhp));
+#endif
+		}
+	}
+
+	for (i = 0; props_string[i] != ZFS_PROP_INVAL; i++) {
+		/* Create property and add to list */
+		prop = create_StringProperty(env, zhp, props_string[i]);
+
+		/* Does this property apply to this object? */
+		if (prop != NULL) {
+			(void) (*env)->CallBooleanMethod(
+			    env, ((zjni_Object_t *)proplist)->object,
+			    ((zjni_Collection_t *)proplist)->method_add, prop);
+		} else {
+			if ((*env)->ExceptionCheck(env)) {
+				return (NULL);
+			}
+#ifdef	DEBUG
+			(void) fprintf(stderr, "Property %s is not appropriate "
+			    "for %s\n", zfs_prop_to_name(props_string[i]),
+			    zfs_get_name(zhp));
+#endif
+		}
+	}
+
+	prop = create_MountPointProperty(env, zhp);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	prop = create_ShareNFSProperty(env, zhp);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_SHARENFS),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	prop = create_ChecksumProperty(env, zhp);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_CHECKSUM),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	prop = create_CompressionProperty(env, zhp);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	prop = create_RecordSizeProperty(env, zhp);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	prop = create_DateProperty(env, zhp, ZFS_PROP_CREATION);
+	/* Does this property apply to this object? */
+	if (prop != NULL) {
+		(void) (*env)->CallBooleanMethod(env,
+		    ((zjni_Object_t *)proplist)->object,
+		    ((zjni_Collection_t *)proplist)->method_add, prop);
+	} else {
+		if ((*env)->ExceptionCheck(env)) {
+			return (NULL);
+		}
+#ifdef	DEBUG
+		(void) fprintf(stderr, "Property %s is not appropriate "
+		    "for %s\n", zfs_prop_to_name(ZFS_PROP_CREATION),
+		    zfs_get_name(zhp));
+#endif
+	}
+
+	return (zjni_Collection_to_array(
+	    env, (zjni_Collection_t *)proplist,
+	    ZFSJNI_PACKAGE_DATA "Property"));
+}
+
+/*
+ * Default-valued BooleanProperty for prop, or NULL if prop is read-only.
+ */
+static jobject
+create_default_BooleanProperty(JNIEnv *env, zfs_prop_t prop)
+{
+	jclass cls;
+	jmethodID ctor;
+	jstring jname;
+	jobject jvalue;
+	jobject lineage;
+	jboolean readOnly;
+
+	if (zfs_prop_readonly(prop)) {
+		return (NULL);
+	}
+
+	cls = (*env)->FindClass(env, ZFSJNI_PACKAGE_DATA "BooleanProperty");
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/String;Ljava/lang/Boolean;ZL"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	jname = (*env)->NewStringUTF(env, zfs_prop_to_name(prop));
+	jvalue = zjni_int_to_boolean(env, zfs_prop_default_numeric(prop));
+	readOnly = zfs_prop_readonly(prop) ? JNI_TRUE : JNI_FALSE;
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	return ((*env)->NewObject(env, cls, ctor,
+	    jname, jvalue, readOnly, lineage));
+}
+
+/*
+ * Default-valued LongProperty for prop, or NULL if prop is read-only.
+ */
+static jobject
+create_default_LongProperty(JNIEnv *env, zfs_prop_t prop)
+{
+	jclass cls;
+	jmethodID ctor;
+	jstring jname;
+	jobject jvalue;
+	jobject lineage;
+	jboolean readOnly;
+
+	if (zfs_prop_readonly(prop)) {
+		return (NULL);
+	}
+
+	cls = (*env)->FindClass(env, ZFSJNI_PACKAGE_DATA "LongProperty");
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/String;Ljava/lang/Long;ZL"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	jname = (*env)->NewStringUTF(env, zfs_prop_to_name(prop));
+	jvalue = zjni_long_to_Long(env, zfs_prop_default_numeric(prop));
+	readOnly = zfs_prop_readonly(prop) ? JNI_TRUE : JNI_FALSE;
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	return ((*env)->NewObject(env, cls, ctor,
+	    jname, jvalue, readOnly, lineage));
+}
+
+/*
+ * Build a StringProperty holding the default string value of prop,
+ * tagged with the ZFS_SRC_DEFAULT lineage.
+ *
+ * Returns NULL unless zfs_prop_is_string() accepts prop and
+ * zfs_prop_readonly() rejects it.  Given that check, the readOnly flag
+ * passed to the constructor is expected to be JNI_FALSE.
+ */
+static jobject
+create_default_StringProperty(JNIEnv *env, zfs_prop_t prop)
+{
+	char defval[ZFS_MAXPROPLEN];
+	jclass cls;
+	jmethodID ctor;
+	jstring jname;
+	jobject jvalue;
+	jobject lineage;
+	jboolean readOnly;
+
+	if (!zfs_prop_is_string(prop) || zfs_prop_readonly(prop)) {
+		return (NULL);
+	}
+
+	zfs_prop_default_string(prop, defval, sizeof (defval));
+
+	cls = (*env)->FindClass(env, ZFSJNI_PACKAGE_DATA "StringProperty");
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/String;Ljava/lang/String;ZL"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	jname = (*env)->NewStringUTF(env, zfs_prop_to_name(prop));
+	jvalue = (*env)->NewStringUTF(env, defval);
+	readOnly = zfs_prop_readonly(prop) ? JNI_TRUE : JNI_FALSE;
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	return ((*env)->NewObject(env, cls, ctor,
+	    jname, jvalue, readOnly, lineage));
+}
+
+/*
+ * Build a MountPointProperty from the default value of
+ * ZFS_PROP_MOUNTPOINT, tagged with the ZFS_SRC_DEFAULT lineage.
+ *
+ * No failure checks are made; the NewObject() result is returned as is.
+ */
+static jobject
+create_default_MountPointProperty(JNIEnv *env)
+{
+	char defval[ZFS_MAXPROPLEN];
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject lineage;
+
+	zfs_prop_default_string(ZFS_PROP_MOUNTPOINT, defval, sizeof (defval));
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "MountPointProperty");
+	jvalue = (*env)->NewStringUTF(env, defval);
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	/* Constructor takes (String, Lineage) */
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/String;L"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	return ((*env)->NewObject(env, cls, ctor,
+	    jvalue, lineage));
+}
+
+/*
+ * Build a ShareNFSProperty from the default value of
+ * ZFS_PROP_SHARENFS, tagged with the ZFS_SRC_DEFAULT lineage.
+ *
+ * No failure checks are made; the NewObject() result is returned as is.
+ */
+static jobject
+create_default_ShareNFSProperty(JNIEnv *env)
+{
+	char defval[ZFS_MAXPROPLEN];
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject lineage;
+
+	zfs_prop_default_string(ZFS_PROP_SHARENFS, defval, sizeof (defval));
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "ShareNFSProperty");
+	jvalue = (*env)->NewStringUTF(env, defval);
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	/* Constructor takes (String, Lineage) */
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/String;L"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	return ((*env)->NewObject(env, cls, ctor,
+	    jvalue, lineage));
+}
+
+/*
+ * Build a ChecksumProperty holding the default value of
+ * ZFS_PROP_CHECKSUM, tagged with the ZFS_SRC_DEFAULT lineage.
+ *
+ * The default is obtained as a string and converted with
+ * str_to_checksum().  No failure checks are made.
+ */
+static jobject
+create_default_ChecksumProperty(JNIEnv *env)
+{
+	char defval[ZFS_MAXPROPLEN];
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject lineage;
+
+	zfs_prop_default_string(ZFS_PROP_CHECKSUM, defval, sizeof (defval));
+	/* Convert the default string into its Java counterpart */
+	jvalue = str_to_checksum(env, defval);
+
+	cls = (*env)->FindClass(env, ZFSJNI_PACKAGE_DATA "ChecksumProperty");
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	/* Constructor takes (Checksum, Lineage) */
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(L" ZFSJNI_PACKAGE_DATA "ChecksumProperty$Checksum;L"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	return ((*env)->NewObject(env, cls, ctor, jvalue, lineage));
+}
+
+/*
+ * Build a CompressionProperty holding the default value of
+ * ZFS_PROP_COMPRESSION, tagged with the ZFS_SRC_DEFAULT lineage.
+ *
+ * The default is obtained as a string and converted with
+ * str_to_compression().  No failure checks are made.
+ */
+static jobject
+create_default_CompressionProperty(JNIEnv *env)
+{
+	char defval[ZFS_MAXPROPLEN];
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject lineage;
+
+	zfs_prop_default_string(ZFS_PROP_COMPRESSION, defval, sizeof (defval));
+	/* Convert the default string into its Java counterpart */
+	jvalue = str_to_compression(env, defval);
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "CompressionProperty");
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	/* Constructor takes (Compression, Lineage) */
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(L" ZFSJNI_PACKAGE_DATA "CompressionProperty$Compression;L"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+
+	return ((*env)->NewObject(env, cls, ctor, jvalue, lineage));
+}
+
+/*
+ * RecordSizeProperty holding the numeric default of ZFS_PROP_RECORDSIZE.
+ */
+static jobject
+create_default_RecordSizeProperty(JNIEnv *env)
+{
+	jclass cls;
+	jmethodID ctor;
+	jobject jvalue;
+	jobject lineage;
+
+	cls = (*env)->FindClass(env,
+	    ZFSJNI_PACKAGE_DATA "RecordSizeProperty");
+	ctor = (*env)->GetMethodID(env, cls, "<init>",
+	    "(Ljava/lang/Long;L"
+	    ZFSJNI_PACKAGE_DATA "Property$Lineage;)V");
+	jvalue = zjni_long_to_Long(
+	    env, zfs_prop_default_numeric(ZFS_PROP_RECORDSIZE));
+	lineage = zjni_get_lineage(env, ZFS_SRC_DEFAULT);
+
+	return ((*env)->NewObject(env, cls, ctor, jvalue, lineage));
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_property.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.h
new file mode 100644
index 000000000000..8ab1c41b6a46
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_PROPERTY_H
+#define	_LIBZFS_JNI_PROPERTY_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Function prototypes
+ */
+
+#include <jni.h>
+#include <libzfs.h>
+
+jobject zjni_get_default_property(JNIEnv *, zfs_prop_t);
+jobject zjni_get_lineage(JNIEnv *, zfs_source_t);
+jobjectArray zjni_get_Dataset_properties(JNIEnv *, zfs_handle_t *);
+zfs_prop_t zjni_get_property_from_name(const char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_PROPERTY_H */
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_util.c b/usr/src/lib/libzfs_jni/common/libzfs_jni_util.c
new file mode 100644
index 000000000000..4bbf63668eba
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_util.c
@@ -0,0 +1,310 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "libzfs_jni_util.h"
+#include <strings.h>
+
+/*
+ * Package-private functions
+ */
+
+/*PRINTFLIKE2*/
+void
+zjni_throw_exception(JNIEnv *env, const char *fmt, ...)
+{
+	/* Throws UnsupportedOperationException with the formatted message */
+	char message[1024];
+	va_list args;
+	jclass exceptionClass;
+
+	va_start(args, fmt);
+	(void) vsnprintf(message, sizeof (message), fmt, args);
+	va_end(args);
+
+	exceptionClass =
+	    (*env)->FindClass(env, "java/lang/UnsupportedOperationException");
+	(*env)->ThrowNew(env, exceptionClass, message);
+}
+
+jstring
+zjni_get_matched_string(JNIEnv *env, char *name, regmatch_t *match)
+{
+	jstring matched = NULL;
+	if (match->rm_so != -1 && match->rm_eo != -1) {
+		char saved = name[match->rm_eo];
+		/* Terminate at the match end only while the copy is made */
+		name[match->rm_eo] = '\0';
+		matched = (*env)->NewStringUTF(env, &name[match->rm_so]);
+		name[match->rm_eo] = saved;
+	}
+	return (matched);
+}
+
+void
+zjni_get_dataset_from_snapshot(const char *snapshot, char *dataset,
+    size_t len)
+{
+	char *at;
+	/* Copy, then cut at '@'; snprintf() always terminates dataset */
+	(void) snprintf(dataset, len, "%s", snapshot);
+	if (len != 0 && (at = strchr(dataset, '@')) != NULL) {
+		*at = '\0';
+	}
+}
+
+/* Convert a zjni_Collection to a (Java) array of the named class */
+jobjectArray
+zjni_Collection_to_array(JNIEnv *env, zjni_Collection_t *list, char *class)
+{
+	jobject collection = list->super.object;
+	jobjectArray dest;
+	jint count;
+
+	/* Size the destination array to the collection's current size */
+	count = (*env)->CallIntMethod(env, collection, list->method_size);
+	dest = (*env)->NewObjectArray(
+	    env, count, (*env)->FindClass(env, class), NULL);
+
+	/* Hand the array to the collection's toArray method */
+	return ((*env)->CallObjectMethod(env, collection,
+	    list->method_toArray, dest));
+}
+
+/* Look up the add/size/toArray method IDs of a collection's class */
+void
+new_Collection(JNIEnv *env, zjni_Collection_t *collection)
+{
+	/* The class is read from collection->super; it is not set here */
+	jclass cls = collection->super.class;
+
+	collection->method_add = (*env)->GetMethodID(env, cls,
+	    "add", "(Ljava/lang/Object;)Z");
+
+	collection->method_size = (*env)->GetMethodID(env, cls,
+	    "size", "()I");
+
+	collection->method_toArray = (*env)->GetMethodID(env, cls,
+	    "toArray", "([Ljava/lang/Object;)[Ljava/lang/Object;");
+}
+
+/* Create a zjni_ArrayList, instantiating java.util.ArrayList if needed */
+void
+zjni_new_ArrayList(JNIEnv *env, zjni_ArrayList_t *list)
+{
+	zjni_Collection_t *base = &list->super;
+	zjni_Object_t *self = &base->super;
+
+	/* An object already present is kept; only a NULL one is created */
+	if (self->object == NULL) {
+		self->class = (*env)->FindClass(env, "java/util/ArrayList");
+		self->constructor =
+		    (*env)->GetMethodID(env, self->class, "<init>", "()V");
+		self->object = (*env)->NewObject(
+		    env, self->class, self->constructor);
+	}
+
+	new_Collection(env, base);
+}
+
+/* Create a zjni_DatasetSet, instantiating the Java DatasetSet if needed */
+void
+zjni_new_DatasetSet(JNIEnv *env, zjni_DatasetSet_t *list)
+{
+	zjni_Collection_t *base = &list->super;
+	zjni_Object_t *self = &base->super;
+
+	/* An object already present is kept; only a NULL one is created */
+	if (self->object == NULL) {
+		self->class = (*env)->FindClass(
+		    env, "com/sun/zfs/common/util/DatasetSet");
+		self->constructor =
+		    (*env)->GetMethodID(env, self->class, "<init>", "()V");
+		self->object = (*env)->NewObject(
+		    env, self->class, self->constructor);
+	}
+
+	new_Collection(env, base);
+}
+
+jobject
+zjni_int_to_boolean(JNIEnv *env, uint64_t value)
+{
+	/* Non-zero maps to Boolean.TRUE, zero to Boolean.FALSE */
+	const char *constant = (value != 0) ? "TRUE" : "FALSE";
+	jclass boolClass = (*env)->FindClass(env, "java/lang/Boolean");
+	jfieldID field = (*env)->GetStaticFieldID(
+	    env, boolClass, constant, "Ljava/lang/Boolean;");
+
+	return ((*env)->GetStaticObjectField(env, boolClass, field));
+}
+
+jobject
+zjni_str_to_long(JNIEnv *env, char *str)
+{
+	/*
+	 * Hands str to the static Java method Long.valueOf(String) and
+	 * returns the resulting java.lang.Long.  valueOf() may throw a
+	 * NumberFormatException; nothing here checks for or clears it.
+	 */
+	jclass longClass = (*env)->FindClass(env, "java/lang/Long");
+	jmethodID valueOf = (*env)->GetStaticMethodID(env,
+	    longClass, "valueOf", "(Ljava/lang/String;)Ljava/lang/Long;");
+	jstring jstr = (*env)->NewStringUTF(env, str);
+	jobject boxed;
+
+	boxed = (*env)->CallStaticObjectMethod(env, longClass, valueOf, jstr);
+	return (boxed);
+}
+
+jobject
+zjni_long_to_Long(JNIEnv *env, uint64_t value)
+{
+	/* Boxes value as a new java.lang.Long via its Long(long) constructor */
+	jclass class_Long = (*env)->FindClass(env, "java/lang/Long");
+
+	jmethodID constructor_Long = (*env)->GetMethodID(
+	    env, class_Long, "<init>", "(J)V");
+
+	/* "(J)V" takes a signed jlong; make the variadic argument one */
+	return ((*env)->NewObject(
+	    env, class_Long, constructor_Long, (jlong)value));
+}
+
+jobject
+zjni_str_to_date(JNIEnv *env, char *str)
+{
+	/*
+	 * Parses str with Long.parseLong() and returns a java.util.Date
+	 * for 1000 times that value (str is presumably epoch seconds).
+	 * Returns NULL if str could not be converted to a Java string or
+	 * parseLong() left an exception pending.
+	 */
+	jclass longClass = (*env)->FindClass(env, "java/lang/Long");
+	jmethodID parseLong = (*env)->GetStaticMethodID(env,
+	    longClass, "parseLong", "(Ljava/lang/String;)J");
+	jstring jstr = (*env)->NewStringUTF(env, str);
+	jclass dateClass;
+	jmethodID dateCtor;
+	jlong millis;
+
+	if (jstr == NULL) {
+		return (NULL);
+	}
+
+	/* May throw a NumberFormatException */
+	millis = (*env)->CallStaticLongMethod(env, longClass, parseLong, jstr);
+	if ((*env)->ExceptionOccurred(env) != NULL) {
+		return (NULL);
+	}
+
+	dateClass = (*env)->FindClass(env, "java/util/Date");
+	dateCtor = (*env)->GetMethodID(env, dateClass, "<init>", "(J)V");
+
+	/* Date constructor takes epoch milliseconds */
+	millis *= 1000;
+	return ((*env)->NewObject(env, dateClass, dateCtor, millis));
+}
+
+jobjectArray
+zjni_string_array_to_String_array(JNIEnv *env, char **array, int n)
+{
+	/* Builds a Java String[n] from the first n C strings in array */
+	int i;
+	jclass strClass = (*env)->FindClass(env, "java/lang/String");
+	jobjectArray jarray = (*env)->NewObjectArray(env, n, strClass, NULL);
+
+	for (i = 0; i < n; i++) {
+		jstring elementUTF = (*env)->NewStringUTF(env, array[i]);
+		(*env)->SetObjectArrayElement(env, jarray, i, elementUTF);
+		/* The array holds it now; don't pile up n local refs */
+		(*env)->DeleteLocalRef(env, elementUTF);
+	}
+	return (jarray);
+}
+
+/*
+ * Returns how many entries precede the terminating NULL in array.
+ * A NULL array pointer is treated as empty and yields 0.
+ */
+int
+zjni_count_elements(void **array)
+{
+	int count = 0;
+	while (array != NULL && array[count] != NULL) {
+		count++;
+	}
+	return (count);
+}
+
+/*
+ * Return the next nvpair after the given one whose name and data type
+ * both match the requested values.
+ *
+ * This function is needed because the nvlist_lookup_* routines can
+ * only be used with nvlists allocated with NV_UNIQUE_NAME or
+ * NV_UNIQUE_NAME_TYPE, i.e. lists of unique name/value pairs.
+ *
+ * Some variation of this function will likely appear in the libnvpair
+ * library per 4981923.
+ *
+ * @param       nvl
+ *              the nvlist_t to search
+ *
+ * @param       name
+ *              the string key of the pair to look for, or NULL to
+ *              accept a pair with any name
+ *
+ * @param       type
+ *              the data type of the pair to look for, or
+ *              DATA_TYPE_UNKNOWN to accept a pair of any type
+ *
+ * @param       nvp
+ *              the pair after which the search starts, or NULL to
+ *              start at the beginning of the list
+ *
+ * @return      the first pair after nvp that satisfies both
+ *              criteria, or NULL if the list has no such pair
+ */
+nvpair_t *
+zjni_nvlist_walk_nvpair(nvlist_t *nvl, const char *name, data_type_t type,
+    nvpair_t *nvp)
+{
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		/* A NULL name matches every pair */
+		if (name != NULL && strcmp(nvpair_name(nvp), name) != 0) {
+			continue;
+		}
+		/* DATA_TYPE_UNKNOWN matches every type */
+		if (type != DATA_TYPE_UNKNOWN && type != nvpair_type(nvp)) {
+			continue;
+		}
+		return (nvp);
+	}
+
+	return (NULL);
+}
diff --git a/usr/src/lib/libzfs_jni/common/libzfs_jni_util.h b/usr/src/lib/libzfs_jni/common/libzfs_jni_util.h
new file mode 100644
index 000000000000..48c60f34e52f
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_util.h
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBZFS_JNI_UTIL_H
+#define	_LIBZFS_JNI_UTIL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jni.h>
+#include <regex.h>
+#include <libnvpair.h>
+
+/*
+ * Constants
+ */
+
+#define	ZFSJNI_PACKAGE_DATA	"com/sun/zfs/common/model/"
+
+/*
+ * Types
+ */
+
+typedef struct zjni_Object {
+	jclass class;		/* Java class used to look up methods */
+	jobject object;		/* the Java object; NULL until created */
+	jmethodID constructor;	/* "<init>" method ID used for object */
+} zjni_Object_t;
+
+typedef struct zjni_Collection {
+	zjni_Object_t super;	/* first member: code casts to zjni_Object_t */
+
+	jmethodID method_add;		/* "add", (Ljava/lang/Object;)Z */
+	jmethodID method_size;		/* "size", ()I */
+	jmethodID method_toArray;	/* "toArray", Object[] -> Object[] */
+} zjni_Collection_t;
+
+typedef struct zjni_ArrayList {
+	zjni_Collection_t super;	/* first member: cast to base types */
+} zjni_ArrayList_t;
+
+typedef struct zjni_DatasetSet {
+	zjni_Collection_t super;	/* first member: cast to base types */
+} zjni_DatasetSet_t;
+
+typedef struct zjni_ArrayCallbackData {
+	JNIEnv *env;			/* JNI environment pointer */
+	zjni_Collection_t *list;	/* presumably the result list */
+} zjni_ArrayCallbackData_t;
+
+/*
+ * Function prototypes
+ */
+
+void zjni_throw_exception(JNIEnv *, const char *, ...);
+jstring zjni_get_matched_string(JNIEnv *, char *, regmatch_t *);
+void zjni_get_dataset_from_snapshot(const char *, char *, size_t);
+jobjectArray zjni_Collection_to_array(JNIEnv *, zjni_Collection_t *, char *);
+void zjni_new_ArrayList(JNIEnv *, zjni_ArrayList_t *);
+void zjni_new_DatasetSet(JNIEnv *, zjni_DatasetSet_t *);
+jobject zjni_int_to_boolean(JNIEnv *, uint64_t);
+jobject zjni_str_to_long(JNIEnv *, char *);
+jobject zjni_long_to_Long(JNIEnv *, uint64_t);
+jobject zjni_str_to_date(JNIEnv *, char *);
+jobjectArray zjni_string_array_to_String_array(JNIEnv *, char **, int);
+int zjni_count_elements(void **);
+nvpair_t *zjni_nvlist_walk_nvpair(
+	nvlist_t *, const char *, data_type_t, nvpair_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBZFS_JNI_UTIL_H */
diff --git a/usr/src/lib/libzfs_jni/common/llib-lzfs_jni b/usr/src/lib/libzfs_jni/common/llib-lzfs_jni
new file mode 100644
index 000000000000..3e7aa11d57f2
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/common/llib-lzfs_jni
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*LINTLIBRARY*/
+/*PROTOLIB1*/
+
+#include <libzfs_jni_main.h>
+#include <libzfs_jni_diskmgt.h>
diff --git a/usr/src/lib/libzfs_jni/i386/Makefile b/usr/src/lib/libzfs_jni/i386/Makefile
new file mode 100644
index 000000000000..cd02883abf56
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/i386/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libzfs_jni/sparc/Makefile b/usr/src/lib/libzfs_jni/sparc/Makefile
new file mode 100644
index 000000000000..cd02883abf56
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/sparc/Makefile
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libzfs_jni/sparcv9/Makefile b/usr/src/lib/libzfs_jni/sparcv9/Makefile
new file mode 100644
index 000000000000..44075ed1bddf
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/sparcv9/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64)
diff --git a/usr/src/lib/libzfs_jni/spec/Makefile b/usr/src/lib/libzfs_jni/spec/Makefile
new file mode 100644
index 000000000000..2cb984bfc990
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/Makefile
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+include $(SRC)/lib/Makefile.spec.arch
diff --git a/usr/src/lib/libzfs_jni/spec/Makefile.targ b/usr/src/lib/libzfs_jni/spec/Makefile.targ
new file mode 100644
index 000000000000..0a844e13a9b6
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/Makefile.targ
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+LIBRARY=	libzfs_jni.a
+VERS=		.1
+
+OBJECTS=	libzfs_jni.o
diff --git a/usr/src/lib/libzfs_jni/spec/amd64/Makefile b/usr/src/lib/libzfs_jni/spec/amd64/Makefile
new file mode 100644
index 000000000000..98db1f927173
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/amd64/Makefile
@@ -0,0 +1,35 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libzfs_jni/spec/i386/Makefile b/usr/src/lib/libzfs_jni/spec/i386/Makefile
new file mode 100644
index 000000000000..6256c68c81d7
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/i386/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libzfs_jni/spec/libzfs_jni.spec b/usr/src/lib/libzfs_jni/spec/libzfs_jni.spec
new file mode 100644
index 000000000000..8531b0f40771
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/libzfs_jni.spec
@@ -0,0 +1,113 @@
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getPools
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getPool
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getFileSystems
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getFileSystem
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getVolumes
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getVolume
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getSnapshots
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getSnapshot
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getDatasets
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getDataset
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevice
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getVirtualDevices__Ljava_lang_String_2J
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getAvailableDisks
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getDependents
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getPropertyDefault
+version SUNWprivate_1.1
+end
+
+function Java_com_sun_zfs_common_model_SystemDataModel_getValidPropertyNames
+version SUNWprivate_1.1
+end
+
+function dmgt_avail_disk_iter
+version SUNWprivate_1.1
+end
+
+function dmgt_free_disk
+version SUNWprivate_1.1
+end
+
+function dmgt_free_slice
+version SUNWprivate_1.1
+end
+
+function dmgt_set_error_handler
+version SUNWprivate_1.1
+end
diff --git a/usr/src/lib/libzfs_jni/spec/sparc/Makefile b/usr/src/lib/libzfs_jni/spec/sparc/Makefile
new file mode 100644
index 000000000000..6256c68c81d7
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/sparc/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB)
diff --git a/usr/src/lib/libzfs_jni/spec/sparcv9/Makefile b/usr/src/lib/libzfs_jni/spec/sparcv9/Makefile
new file mode 100644
index 000000000000..98db1f927173
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/sparcv9/Makefile
@@ -0,0 +1,35 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+.KEEP_STATE:
+
+include ../Makefile.targ
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/lib/Makefile.lib.64
+include $(SRC)/lib/Makefile.spec
+
+install: $(ROOTABILIB64)
diff --git a/usr/src/lib/libzfs_jni/spec/versions b/usr/src/lib/libzfs_jni/spec/versions
new file mode 100644
index 000000000000..5b067849f586
--- /dev/null
+++ b/usr/src/lib/libzfs_jni/spec/versions
@@ -0,0 +1,43 @@
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+sparc {
+	SUNWprivate_1.1;
+}
+sparcv9 {
+	SUNWprivate_1.1;
+}
+i386 {
+	SUNWprivate_1.1;
+}
+amd64 {
+	SUNWprivate_1.1;
+}
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index b6a6a59cee75..416be740e314 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -67,6 +67,7 @@
 #define	DTD_ELEM_RCTL		(const xmlChar *) "rctl"
 #define	DTD_ELEM_RCTLVALUE	(const xmlChar *) "rctl-value"
 #define	DTD_ELEM_ZONE		(const xmlChar *) "zone"
+#define	DTD_ELEM_DATASET	(const xmlChar *) "dataset"
 
 #define	DTD_ATTR_ACTION		(const xmlChar *) "action"
 #define	DTD_ATTR_ADDRESS	(const xmlChar *) "address"
@@ -1907,6 +1908,7 @@ static const char *standard_devs[] = {
 #endif
 	"cpu/self/cpuid",
 	"dtrace/helper",
+	"zfs",
 	NULL
 };
 
@@ -3518,3 +3520,176 @@ zonecfg_valid_rctl(const char *name, const rctlblk_t *rctlblk)
 
 	return (B_TRUE);
 }
+
+/* Append a <dataset> child carrying tabptr's name to the current node. */
+static int
+zonecfg_add_ds_core(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	xmlNodePtr parent = handle->zone_dh_cur;
+	xmlNodePtr dsnode;
+
+	/* Create the empty element, then set its DTD_ATTR_NAME property. */
+	dsnode = xmlNewTextChild(parent, NULL, DTD_ELEM_DATASET, NULL);
+
+	return (newprop(dsnode, DTD_ATTR_NAME, tabptr->zone_dataset_name));
+}
+
+/*
+ * Add the dataset described by tabptr to the zone configuration.
+ * Returns Z_INVAL for a NULL tabptr, the operation_prep() error if
+ * preparation fails, otherwise the result of the insertion.
+ */
+int
+zonecfg_add_ds(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	int rc;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	rc = operation_prep(handle);
+	return (rc != Z_OK ? rc : zonecfg_add_ds_core(handle, tabptr));
+}
+
+/*
+ * Unlink and free the first <dataset> child of the current node that
+ * matches tabptr's name; Z_NO_RESOURCE_ID when no child qualifies.
+ */
+static int
+zonecfg_delete_ds_core(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	xmlNodePtr node = handle->zone_dh_cur->xmlChildrenNode;
+
+	while (node != NULL && (xmlStrcmp(node->name, DTD_ELEM_DATASET) != 0 ||
+	    !match_prop(node, DTD_ATTR_NAME, tabptr->zone_dataset_name)))
+		node = node->next;
+	if (node == NULL)
+		return (Z_NO_RESOURCE_ID);
+	xmlUnlinkNode(node);
+	xmlFreeNode(node);
+	return (Z_OK);
+}
+
+/*
+ * Remove the dataset named by tabptr from the zone configuration.
+ * A NULL tabptr yields Z_INVAL; an operation_prep() failure is passed
+ * through; otherwise the outcome of the removal is returned.
+ */
+int
+zonecfg_delete_ds(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	int rc;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+
+	rc = operation_prep(handle);
+	return (rc != Z_OK ? rc : zonecfg_delete_ds_core(handle, tabptr));
+}
+
+/*
+ * Replace the dataset entry matching oldtabptr with one built from
+ * newtabptr.  The old entry is deleted before the new one is added, so
+ * a failure while adding leaves the old entry already removed.
+ */
+int
+zonecfg_modify_ds(zone_dochandle_t handle, struct zone_dstab *oldtabptr,
+    struct zone_dstab *newtabptr)
+{
+	int rc;
+
+	if (oldtabptr == NULL || newtabptr == NULL)
+		return (Z_INVAL);
+
+	rc = operation_prep(handle);
+	if (rc == Z_OK)
+		rc = zonecfg_delete_ds_core(handle, oldtabptr);
+	if (rc == Z_OK)
+		rc = zonecfg_add_ds_core(handle, newtabptr);
+
+	return (rc);
+}
+
+/*
+ * Find the one <dataset> child of the handle's current node whose name
+ * attribute equals tabptr->zone_dataset_name, then fetch that attribute
+ * back into tabptr.
+ *
+ * Returns Z_INVAL for a NULL tabptr, Z_NO_RESOURCE_ID when nothing
+ * matches (always so for an empty name), Z_INSUFFICIENT_SPEC when a
+ * second element matches, or the error from operation_prep() or the
+ * final fetchprop().
+ */
+int
+zonecfg_lookup_ds(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	xmlNodePtr node, found = NULL;
+	char name[MAXNAMELEN];
+	int rc;
+
+	if (tabptr == NULL)
+		return (Z_INVAL);
+	if ((rc = operation_prep(handle)) != Z_OK)
+		return (rc);
+
+	node = handle->zone_dh_cur->xmlChildrenNode;
+	for (; node != NULL; node = node->next) {
+		/* Skip non-dataset elements and anything that is no match. */
+		if (xmlStrcmp(node->name, DTD_ELEM_DATASET) != 0 ||
+		    tabptr->zone_dataset_name[0] == '\0' ||
+		    fetchprop(node, DTD_ATTR_NAME, name,
+		    sizeof (name)) != Z_OK ||
+		    strcmp(tabptr->zone_dataset_name, name) != 0)
+			continue;
+		if (found != NULL)
+			return (Z_INSUFFICIENT_SPEC);
+		found = node;
+	}
+	if (found == NULL)
+		return (Z_NO_RESOURCE_ID);
+
+	return (fetchprop(found, DTD_ATTR_NAME, tabptr->zone_dataset_name,
+	    sizeof (tabptr->zone_dataset_name)));
+}
+
+int
+zonecfg_setdsent(zone_dochandle_t handle)
+{
+	return (zonecfg_setent(handle));	/* no dataset-specific work */
+}
+
+/*
+ * Copy the name of the next <dataset> element at or after the cursor
+ * into tabptr and step the cursor past it; reset the cursor to the top
+ * when none remains or the fetch fails.  NULL arguments give Z_INVAL.
+ */
+int
+zonecfg_getdsent(zone_dochandle_t handle, struct zone_dstab *tabptr)
+{
+	xmlNodePtr cur;
+	int err;
+
+	if (handle == NULL || tabptr == NULL)
+		return (Z_INVAL);
+	if ((cur = handle->zone_dh_cur) == NULL)
+		return (Z_NO_ENTRY);
+	while (cur != NULL && xmlStrcmp(cur->name, DTD_ELEM_DATASET) != 0)
+		cur = cur->next;
+	if (cur == NULL) {
+		handle->zone_dh_cur = handle->zone_dh_top;
+		return (Z_NO_ENTRY);
+	}
+	if ((err = fetchprop(cur, DTD_ATTR_NAME, tabptr->zone_dataset_name,
+	    sizeof (tabptr->zone_dataset_name))) != Z_OK) {
+		handle->zone_dh_cur = handle->zone_dh_top;
+		return (err);
+	}
+	handle->zone_dh_cur = cur->next;
+	return (Z_OK);
+}
+
+int
+zonecfg_enddsent(zone_dochandle_t handle)
+{
+	return (zonecfg_endent(handle));	/* no dataset-specific work */
+}
diff --git a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1 b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
index d10bb5283d7a..d15fe65fcdf6 100644
--- a/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
+++ b/usr/src/lib/libzonecfg/dtd/zonecfg.dtd.1
@@ -1,9 +1,6 @@
 <?xml version='1.0' encoding='UTF-8' ?>
 
 <!--
- Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
- Use is subject to license terms.
-
  CDDL HEADER START
 
  The contents of this file are subject to the terms of the
@@ -24,6 +21,9 @@
 
  CDDL HEADER END
 
+ Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ Use is subject to license terms.
+
     ident	"%Z%%M%	%I%	%E% SMI"
 -->
 
@@ -69,7 +69,11 @@
 					#REQUIRED
 			value		CDATA #REQUIRED>
 
-<!ELEMENT zone		(filesystem | inherited-pkg-dir | network | device | rctl | attr)*>
+<!ELEMENT dataset	EMPTY>
+
+<!ATTLIST dataset	name		CDATA #REQUIRED>
+
+<!ELEMENT zone		(filesystem | inherited-pkg-dir | network | device | rctl | attr | dataset)*>
 
 <!ATTLIST zone		name		CDATA #REQUIRED
 			zonepath	CDATA #REQUIRED
diff --git a/usr/src/lib/libzonecfg/spec/libzonecfg.spec b/usr/src/lib/libzonecfg/spec/libzonecfg.spec
index dbc9f5da9aad..a0cdb3db985b 100644
--- a/usr/src/lib/libzonecfg/spec/libzonecfg.spec
+++ b/usr/src/lib/libzonecfg/spec/libzonecfg.spec
@@ -279,6 +279,30 @@ declaration     int zonecfg_lookup_attr(zone_dochandle_t, struct zone_attrtab *)
 version         SUNWprivate_1.1
 end             
 
+function	zonecfg_add_ds
+include		<libzonecfg.h>
+declaration	int zonecfg_add_ds(zone_dochandle_t, struct zone_dstab *)
+version		SUNWprivate_1.1
+end		
+
+function	zonecfg_delete_ds
+include		<libzonecfg.h>
+declaration	int zonecfg_delete_ds(zone_dochandle_t, struct zone_dstab *)
+version		SUNWprivate_1.1
+end		
+
+function	zonecfg_modify_ds
+include		<libzonecfg.h>
+declaration	int zonecfg_modify_ds(zone_dochandle_t, struct zone_dstab *, struct zone_dstab *)
+version		SUNWprivate_1.1
+end		
+
+function	zonecfg_lookup_ds
+include		<libzonecfg.h>
+declaration	int zonecfg_lookup_ds(zone_dochandle_t, struct zone_dstab *)
+version		SUNWprivate_1.1
+end		
+
 function        zonecfg_get_attr_boolean
 include         <libzonecfg.h>
 declaration     int zonecfg_get_attr_boolean(const struct zone_attrtab *, boolean_t *)
@@ -459,6 +483,24 @@ declaration	int zonecfg_endrctlent(zone_dochandle_t)
 version		SUNWprivate_1.1
 end		
 
+function	zonecfg_setdsent
+include		<libzonecfg.h>
+declaration	int zonecfg_setdsent(zone_dochandle_t)
+version		SUNWprivate_1.1
+end		
+
+function	zonecfg_getdsent
+include		<libzonecfg.h>
+declaration	int zonecfg_getdsent(zone_dochandle_t, struct zone_dstab *)
+version		SUNWprivate_1.1
+end		
+
+function	zonecfg_enddsent
+include		<libzonecfg.h>
+declaration	int zonecfg_enddsent(zone_dochandle_t)
+version		SUNWprivate_1.1
+end		
+
 function	zonecfg_destroy
 include		<libzonecfg.h>
 declaration	int zonecfg_destroy(const char *, boolean_t)
diff --git a/usr/src/lib/libzpool/Makefile b/usr/src/lib/libzpool/Makefile
new file mode 100644
index 000000000000..f77568c08ac7
--- /dev/null
+++ b/usr/src/lib/libzpool/Makefile
@@ -0,0 +1,51 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include		../Makefile.lib
+
+$(INTEL_BLD)SUBDIRS	= $(MACH)
+$(BUILD64)SUBDIRS	+= $(MACH64)
+
+all :=		TARGET= all
+clean :=	TARGET= clean
+clobber :=	TARGET= clobber
+install :=	TARGET= install
+lint :=		TARGET= lint
+
+.KEEP_STATE:
+
+all clean clobber install lint: $(SUBDIRS)
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+$(MACH) $(MACH64): FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/lib/libzpool/Makefile.com b/usr/src/lib/libzpool/Makefile.com
new file mode 100644
index 000000000000..c39f5f08ed89
--- /dev/null
+++ b/usr/src/lib/libzpool/Makefile.com
@@ -0,0 +1,83 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+
+LIBRARY= libzpool.a
+VERS= .1
+
+include ../../../uts/common/Makefile.files
+KERNEL_OBJS = kernel.o taskq.o util.o
+LIST_OBJS = list.o
+
+OBJECTS=$(ZFS_COMMON_OBJS) $(ZFS_SHARED_OBJS) $(KERNEL_OBJS) $(LIST_OBJS)
+
+# include library definitions
+include ../../Makefile.lib
+
+ZFS_COMMON_SRCS=	$(ZFS_COMMON_OBJS:%.o=../../../uts/common/fs/zfs/%.c)
+SHARED_SRCS=		$(ZFS_SHARED_OBJS:%.o=../../../common/zfs/%.c)
+KERNEL_SRCS=		$(KERNEL_OBJS:%.o=../common/%.c)
+LIST_SRCS=		$(LIST_OBJS:%.o=../../../uts/common/os/%.c)
+# Keep SRCS in step with OBJECTS: one source list per object group.
+SRCS=$(ZFS_COMMON_SRCS) $(SHARED_SRCS) $(KERNEL_SRCS) $(LIST_SRCS)
+SRCDIR=		../common
+
+LIBS +=		$(LINTLIB)
+
+INCS += -I../common
+INCS += -I../../../uts/common/fs/zfs
+INCS += -I../../../common/zfs
+
+$(LINTLIB) := SRCS=	$(SRCDIR)/$(LINTSRC)
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+CFLAGS +=	-g $(CCVERBOSE) $(CNOGLOBAL)
+CFLAGS64 +=	-g $(CCVERBOSE) $(CNOGLOBAL)
+LDLIBS +=	-lumem -lavl -lnvpair -lc
+CPPFLAGS +=	$(INCS)
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: $(LINTLIB)
+
+include ../../Makefile.targ
+
+objs/%.o pics/%.o: ../../../uts/common/fs/zfs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
+objs/%.o pics/%.o: ../../../common/zfs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
+objs/%.o pics/%.o: ../../../uts/common/os/%.c
+	$(COMPILE.c) -o $@ $<
+	$(POST_PROCESS_O)
+
diff --git a/usr/src/lib/libzpool/amd64/Makefile b/usr/src/lib/libzpool/amd64/Makefile
new file mode 100644
index 000000000000..f484bb496dfc
--- /dev/null
+++ b/usr/src/lib/libzpool/amd64/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT)
diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c
new file mode 100644
index 000000000000..83155b480f96
--- /dev/null
+++ b/usr/src/lib/libzpool/common/kernel.c
@@ -0,0 +1,675 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <sys/zfs_context.h>
+#include <poll.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/spa.h>
+#include <sys/processor.h>
+
+/*
+ * Emulation of kernel services in userland.
+ */
+
+uint64_t physmem;
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+
+/*
+ * =========================================================================
+ * threads
+ * =========================================================================
+ */
+/*ARGSUSED*/
+kthread_t *
+zk_thread_create(void (*func)(), void *arg)
+{
+	thread_t tid;
+
+	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
+	    &tid) == 0);
+
+	return ((void *)(uintptr_t)tid);
+}
+
+/*
+ * =========================================================================
+ * mutexes
+ * =========================================================================
+ */
+void
+zmutex_init(kmutex_t *mp)
+{
+	mp->m_owner = NULL;
+	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
+}
+
+void
+zmutex_destroy(kmutex_t *mp)
+{
+	ASSERT(mp->m_owner == NULL);	/* must not be held when destroyed */
+	(void) _mutex_destroy(&(mp)->m_lock);
+	mp->m_owner = (void *)-1UL;	/* marks lock destroyed for ASSERTs */
+}
+
+void
+mutex_enter(kmutex_t *mp)
+{
+	ASSERT(mp->m_owner != (void *)-1UL);
+	ASSERT(mp->m_owner != curthread);
+	(void) mutex_lock(&mp->m_lock);
+	ASSERT(mp->m_owner == NULL);
+	mp->m_owner = curthread;
+}
+
+int
+mutex_tryenter(kmutex_t *mp)
+{
+	ASSERT(mp->m_owner != (void *)-1UL);
+	if (0 == mutex_trylock(&mp->m_lock)) {
+		ASSERT(mp->m_owner == NULL);
+		mp->m_owner = curthread;
+		return (1);
+	} else {
+		return (0);
+	}
+}
+
+void
+mutex_exit(kmutex_t *mp)
+{
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	(void) mutex_unlock(&mp->m_lock);
+}
+
+void *
+mutex_owner(kmutex_t *mp)
+{
+	return (mp->m_owner);
+}
+
+/*
+ * =========================================================================
+ * rwlocks
+ * =========================================================================
+ */
+/*ARGSUSED*/
+void
+rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
+{
+	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
+	rwlp->rw_owner = NULL;
+}
+
+void
+rw_destroy(krwlock_t *rwlp)
+{
+	rwlock_destroy(&rwlp->rw_lock);
+	rwlp->rw_owner = (void *)-1UL;
+}
+
+void
+rw_enter(krwlock_t *rwlp, krw_t rw)
+{
+	ASSERT(!RW_LOCK_HELD(rwlp));
+	ASSERT(rwlp->rw_owner != (void *)-1UL);	/* not destroyed */
+	ASSERT(rwlp->rw_owner != curthread);
+
+	if (rw == RW_READER)
+		(void) rw_rdlock(&rwlp->rw_lock);
+	else
+		(void) rw_wrlock(&rwlp->rw_lock);
+
+	rwlp->rw_owner = curthread;	/* readers too: latest enterer wins */
+}
+
+void
+rw_exit(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	rwlp->rw_owner = NULL;
+	(void) rw_unlock(&rwlp->rw_lock);
+}
+
+int
+rw_tryenter(krwlock_t *rwlp, krw_t rw)
+{
+	int rv;
+
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	if (rw == RW_READER)
+		rv = rw_tryrdlock(&rwlp->rw_lock);
+	else
+		rv = rw_trywrlock(&rwlp->rw_lock);
+
+	if (rv == 0) {
+		rwlp->rw_owner = curthread;
+		return (1);
+	}
+
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+rw_tryupgrade(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	return (0);	/* upgrade is never attempted; always returns 0 */
+}
+
+/*
+ * =========================================================================
+ * condition variables
+ * =========================================================================
+ */
+/*ARGSUSED*/
+void
+cv_init(kcondvar_t *cv, char *name, int type, void *arg)
+{
+	(void) cond_init(cv, type, NULL);
+}
+
+void
+cv_destroy(kcondvar_t *cv)
+{
+	(void) cond_destroy(cv);
+}
+
+void
+cv_wait(kcondvar_t *cv, kmutex_t *mp)
+{
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	(void) cond_wait(cv, &mp->m_lock);
+	mp->m_owner = curthread;
+}
+
+clock_t
+cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
+{
+	int error;
+	timestruc_t ts;
+	clock_t delta;
+
+top:
+	delta = abstime - lbolt;	/* ticks remaining until abstime */
+	if (delta <= 0)
+		return (-1);	/* deadline already passed */
+
+	ts.tv_sec = delta / hz;
+	ts.tv_nsec = (delta % hz) * (NANOSEC / hz);
+
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;	/* ownership is given up for the wait */
+	error = cond_reltimedwait(cv, &mp->m_lock, &ts);
+	mp->m_owner = curthread;
+
+	if (error == ETIME)
+		return (-1);	/* timed out */
+
+	if (error == EINTR)
+		goto top;	/* interrupted: recompute and re-wait */
+
+	ASSERT(error == 0);
+
+	return (1);
+}
+
+void
+cv_signal(kcondvar_t *cv)
+{
+	(void) cond_signal(cv);
+}
+
+void
+cv_broadcast(kcondvar_t *cv)
+{
+	(void) cond_broadcast(cv);
+}
+
+/*
+ * =========================================================================
+ * vnode operations
+ * =========================================================================
+ */
+/*
+ * Note: for the xxxat() versions of these functions, we assume that the
+ * starting vp is always rootdir (which is true for spa_directory.c, the only
+ * ZFS consumer of these interfaces).  We assert this is true, and then emulate
+ * them by adding '/' in front of the path.
+ */
+
+/*
+ * Userland vn_open(): opens 'path' and, on success, returns 0 with *vpp set
+ * to a vnode_t holding the fd, st_size and spa_strdup(path); else an errno.
+ */
+/*ARGSUSED*/
+int
+vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
+{
+	int fd, err, old_umask;
+	int isdev = (strncmp(path, "/dev/", 5) == 0);
+	vnode_t *vp;
+	char realpath[MAXPATHLEN];
+	struct stat64 st;
+
+	/* Room for path, its NUL, and the 'r' a /dev/ path may gain. */
+	if (strlen(path) + 1 + isdev > sizeof (realpath))
+		return (ENAMETOOLONG);
+
+	/*
+	 * Real disks are accessed through the character (raw) device so that
+	 * userland, e.g. zdb looking at a live in-kernel pool, is not fooled
+	 * by the segmap cache.  The raw device is said to report a zero size,
+	 * so the block device is opened first, just to fstat it for its size.
+	 */
+	if (isdev) {
+		char *dsk = strstr(path, "/dsk/");
+		fd = open64(path, O_RDONLY);
+		if (fd == -1)
+			return (errno);
+		if (fstat64(fd, &st) == -1) {
+			err = errno;	/* close() may overwrite errno */
+			(void) close(fd);
+			return (err);
+		}
+		(void) close(fd);
+		(void) sprintf(realpath, "%s", path);
+		if (dsk != NULL)
+			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
+			    dsk + 1);
+	} else {
+		(void) sprintf(realpath, "%s", path);
+		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
+			return (errno);
+	}
+
+	if (flags & FCREAT)
+		old_umask = umask(0);
+
+	/*
+	 * The construct 'flags - FREAD' conveniently maps combinations of
+	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
+	 */
+	fd = open64(realpath, flags - FREAD, mode);
+	if (flags & FCREAT)
+		(void) umask(old_umask);
+	if (fd == -1)
+		return (errno);
+	if (fstat64(fd, &st) == -1) {
+		err = errno;	/* close() may overwrite errno */
+		(void) close(fd);
+		return (err);
+	}
+
+	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
+	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
+	vp->v_fd = fd;
+	vp->v_size = st.st_size;
+	vp->v_path = spa_strdup(path);
+
+	return (0);
+}
+
+int
+vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
+    int x3, vnode_t *startvp)
+{
+	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
+	int ret;
+
+	ASSERT(startvp == rootdir);
+	(void) sprintf(realpath, "/%s", path);	/* +2 above: '/' and NUL */
+
+	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
+
+	umem_free(realpath, strlen(path) + 2);	/* size must match the alloc */
+
+	return (ret);
+}
+
+/*ARGSUSED*/
+int
+vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
+	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
+{
+	ssize_t iolen, split, head, tail;	/* head/tail: write halves */
+
+	if (uio == UIO_READ) {
+		iolen = pread64(vp->v_fd, addr, len, offset);
+	} else {
+		/*
+		 * Split in two so a kill in between simulates a partial write.
+		 */
+		split = (len > 0 ? rand() % len : 0);
+		head = pwrite64(vp->v_fd, addr, split, offset);
+		tail = (head == -1) ? -1 : pwrite64(vp->v_fd,
+		    (char *)addr + split, len - split, offset + split);
+		iolen = (tail == -1) ? -1 : head + tail;	/* -1 wins */
+	}
+
+	if (iolen == -1)
+		return (errno);
+	if (residp)
+		*residp = len - iolen;
+	else if (iolen != len)
+		return (EIO);
+	return (0);
+}
+
+void
+vn_close(vnode_t *vp)
+{
+	close(vp->v_fd);
+	spa_strfree(vp->v_path);
+	umem_free(vp, sizeof (vnode_t));
+}
+
+#ifdef ZFS_DEBUG
+
+/*
+ * =========================================================================
+ * Figure out which debugging statements to print
+ * =========================================================================
+ */
+
+static char *dprintf_string;
+static int dprintf_print_all;
+
+int
+dprintf_find_string(const char *string)
+{
+	char *tmp_str = dprintf_string;
+	int len = strlen(string);
+
+	/*
+	 * Find out if this is a string we want to print.
+	 * String format: file1.c,function_name1,file2.c,file3.c
+	 */
+
+	while (tmp_str != NULL) {
+		if (strncmp(tmp_str, string, len) == 0 &&
+		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
+			return (1);
+		tmp_str = strchr(tmp_str, ',');
+		if (tmp_str != NULL)
+			tmp_str++; /* Get rid of , */
+	}
+	return (0);
+}
+
+void
+dprintf_setup(int *argc, char **argv)
+{
+	int i, j;
+
+	/*
+	 * Debugging can be specified two ways: by setting the environment
+	 * variable ZFS_DEBUG, or by a "debug=..." argument on the command
+	 * line, which overrides the environment variable.  Every "debug=..."
+	 * argument is removed from argv and *argc is reduced to match.
+	 */
+
+	for (i = 1; i < *argc; i++) {
+		int len = strlen("debug=");
+		/* First look for a command line argument */
+		if (strncmp("debug=", argv[i], len) == 0) {
+			dprintf_string = argv[i] + len;
+			for (j = i; j < *argc; j++)	/* remove from args */
+				argv[j] = argv[j+1];
+			argv[j] = NULL;
+			(*argc)--;
+			i--;	/* slot i now holds the next arg; rescan it */
+		}
+	}
+
+	if (dprintf_string == NULL) {
+		/* Look for ZFS_DEBUG environment variable */
+		dprintf_string = getenv("ZFS_DEBUG");
+	}
+
+	/*
+	 * Are we just turning on all debugging?
+	 */
+	if (dprintf_find_string("on"))
+		dprintf_print_all = 1;
+}
+
+/*
+ * =========================================================================
+ * debug printfs
+ * =========================================================================
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+
+	/*
+	 * Keep only the last path component, dropping e.g. "../common/".
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	if (dprintf_print_all ||
+	    dprintf_find_string(newfile) ||
+	    dprintf_find_string(func)) {
+		/* Optional pid/tid/cpu/time/long prefixes, then "func: msg" */
+		flockfile(stdout);	/* emit the message as one unit */
+		if (dprintf_find_string("pid"))
+			(void) printf("%d ", getpid());
+		if (dprintf_find_string("tid"))
+			(void) printf("%u ", thr_self());
+		if (dprintf_find_string("cpu"))
+			(void) printf("%u ", getcpuid());
+		if (dprintf_find_string("time"))
+			(void) printf("%llu ", gethrtime());
+		if (dprintf_find_string("long"))
+			(void) printf("%s, line %d: ", newfile, line);
+		(void) printf("%s: ", func);
+		va_start(adx, fmt);
+		(void) vprintf(fmt, adx);
+		va_end(adx);
+		funlockfile(stdout);
+	}
+}
+
+#endif /* ZFS_DEBUG */
+
+/*
+ * =========================================================================
+ * cmn_err() and panic()
+ * =========================================================================
+ */
+static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
+static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
+
+void
+vpanic(const char *fmt, va_list adx)
+{
+	(void) fprintf(stderr, "error: ");
+	(void) vfprintf(stderr, fmt, adx);
+	(void) fprintf(stderr, "\n");
+
+	abort();	/* think of it as a "user-level crash dump" */
+}
+
+void
+panic(const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vpanic(fmt, adx);
+	va_end(adx);
+}
+
+/*PRINTFLIKE2*/
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	if (ce == CE_PANIC)
+		vpanic(fmt, adx);	/* aborts; does not return */
+	if (ce != CE_NOTE && ce != CE_IGNORE) {	/* noise / "print nothing" */
+		(void) fprintf(stderr, "%s", ce_prefix[ce]);
+		(void) vfprintf(stderr, fmt, adx);
+		(void) fprintf(stderr, "%s", ce_suffix[ce]);
+	}
+	va_end(adx);
+}
+
+/*
+ * =========================================================================
+ * misc routines
+ * =========================================================================
+ */
+
+void
+delay(clock_t ticks)
+{
+	(void) poll(0, 0, (int)(ticks * 1000LL / hz));	/* 1000/hz truncates */
+}
+
+/*
+ * Find highest one bit set.
+ *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+int
+highbit(ulong_t i)
+{
+	register int h = 1;
+
+	if (i == 0)
+		return (0);
+#ifdef _LP64
+	if (i & 0xffffffff00000000ul) {
+		h += 32; i >>= 32;
+	}
+#endif
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+}
+
+static int
+random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
+{
+	int fd = open(devname, O_RDONLY);
+	size_t resid = len;
+	ssize_t bytes;
+
+	VERIFY(fd != -1);	/* VERIFY: checked in NDEBUG builds too */
+
+	while (resid != 0) {
+		bytes = read(fd, ptr, resid);
+		VERIFY(bytes >= 0);	/* -1 would move ptr backwards */
+		ptr += bytes;
+		resid -= bytes;
+	}
+
+	(void) close(fd);
+
+	return (0);
+}
+
+int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, "/dev/random"));
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, "/dev/urandom"));
+}
+
+/*
+ * =========================================================================
+ * kernel emulation setup & teardown
+ * =========================================================================
+ */
+static int
+umem_out_of_memory(void)
+{
+	char errmsg[] = "out of memory -- generating core dump\n";
+
+	(void) write(fileno(stderr), errmsg, sizeof (errmsg) - 1); /* no NUL */
+	abort();
+	return (0);
+}
+
+void
+kernel_init(int mode)
+{
+	umem_nofail_callback(umem_out_of_memory);
+
+	physmem = sysconf(_SC_PHYS_PAGES);
+
+	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
+	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
+
+	spa_init(mode);
+}
+
+void
+kernel_fini(void)
+{
+	spa_fini();
+}
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
new file mode 100644
index 000000000000..90c2d6c4fece
--- /dev/null
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/* LINTLIBRARY */
+/* PROTOLIB1 */
+
+#include <sys/zfs_context.h>
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+#include <sys/zil.h>
+#include <sys/bplist.h>
+
+extern uint64_t zio_gang_bang;
diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h
new file mode 100644
index 000000000000..243258be98de
--- /dev/null
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define	_SYS_ZFS_CONTEXT_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	_SYS_MUTEX_H
+#define	_SYS_RWLOCK_H
+#define	_SYS_CONDVAR_H
+#define	_SYS_SYSTM_H
+#define	_SYS_DEBUG_H
+#define	_SYS_T_LOCK_H
+#define	_SYS_VNODE_H
+#define	_SYS_VFS_H
+#define	_SYS_SUNDDI_H
+#define	_SYS_CALLB_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+#include <synch.h>
+#include <thread.h>
+#include <assert.h>
+#include <alloca.h>
+#include <umem.h>
+#include <limits.h>
+#include <atomic.h>
+#include <dirent.h>
+#include <time.h>
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/resource.h>
+#include <sys/byteorder.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/zfs_debug.h>
+#include <sys/sdt.h>
+
+/*
+ * Debugging
+ */
+
+/*
+ * Note that we are not using the debugging levels.
+ */
+
+#define	CE_CONT		0	/* continuation		*/
+#define	CE_NOTE		1	/* notice		*/
+#define	CE_WARN		2	/* warning		*/
+#define	CE_PANIC	3	/* panic		*/
+#define	CE_IGNORE	4	/* print nothing	*/
+
+/*
+ * ZFS debugging
+ */
+
+#ifdef ZFS_DEBUG
+extern void dprintf_setup(int *argc, char **argv);
+#endif /* ZFS_DEBUG */
+
+extern void cmn_err(int, const char *, ...);
+extern void panic(const char *, ...);
+extern void vpanic(const char *, __va_list);
+
+/* This definition is copied from assert.h. */
+#if defined(__STDC__)
+#if __STDC_VERSION__ - 0 >= 199901L
+#define	verify(EX) (void)((EX) || \
+	(__assert_c99(#EX, __FILE__, __LINE__, __func__), 0))
+#else
+#define	verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0))
+#endif /* __STDC_VERSION__ - 0 >= 199901L */
+#else
+#define	verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0))
+#endif	/* __STDC__ */
+
+
+#define	VERIFY	verify
+#define	ASSERT	assert
+
+extern void __assert(const char *, const char *, int);
+
+#ifdef lint
+#define	VERIFY3_IMPL(x, y, z, t)	if (x == z) ((void)0)
+#else
+/* BEGIN CSTYLED */
+#define	VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
+	const TYPE __left = (TYPE)(LEFT); \
+	const TYPE __right = (TYPE)(RIGHT); \
+	if (!(__left OP __right)) { \
+		char *__buf = alloca(256); \
+		(void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \
+			#LEFT, #OP, #RIGHT, \
+			(u_longlong_t)__left, #OP, (u_longlong_t)__right); \
+		__assert(__buf, __FILE__, __LINE__); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+/* END CSTYLED */
+#endif /* lint */
+
+#define	VERIFY3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
+#define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
+#define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
+
+#ifdef NDEBUG
+#define	ASSERT3S(x, y, z)	((void)0)
+#define	ASSERT3U(x, y, z)	((void)0)
+#define	ASSERT3P(x, y, z)	((void)0)
+#else
+#define	ASSERT3S(x, y, z)	VERIFY3S(x, y, z)
+#define	ASSERT3U(x, y, z)	VERIFY3U(x, y, z)
+#define	ASSERT3P(x, y, z)	VERIFY3P(x, y, z)
+#endif
+
+/*
+ * Dtrace SDT probes have different signatures in userland than they do in
+ * kernel.  If they're being used in kernel code, re-define them out of
+ * existence for their counterparts in libzpool.
+ */
+
+#ifdef DTRACE_PROBE1
+#undef	DTRACE_PROBE1
+#define	DTRACE_PROBE1(a, b, c)	((void)0)
+#endif	/* DTRACE_PROBE1 */
+
+#ifdef DTRACE_PROBE2
+#undef	DTRACE_PROBE2
+#define	DTRACE_PROBE2(a, b, c, d, e)	((void)0)
+#endif	/* DTRACE_PROBE2 */
+
+/*
+ * Threads
+ */
+#define	curthread	((void *)(uintptr_t)thr_self())
+
+typedef struct kthread kthread_t;
+
+#define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
+	zk_thread_create(func, arg)
+#define	thread_exit() thr_exit(0)
+
+extern kthread_t *zk_thread_create(void (*func)(), void *arg);
+
+#define	issig(why)	(FALSE)
+#define	ISSIG(thr, why)	(FALSE)
+
+/*
+ * Mutexes
+ */
+typedef struct kmutex {
+	void	*m_owner;
+	mutex_t	m_lock;
+} kmutex_t;
+
+#define	MUTEX_DEFAULT	USYNC_THREAD
+#undef MUTEX_HELD
+#define	MUTEX_HELD(m) _mutex_held(&(m)->m_lock)
+
+/*
+ * Argh -- we have to get cheesy here because the kernel and userland
+ * have different signatures for the same routine.
+ */
+extern int _mutex_init(mutex_t *mp, int type, void *arg);
+extern int _mutex_destroy(mutex_t *mp);
+
+#define	mutex_init(mp, b, c, d)		zmutex_init((kmutex_t *)(mp))
+#define	mutex_destroy(mp)		zmutex_destroy((kmutex_t *)(mp))
+
+extern void zmutex_init(kmutex_t *mp);
+extern void zmutex_destroy(kmutex_t *mp);
+extern void mutex_enter(kmutex_t *mp);
+extern void mutex_exit(kmutex_t *mp);
+extern int mutex_tryenter(kmutex_t *mp);
+extern void *mutex_owner(kmutex_t *mp);
+
+/*
+ * RW locks
+ */
+typedef struct krwlock {
+	void		*rw_owner;
+	rwlock_t	rw_lock;
+} krwlock_t;
+
+typedef int krw_t;
+
+#define	RW_READER	0
+#define	RW_WRITER	1
+#define	RW_DEFAULT	USYNC_THREAD
+
+#undef RW_READ_HELD
+#define	RW_READ_HELD(x)		_rw_read_held(&(x)->rw_lock)
+
+#undef RW_WRITE_HELD
+#define	RW_WRITE_HELD(x)	_rw_write_held(&(x)->rw_lock)
+
+extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
+extern void rw_destroy(krwlock_t *rwlp);
+extern void rw_enter(krwlock_t *rwlp, krw_t rw);
+extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
+extern int rw_tryupgrade(krwlock_t *rwlp);
+extern void rw_exit(krwlock_t *rwlp);
+#define	rw_downgrade(rwlp) do { } while (0)
+
+/*
+ * Condition variables
+ */
+typedef cond_t kcondvar_t;
+
+#define	CV_DEFAULT	USYNC_THREAD
+
+extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
+extern void cv_destroy(kcondvar_t *cv);
+extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
+extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern void cv_signal(kcondvar_t *cv);
+extern void cv_broadcast(kcondvar_t *cv);
+
+/*
+ * Kernel memory
+ */
+#define	KM_SLEEP		UMEM_NOFAIL
+#define	KM_NOSLEEP		UMEM_DEFAULT
+#define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
+#define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
+#define	kmem_free(_b, _s)	umem_free(_b, _s)
+#define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
+	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
+#define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
+#define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
+#define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
+#define	kmem_debugging()	0
+#define	kmem_cache_reap_now(c)
+
+typedef umem_cache_t kmem_cache_t;
+
+/*
+ * Task queues
+ */
+typedef struct taskq taskq_t;
+typedef uintptr_t taskqid_t;
+typedef void (task_func_t)(void *);
+
+#define	TASKQ_PREPOPULATE	0x0001
+#define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
+#define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
+
+#define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
+#define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
+#define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
+
+extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern void	taskq_destroy(taskq_t *);
+extern void	taskq_wait(taskq_t *);
+extern int	taskq_member(taskq_t *, void *);
+
+/*
+ * vnodes
+ */
+typedef struct vnode {
+	uint64_t	v_size;
+	int		v_fd;
+	char		*v_path;
+} vnode_t;
+
+typedef struct vattr {
+	uint_t		va_mask;	/* bit-mask of attributes */
+	u_offset_t	va_size;	/* file size in bytes */
+} vattr_t;
+
+#define	AT_TYPE		0x0001
+#define	AT_MODE		0x0002
+#define	AT_UID		0x0004
+#define	AT_GID		0x0008
+#define	AT_FSID		0x0010
+#define	AT_NODEID	0x0020
+#define	AT_NLINK	0x0040
+#define	AT_SIZE		0x0080
+#define	AT_ATIME	0x0100
+#define	AT_MTIME	0x0200
+#define	AT_CTIME	0x0400
+#define	AT_RDEV		0x0800
+#define	AT_BLKSIZE	0x1000
+#define	AT_NBLOCKS	0x2000
+#define	AT_SEQ		0x8000
+
+#define	CRCREAT		0
+
+#define	VOP_CLOSE(vp, f, c, o, cr)	0
+#define	VOP_PUTPAGE(vp, of, sz, fl, cr)	0
+#define	VOP_GETATTR(vp, vap, fl, cr)	((vap)->va_size = (vp)->v_size, 0)
+
+#define	VOP_FSYNC(vp, f, cr)	fsync((vp)->v_fd)
+
+#define	VN_RELE(vp)	vn_close(vp)
+
+extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
+    int x2, int x3);
+extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp,
+    int x2, int x3, vnode_t *vp);
+extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len,
+    offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp);
+extern void vn_close(vnode_t *vp);
+
+#define	vn_remove(path, x1, x2)		remove(path)
+#define	vn_rename(from, to, seg)	rename((from), (to))
+
+extern vnode_t *rootdir;
+
+#include <sys/file.h>		/* for FREAD, FWRITE, etc */
+
+/*
+ * Random stuff
+ */
+#define	lbolt	(gethrtime() >> 23)
+#define	lbolt64	(gethrtime() >> 23)
+#define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
+
+extern void delay(clock_t ticks);
+
+#define	gethrestime_sec() time(NULL)
+
+#define	max_ncpus	64
+
+#define	minclsyspri	60
+#define	maxclsyspri	99
+
+#define	CPU_SEQID	(thr_self() & (max_ncpus - 1))
+
+#define	kcred		NULL
+#define	CRED()		NULL
+
+extern uint64_t physmem;
+
+extern int highbit(ulong_t i);
+extern int random_get_bytes(uint8_t *ptr, size_t len);
+extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
+
+extern void kernel_init(int);
+extern void kernel_fini(void);
+
+struct spa;
+extern void nicenum(uint64_t num, char *buf);
+extern void show_pool_stats(struct spa *);
+
+typedef struct callb_cpr {
+	kmutex_t	*cc_lockp;
+} callb_cpr_t;
+
+#define	CALLB_CPR_INIT(cp, lockp, func, name)	{		\
+	(cp)->cc_lockp = lockp;					\
+}
+
+#define	CALLB_CPR_SAFE_BEGIN(cp) {				\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+}
+
+#define	CALLB_CPR_SAFE_END(cp, lockp) {				\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+}
+
+#define	CALLB_CPR_EXIT(cp) {					\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+	mutex_exit((cp)->cc_lockp);				\
+}
+
+#define	zone_dataset_visible(x, y)	(1)
+#define	INGLOBALZONE(z)			(1)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/usr/src/lib/libzpool/common/taskq.c b/usr/src/lib/libzpool/common/taskq.c
new file mode 100644
index 000000000000..f7b65718c35a
--- /dev/null
+++ b/usr/src/lib/libzpool/common/taskq.c
@@ -0,0 +1,250 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+
+int taskq_now;
+
+typedef struct task {
+	struct task	*task_next;
+	struct task	*task_prev;
+	task_func_t	*task_func;
+	void		*task_arg;
+} task_t;
+
+#define	TASKQ_ACTIVE	0x00010000
+
+struct taskq {
+	kmutex_t	tq_lock;
+	krwlock_t	tq_threadlock;
+	kcondvar_t	tq_dispatch_cv;
+	kcondvar_t	tq_wait_cv;
+	thread_t	*tq_threadlist;
+	int		tq_flags;
+	int		tq_active;
+	int		tq_nthreads;
+	int		tq_nalloc;
+	int		tq_minalloc;
+	int		tq_maxalloc;
+	task_t		*tq_freelist;
+	task_t		tq_task;
+};
+
+static task_t *
+task_alloc(taskq_t *tq, int tqflags)
+{
+	task_t *t;
+
+	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+		tq->tq_freelist = t->task_next;
+	} else {
+		mutex_exit(&tq->tq_lock);
+		if (tq->tq_nalloc >= tq->tq_maxalloc) {
+			if (!(tqflags & KM_SLEEP)) {
+				mutex_enter(&tq->tq_lock);
+				return (NULL);
+			}
+			/*
+			 * We don't want to exceed tq_maxalloc, but we can't
+			 * wait for other tasks to complete (and thus free up
+			 * task structures) without risking deadlock with
+			 * the caller.  So, we just delay for one second
+			 * to throttle the allocation rate.
+			 */
+			delay(hz);
+		}
+		t = kmem_alloc(sizeof (task_t), tqflags);
+		mutex_enter(&tq->tq_lock);
+		if (t != NULL)
+			tq->tq_nalloc++;
+	}
+	return (t);
+}
+
+static void
+task_free(taskq_t *tq, task_t *t)
+{
+	if (tq->tq_nalloc <= tq->tq_minalloc) {
+		t->task_next = tq->tq_freelist;
+		tq->tq_freelist = t;
+	} else {
+		tq->tq_nalloc--;
+		mutex_exit(&tq->tq_lock);
+		kmem_free(t, sizeof (task_t));
+		mutex_enter(&tq->tq_lock);
+	}
+}
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
+{
+	task_t *t;
+
+	if (taskq_now) {	/* synchronous mode: run in the caller */
+		func(arg);
+		return (1);
+	}
+
+	mutex_enter(&tq->tq_lock);
+	ASSERT(tq->tq_flags & TASKQ_ACTIVE);
+	if ((t = task_alloc(tq, tqflags)) == NULL) {
+		mutex_exit(&tq->tq_lock);
+		return (0);	/* no task_t available: nothing was queued */
+	}
+	t->task_next = &tq->tq_task;	/* link in just before the list head */
+	t->task_prev = tq->tq_task.task_prev;
+	t->task_next->task_prev = t;
+	t->task_prev->task_next = t;
+	t->task_func = func;
+	t->task_arg = arg;
+	cv_signal(&tq->tq_dispatch_cv);
+	mutex_exit(&tq->tq_lock);
+	return (1);
+}
+
+void
+taskq_wait(taskq_t *tq)
+{
+	mutex_enter(&tq->tq_lock);
+	while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0)
+		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+	mutex_exit(&tq->tq_lock);
+}
+
+static void *
+taskq_thread(void *arg)
+{
+	taskq_t *tq = arg;
+	task_t *t;
+
+	mutex_enter(&tq->tq_lock);
+	while (tq->tq_flags & TASKQ_ACTIVE) {
+		if ((t = tq->tq_task.task_next) == &tq->tq_task) {
+			if (--tq->tq_active == 0)
+				cv_broadcast(&tq->tq_wait_cv);
+			cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
+			tq->tq_active++;
+			continue;
+		}
+		t->task_prev->task_next = t->task_next;
+		t->task_next->task_prev = t->task_prev;
+		mutex_exit(&tq->tq_lock);
+
+		rw_enter(&tq->tq_threadlock, RW_READER);
+		t->task_func(t->task_arg);
+		rw_exit(&tq->tq_threadlock);
+
+		mutex_enter(&tq->tq_lock);
+		task_free(tq, t);
+	}
+	tq->tq_nthreads--;
+	cv_broadcast(&tq->tq_wait_cv);
+	mutex_exit(&tq->tq_lock);
+	return (NULL);
+}
+
+/*ARGSUSED*/
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+	int minalloc, int maxalloc, uint_t flags)
+{
+	taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
+	int t;
+
+	rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
+	tq->tq_flags = flags | TASKQ_ACTIVE;
+	tq->tq_active = nthreads;
+	tq->tq_nthreads = nthreads;
+	tq->tq_minalloc = minalloc;
+	tq->tq_maxalloc = maxalloc;
+	tq->tq_task.task_next = &tq->tq_task;
+	tq->tq_task.task_prev = &tq->tq_task;
+	tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP);
+
+	if (flags & TASKQ_PREPOPULATE) {
+		mutex_enter(&tq->tq_lock);
+		while (minalloc-- > 0)
+			task_free(tq, task_alloc(tq, KM_SLEEP));
+		mutex_exit(&tq->tq_lock);
+	}
+
+	for (t = 0; t < nthreads; t++)
+		(void) thr_create(0, 0, taskq_thread,
+		    tq, THR_BOUND, &tq->tq_threadlist[t]);
+
+	return (tq);
+}
+
+void
+taskq_destroy(taskq_t *tq)
+{
+	int t;
+	int nthreads = tq->tq_nthreads;
+
+	taskq_wait(tq);
+
+	mutex_enter(&tq->tq_lock);
+
+	tq->tq_flags &= ~TASKQ_ACTIVE;
+	cv_broadcast(&tq->tq_dispatch_cv);
+
+	while (tq->tq_nthreads != 0)
+		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+
+	tq->tq_minalloc = 0;
+	while (tq->tq_nalloc != 0) {
+		ASSERT(tq->tq_freelist != NULL);
+		task_free(tq, task_alloc(tq, KM_SLEEP));
+	}
+
+	mutex_exit(&tq->tq_lock);
+
+	for (t = 0; t < nthreads; t++)
+		(void) thr_join(tq->tq_threadlist[t], NULL, NULL);
+
+	kmem_free(tq->tq_threadlist, nthreads * sizeof (thread_t));
+
+	rw_destroy(&tq->tq_threadlock);
+
+	kmem_free(tq, sizeof (taskq_t));
+}
+
+int
+taskq_member(taskq_t *tq, void *t)
+{
+	int i;
+
+	if (taskq_now)
+		return (1);
+
+	for (i = 0; i < tq->tq_nthreads; i++)
+		if (tq->tq_threadlist[i] == (thread_t)(uintptr_t)t)
+			return (1);
+
+	return (0);
+}
diff --git a/usr/src/lib/libzpool/common/util.c b/usr/src/lib/libzpool/common/util.c
new file mode 100644
index 000000000000..28a670470270
--- /dev/null
+++ b/usr/src/lib/libzpool/common/util.c
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <assert.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/spa.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Routines needed by more than one client of libzpool.
+ */
+
+/*
+ * Format num into buf (at least 6 bytes) as a value scaled by powers of 1024
+ * with a " KMGTPE" suffix; powers of two and values >= 100 get no decimals.
+ */
+void
+nicenum(uint64_t num, char *buf)
+{
+	uint64_t n = num;
+	int index = 0;
+	double scaled;
+	char u;
+
+	/* Round to nearest; "n + 512" would wrap for num near UINT64_MAX. */
+	for (; n >= 1024; index++)
+		n = n / 1024 + (n % 1024 >= 1024 / 2);
+	u = " KMGTPE"[index];
+	scaled = (double)num / (1ULL << 10 * index);
+	if (index == 0)
+		(void) sprintf(buf, "%llu", (u_longlong_t)n);
+	else if ((num & (num - 1)) == 0 || n >= 100)
+		(void) sprintf(buf, "%llu%c", (u_longlong_t)n, u);
+	else if (n < 10)
+		(void) sprintf(buf, "%.2f%c", scaled, u);
+	else
+		(void) sprintf(buf, "%.1f%c", scaled, u);
+}
+
+/*
+ * Print one row of statistics for the vdev config nv, labelled desc, then
+ * recurse into its children at indent + 2; indent 0 also prints headings.
+ */
+static void
+show_vdev_stats(const char *desc, nvlist_t *nv, int indent)
+{
+	char used[6], avail[6], rops[6], wops[6], rbytes[6], wbytes[6];
+	char rerr[6], werr[6], cerr[6];
+	uint_t i, nkids, nstats;
+	nvlist_t **kids;
+	vdev_stat_t *vs;
+	uint64_t secs;
+
+	if (indent == 0) {
+		(void) printf("                     "
+		    " capacity   operations   bandwidth  ---- errors ----\n");
+		(void) printf("description          "
+		    "used avail  read write  read write  read write cksum\n");
+	}
+	VERIFY(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	    (uint64_t **)&vs, &nstats) == 0);
+	secs = MAX(1, vs->vs_timestamp / NANOSEC);	/* never 0: divisor */
+	nicenum(vs->vs_alloc, used);
+	nicenum(vs->vs_space - vs->vs_alloc, avail);
+	nicenum(vs->vs_ops[ZIO_TYPE_READ] / secs, rops);
+	nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / secs, wops);
+	nicenum(vs->vs_bytes[ZIO_TYPE_READ] / secs, rbytes);
+	nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / secs, wbytes);
+	nicenum(vs->vs_read_errors, rerr);
+	nicenum(vs->vs_write_errors, werr);
+	nicenum(vs->vs_checksum_errors, cerr);
+
+	/* With vs_space of 0 the used/avail columns are printed empty. */
+	(void) printf("%*s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
+	    indent, "", indent - 19 - (vs->vs_space ? 0 : 12), desc,
+	    vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
+	    vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
+	    rops, wops, rbytes, wbytes, rerr, werr, cerr);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return;
+	/* Label each child by its path, else its type, else "<unknown>". */
+	for (i = 0; i < nkids; i++) {
+		char *label;
+		if (nvlist_lookup_string(kids[i], ZPOOL_CONFIG_PATH, &label) &&
+		    nvlist_lookup_string(kids[i], ZPOOL_CONFIG_TYPE, &label))
+			label = "<unknown>";
+		show_vdev_stats(label, kids[i], indent + 2);
+	}
+}
+
+/* Print the per-vdev statistics table for the whole pool spa. */
+void
+show_pool_stats(spa_t *spa)
+{
+	nvlist_t *config = NULL, *nvroot = NULL;
+
+	spa_config_enter(spa, RW_READER);
+	VERIFY(spa_get_stats(spa_name(spa), &config) == 0);
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	show_vdev_stats(spa_name(spa), nvroot, 0);
+	spa_config_exit(spa);
+	nvlist_free(config);	/* config owns nvroot, so free it last */
+}
diff --git a/usr/src/lib/libzpool/i386/Makefile b/usr/src/lib/libzpool/i386/Makefile
new file mode 100644
index 000000000000..3ae822d92044
--- /dev/null
+++ b/usr/src/lib/libzpool/i386/Makefile
@@ -0,0 +1,31 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+install: all $(ROOTLIBS) $(ROOTLINKS) $(ROOTLINT)
diff --git a/usr/src/lib/libzpool/inc.flg b/usr/src/lib/libzpool/inc.flg
new file mode 100644
index 000000000000..94a1191086e4
--- /dev/null
+++ b/usr/src/lib/libzpool/inc.flg
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+
+find_files "s.*" usr/src/common/zfs
+find_files "s.*" usr/src/uts/common/fs/zfs/sys
+echo_file usr/src/uts/common/sys/fs/zfs.h
diff --git a/usr/src/lib/libzpool/sparcv9/Makefile b/usr/src/lib/libzpool/sparcv9/Makefile
new file mode 100644
index 000000000000..49cb41334230
--- /dev/null
+++ b/usr/src/lib/libzpool/sparcv9/Makefile
@@ -0,0 +1,34 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+# Use the big-PIC compiler flags for the sparcv9 C objects of this library.
+sparcv9_C_PICFLAGS =	$(sparcv9_C_BIGPICFLAGS)
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT)
diff --git a/usr/src/pkgdefs/Makefile b/usr/src/pkgdefs/Makefile
index 6d28956f2730..a4f8fb383b0d 100644
--- a/usr/src/pkgdefs/Makefile
+++ b/usr/src/pkgdefs/Makefile
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,7 @@
 #
 # CDDL HEADER END
 #
+
 #
 # Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
@@ -327,6 +327,8 @@ COMMON_SUBDIRS= \
 	SUNWqosu \
 	SUNWxge \
 	SUNWchxge \
+	SUNWzfsr \
+	SUNWzfsu \
 	SUNWzoneint \
 	SUNWzoner \
 	SUNWzoneu
diff --git a/usr/src/pkgdefs/SUNW0on/prototype_com b/usr/src/pkgdefs/SUNW0on/prototype_com
index 367e5ea542f8..bb5046fd1562 100644
--- a/usr/src/pkgdefs/SUNW0on/prototype_com
+++ b/usr/src/pkgdefs/SUNW0on/prototype_com
@@ -50,6 +50,7 @@ f none usr/lib/locale/C/LC_MESSAGES/SUN4.po              644 root bin
 f none usr/lib/locale/C/LC_MESSAGES/SUN4U.po             644 root bin
 f none usr/lib/locale/C/LC_MESSAGES/SUNOS.po             644 root bin
 f none usr/lib/locale/C/LC_MESSAGES/PCI.po               644 root bin
+f none usr/lib/locale/C/LC_MESSAGES/ZFS.po               644 root bin
 f none usr/lib/locale/C/LC_MESSAGES/SUNW_OST_ADMIN.po    644 root sys
 f none usr/lib/locale/C/LC_MESSAGES/SUNW_OST_NETRPC.po   644 root sys
 f none usr/lib/locale/C/LC_MESSAGES/SUNW_OST_OSCMD.po    644 root sys
@@ -269,6 +270,8 @@ f none usr/lib/help/profiles/locale/RtFileSysSecurity.html 444 root bin
 f none usr/lib/help/profiles/locale/RtDHCPMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/RtIPFilterMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/RtDatAdmin.html 444 root bin
+f none usr/lib/help/profiles/locale/RtZFSFileSysMngmnt.html 444 root bin
+f none usr/lib/help/profiles/locale/RtZFSStorageMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/RtZoneMngmnt.html 444 root bin
 #
 #
diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com
index c95f405f0d70..dc37682cb9f0 100644
--- a/usr/src/pkgdefs/SUNWcsu/prototype_com
+++ b/usr/src/pkgdefs/SUNWcsu/prototype_com
@@ -516,6 +516,8 @@ f none usr/lib/help/profiles/locale/C/RtKerberosClntMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/C/RtKerberosSrvrMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/C/RtIPFilterMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/C/RtDatAdmin.html 444 root bin
+f none usr/lib/help/profiles/locale/C/RtZFSFileSysMngmnt.html 444 root bin
+f none usr/lib/help/profiles/locale/C/RtZFSStorageMngmnt.html 444 root bin
 f none usr/lib/help/profiles/locale/C/RtZoneMngmnt.html 444 root bin
 d none usr/lib/iconv 755 root bin
 f none usr/lib/iconv/646da.8859.t 444 root bin
diff --git a/usr/src/pkgdefs/SUNWfmd/prototype_com b/usr/src/pkgdefs/SUNWfmd/prototype_com
index fbffebe0bd90..e0fd2e8da368 100644
--- a/usr/src/pkgdefs/SUNWfmd/prototype_com
+++ b/usr/src/pkgdefs/SUNWfmd/prototype_com
@@ -48,6 +48,7 @@ f none usr/lib/fm/dict/FMD.dict 444 root bin
 f none usr/lib/fm/dict/SMF.dict 444 root bin
 f none usr/lib/fm/dict/SUNOS.dict 444 root bin
 f none usr/lib/fm/dict/PCI.dict 444 root bin
+f none usr/lib/fm/dict/ZFS.dict 444 root bin
 d none usr/lib/fm/eft 755 root bin
 f none usr/lib/fm/eft/pci.eft 444 root bin
 d none usr/lib/fm/fmd 755 root bin
@@ -101,6 +102,7 @@ f none usr/lib/locale/C/LC_MESSAGES/FMD.mo 444 root bin
 f none usr/lib/locale/C/LC_MESSAGES/SMF.mo 444 root bin
 f none usr/lib/locale/C/LC_MESSAGES/SUNOS.mo 444 root bin
 f none usr/lib/locale/C/LC_MESSAGES/PCI.mo 444 root bin
+f none usr/lib/locale/C/LC_MESSAGES/ZFS.mo 444 root bin
 d none usr/lib/mdb 755 root sys
 d none usr/lib/mdb/proc 755 root sys
 f none usr/lib/mdb/proc/fmd.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index b0cb465c8081..0f7d9b1d1e35 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -187,6 +187,7 @@ f none usr/include/libsysevent.h 644 root bin
 f none usr/include/libsysevent_impl.h 644 root bin
 f none usr/include/libsvm.h 644 root bin
 f none usr/include/libw.h 644 root bin
+f none usr/include/libzfs.h 644 root bin
 f none usr/include/libzoneinfo.h 644 root bin
 f none usr/include/limits.h 644 root bin
 f none usr/include/linenum.h 644 root bin
@@ -714,6 +715,7 @@ f none usr/include/sys/fs/ufs_prot.h 644 root bin
 f none usr/include/sys/fs/ufs_quota.h 644 root bin
 f none usr/include/sys/fs/ufs_snap.h 644 root bin
 f none usr/include/sys/fs/ufs_trans.h 644 root bin
+f none usr/include/sys/fs/zfs.h 644 root bin
 f none usr/include/sys/fs_subr.h 644 root bin
 f none usr/include/sys/fsid.h 644 root bin
 f none usr/include/sys/fssnap.h 644 root bin
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_com b/usr/src/pkgdefs/SUNWmdb/prototype_com
index 718ccb5b8c78..2779f8c1cdc3 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_com
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_com
@@ -44,6 +44,7 @@ d none usr/lib/mdb/disasm 755 root sys
 d none usr/lib/mdb/kvm 755 root sys
 d none usr/lib/mdb/proc 755 root sys
 f none usr/lib/mdb/proc/ld.so 555 root sys
+f none usr/lib/mdb/proc/libavl.so 555 root sys
 f none usr/lib/mdb/proc/libc.so 555 root sys
 f none usr/lib/mdb/proc/libnvpair.so 555 root sys
 f none usr/lib/mdb/proc/libsysevent.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386
index 0e1cdbce269d..d49479b7d8e7 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_i386
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -93,6 +93,7 @@ f none usr/lib/mdb/kvm/ufs_log.so 555 root sys
 f none usr/lib/mdb/kvm/uhci.so 555 root sys
 f none usr/lib/mdb/kvm/usba.so 555 root sys
 d none usr/lib/mdb/proc/amd64 755 root bin
+f none usr/lib/mdb/proc/amd64/libavl.so 555 root sys
 f none usr/lib/mdb/proc/amd64/libc.so 555 root sys
 f none usr/lib/mdb/proc/amd64/libnvpair.so 555 root sys
 f none usr/lib/mdb/proc/amd64/libsysevent.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
index 88f196c2a33b..9099b759cfa5 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
@@ -70,6 +70,7 @@ f none usr/lib/mdb/kvm/sparcv9/ufs.so 555 root sys
 f none usr/lib/mdb/kvm/sparcv9/ufs_log.so 555 root sys
 f none usr/lib/mdb/kvm/sparcv9/usba.so 555 root sys
 d none usr/lib/mdb/proc/sparcv9 755 root sys
+f none usr/lib/mdb/proc/sparcv9/libavl.so 555 root sys
 f none usr/lib/mdb/proc/sparcv9/libc.so 555 root sys
 f none usr/lib/mdb/proc/sparcv9/libnvpair.so 555 root sys
 f none usr/lib/mdb/proc/sparcv9/libsysevent.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWzfsr/Makefile b/usr/src/pkgdefs/SUNWzfsr/Makefile
new file mode 100644
index 000000000000..1cb8f7d86fba
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/Makefile
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+.KEEP_STATE:
+
+all: $(FILES) depend
+
+install: all pkg
+
+include ../Makefile.targ
diff --git a/usr/src/pkgdefs/SUNWzfsr/depend b/usr/src/pkgdefs/SUNWzfsr/depend
new file mode 100644
index 000000000000..8c48366b94cc
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/depend
@@ -0,0 +1,52 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This package information file defines software dependencies associated
+# with the pkg.  You can define three types of pkg dependencies with this file:
+#	 P indicates a prerequisite for installation
+#	 I indicates an incompatible package
+#	 R indicates a reverse dependency
+# <pkg.abbr> see pkginfo(4), PKG parameter
+# <name> see pkginfo(4), NAME parameter
+# <version> see pkginfo(4), VERSION parameter
+# <arch> see pkginfo(4), ARCH parameter
+# <type> <pkg.abbr> <name>
+#	(<arch>)<version>
+#	(<arch>)<version>
+#	...
+# <type> <pkg.abbr> <name>
+# ...
+#
+
+P SUNWcar	Core Architecture, (Root)
+P SUNWcakr	Core Solaris Kernel Architecture (Root)
+P SUNWkvm	Core Architecture, (Kvm)
+P SUNWcsr	Core Solaris, (Root)
+P SUNWckr	Core Solaris Kernel (Root)
+P SUNWcsu	Core Solaris, (Usr)
+P SUNWcsd	Core Solaris Devices
+P SUNWcsl	Core Solaris Libraries
diff --git a/usr/src/pkgdefs/SUNWzfsr/pkginfo.tmpl b/usr/src/pkgdefs/SUNWzfsr/pkginfo.tmpl
new file mode 100644
index 000000000000..14d937677c1d
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/pkginfo.tmpl
@@ -0,0 +1,47 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWzfsr"
+NAME="ZFS (Root)"
+ARCH="ISA"
+VERSION="ONVERS,REV=0.0.0"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+SUNW_PKGTYPE="root"
+MAXINST="1000"
+CATEGORY="system"
+DESC="ZFS root components"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+CLASSES="none"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
+SUNW_PKG_ALLZONES="true"
+SUNW_PKG_HOLLOW="true"
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_com b/usr/src/pkgdefs/SUNWzfsr/prototype_com
new file mode 100644
index 000000000000..ff884b96eb6b
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_com
@@ -0,0 +1,50 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+# packaging files
+i copyright
+i depend
+i pkginfo
+
+#
+# SUNWzfsr
+#
+d none etc 755 root sys
+d none etc/fs 755 root sys
+d none etc/fs/zfs 755 root sys
+s none etc/fs/zfs/mount=../../../sbin/zfs
+s none etc/fs/zfs/umount=../../../sbin/zfs
+d none etc/zfs 755 root sys
+d none kernel 755 root sys
+d none kernel/kmdb 755 root sys
+d none kernel/drv 755 root sys
+f none kernel/drv/zfs.conf 644 root sys
+d none kernel/fs 755 root sys
+d none sbin 755 root sys
+f none sbin/zfs 555 root bin
+f none sbin/zpool 555 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_i386 b/usr/src/pkgdefs/SUNWzfsr/prototype_i386
new file mode 100644
index 000000000000..152f73c49140
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_i386
@@ -0,0 +1,42 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+!include prototype_com
+
+#
+# SUNWzfsr
+#
+f none kernel/drv/zfs 755 root sys
+d none kernel/drv/amd64 755 root sys
+f none kernel/drv/amd64/zfs 755 root sys
+l none kernel/fs/zfs=../../kernel/drv/zfs
+d none kernel/fs/amd64 755 root sys
+l none kernel/fs/amd64/zfs=../../../kernel/drv/amd64/zfs
+f none kernel/kmdb/zfs 555 root sys
+d none kernel/kmdb/amd64 755 root sys
+f none kernel/kmdb/amd64/zfs 555 root sys
diff --git a/usr/src/pkgdefs/SUNWzfsr/prototype_sparc b/usr/src/pkgdefs/SUNWzfsr/prototype_sparc
new file mode 100644
index 000000000000..94914a9e5f92
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsr/prototype_sparc
@@ -0,0 +1,40 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+!include prototype_com
+
+#
+# SUNWzfsr
+#
+d none kernel/drv/sparcv9 755 root sys
+f none kernel/drv/sparcv9/zfs 755 root sys
+d none kernel/fs/sparcv9 755 root sys
+l none kernel/fs/sparcv9/zfs=../../../kernel/drv/sparcv9/zfs
+d none kernel/kmdb/sparcv9 755 root sys
+f none kernel/kmdb/sparcv9/zfs 555 root sys
+
diff --git a/usr/src/pkgdefs/SUNWzfsu/Makefile b/usr/src/pkgdefs/SUNWzfsu/Makefile
new file mode 100644
index 000000000000..1cb8f7d86fba
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/Makefile
@@ -0,0 +1,37 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+.KEEP_STATE:
+
+all: $(FILES) depend
+
+install: all pkg
+
+include ../Makefile.targ
diff --git a/usr/src/pkgdefs/SUNWzfsu/depend b/usr/src/pkgdefs/SUNWzfsu/depend
new file mode 100644
index 000000000000..b76754137eb8
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/depend
@@ -0,0 +1,54 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This package information file defines software dependencies associated
+# with the pkg.  You can define three types of pkg dependencies with this file:
+#	 P indicates a prerequisite for installation
+#	 I indicates an incompatible package
+#	 R indicates a reverse dependency
+# <pkg.abbr> see pkginfo(4), PKG parameter
+# <name> see pkginfo(4), NAME parameter
+# <version> see pkginfo(4), VERSION parameter
+# <arch> see pkginfo(4), ARCH parameter
+# <type> <pkg.abbr> <name>
+#	(<arch>)<version>
+#	(<arch>)<version>
+#	...
+# <type> <pkg.abbr> <name>
+# ...
+#
+
+P SUNWcar	Core Architecture, (Root)
+P SUNWcakr	Core Solaris Kernel Architecture (Root)
+P SUNWkvm	Core Architecture, (Kvm)
+P SUNWcsr	Core Solaris, (Root)
+P SUNWckr	Core Solaris Kernel (Root)
+P SUNWcsu	Core Solaris, (Usr)
+P SUNWcsd	Core Solaris Devices
+P SUNWcsl	Core Solaris Libraries
+P SUNWzfsr	ZFS (Root)
+P SUNWsmapi	Storage Management APIs
diff --git a/usr/src/pkgdefs/SUNWzfsu/pkginfo.tmpl b/usr/src/pkgdefs/SUNWzfsu/pkginfo.tmpl
new file mode 100644
index 000000000000..b3b1df7501ff
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/pkginfo.tmpl
@@ -0,0 +1,47 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWzfsu"
+NAME="ZFS (Usr)"
+ARCH="ISA"
+VERSION="ONVERS,REV=0.0.0"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+SUNW_PKGTYPE="usr"
+MAXINST="1000"
+CATEGORY="system"
+DESC="ZFS libraries and commands"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+CLASSES="none"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
+SUNW_PKG_ALLZONES="true"
+SUNW_PKG_HOLLOW="false"
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_com b/usr/src/pkgdefs/SUNWzfsu/prototype_com
new file mode 100644
index 000000000000..5af24f36af1c
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_com
@@ -0,0 +1,60 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+# packaging files
+i copyright
+i depend
+i pkginfo
+
+#
+# SUNWzfsu
+#
+d none usr 755 root sys
+d none usr/lib 755 root bin
+d none usr/lib/zfs 755 root bin
+f none usr/lib/zfs/availdevs 555 root bin
+d none usr/lib/devfsadm 755 root sys
+d none usr/lib/devfsadm/linkmod 755 root sys
+f none usr/lib/devfsadm/linkmod/SUNW_zfs_link.so 755 root sys
+d none usr/lib/fs 755 root sys
+d none usr/lib/fs/zfs 755 root sys
+f none usr/lib/fs/zfs/fstyp 555 root bin
+s none usr/lib/fs/zfs/mount=../../../../sbin/zfs
+s none usr/lib/fs/zfs/umount=../../../../sbin/zfs
+f none usr/lib/libzfs.so.1 755 root bin
+s none usr/lib/libzfs.so=libzfs.so.1
+f none usr/lib/libzfs_jni.so.1 755 root bin
+s none usr/lib/libzfs_jni.so=libzfs_jni.so.1
+d none usr/lib/mdb 755 root sys
+d none usr/lib/mdb/kvm 755 root sys
+d none usr/lib/mdb/proc 755 root sys
+f none usr/lib/mdb/proc/libzpool.so 555 root sys
+d none usr/sbin 755 root bin
+l none usr/sbin/zdb=../../usr/lib/isaexec
+s none usr/sbin/zfs=../../sbin/zfs
+s none usr/sbin/zpool=../../sbin/zpool
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_i386 b/usr/src/pkgdefs/SUNWzfsu/prototype_i386
new file mode 100644
index 000000000000..7ba736791c1f
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_i386
@@ -0,0 +1,51 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+!include prototype_com
+
+#
+# SUNWzfsu
+#
+d none usr/lib/amd64 755 root bin
+f none usr/lib/amd64/libzfs.so.1 755 root bin
+s none usr/lib/amd64/libzfs.so=libzfs.so.1
+f none usr/lib/amd64/libzfs_jni.so.1 755 root bin
+s none usr/lib/amd64/libzfs_jni.so=libzfs_jni.so.1
+f none usr/lib/amd64/libzpool.so.1 755 root bin
+s none usr/lib/amd64/libzpool.so=libzpool.so.1
+f none usr/lib/libzpool.so.1 755 root bin
+s none usr/lib/libzpool.so=libzpool.so.1
+d none usr/lib/mdb/kvm/amd64 755 root sys
+f none usr/lib/mdb/kvm/amd64/zfs.so 555 root sys
+f none usr/lib/mdb/kvm/zfs.so 555 root sys
+d none usr/lib/mdb/proc/amd64 755 root bin
+f none usr/lib/mdb/proc/amd64/libzpool.so 555 root sys
+d none usr/sbin/i86 755 root bin
+f none usr/sbin/i86/zdb 555 root bin
+d none usr/sbin/amd64 755 root bin
+f none usr/sbin/amd64/zdb 555 root bin
diff --git a/usr/src/pkgdefs/SUNWzfsu/prototype_sparc b/usr/src/pkgdefs/SUNWzfsu/prototype_sparc
new file mode 100644
index 000000000000..308a157ed4bd
--- /dev/null
+++ b/usr/src/pkgdefs/SUNWzfsu/prototype_sparc
@@ -0,0 +1,46 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+!include prototype_com
+
+#
+# SUNWzfsu
+#
+d none usr/lib/mdb/kvm/sparcv9 755 root sys
+f none usr/lib/mdb/kvm/sparcv9/zfs.so 555 root sys
+d none usr/lib/mdb/proc/sparcv9 755 root sys
+f none usr/lib/mdb/proc/sparcv9/libzpool.so 555 root sys
+d none usr/lib/sparcv9 755 root bin
+f none usr/lib/sparcv9/libzfs.so.1 755 root bin
+s none usr/lib/sparcv9/libzfs.so=libzfs.so.1
+f none usr/lib/sparcv9/libzfs_jni.so.1 755 root bin
+s none usr/lib/sparcv9/libzfs_jni.so=libzfs_jni.so.1
+f none usr/lib/sparcv9/libzpool.so.1 755 root bin
+s none usr/lib/sparcv9/libzpool.so=libzpool.so.1
+d none usr/sbin/sparcv9 755 root bin
+f none usr/sbin/sparcv9/zdb 555 root bin
diff --git a/usr/src/pkgdefs/SUNWzoneu/depend b/usr/src/pkgdefs/SUNWzoneu/depend
index f478d84f57fe..1c668182be02 100644
--- a/usr/src/pkgdefs/SUNWzoneu/depend
+++ b/usr/src/pkgdefs/SUNWzoneu/depend
@@ -1,7 +1,4 @@
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
@@ -22,7 +19,14 @@
 #
 # CDDL HEADER END
 #
+
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
 # ident	"%Z%%M%	%I%	%E% SMI"
+#
+
 #
 # This package information file defines software dependencies associated
 # with the pkg.  You can define three types of pkg dependencies with this file:
@@ -56,3 +60,4 @@ P SUNWpool	Resource Pools
 P SUNWluu	Live Upgrade (usr)
 P SUNWluzone	Live Upgrade (zones support)
 P SUNWtecla	Tecla command-line editing library
+P SUNWzfsu	ZFS (Usr)
diff --git a/usr/src/pkgdefs/common_files/i.minorperm_i386 b/usr/src/pkgdefs/common_files/i.minorperm_i386
index 2048234fa362..0d36871d11ee 100644
--- a/usr/src/pkgdefs/common_files/i.minorperm_i386
+++ b/usr/src/pkgdefs/common_files/i.minorperm_i386
@@ -220,6 +220,8 @@ bmc:bmc
 dld:*
 aggr:*
 smbios:smbios
+zfs:*
+zfs:zfs
 EOF
 }
 
diff --git a/usr/src/pkgdefs/common_files/i.minorperm_sparc b/usr/src/pkgdefs/common_files/i.minorperm_sparc
index fc9bbc4ea9eb..180351df7b5d 100644
--- a/usr/src/pkgdefs/common_files/i.minorperm_sparc
+++ b/usr/src/pkgdefs/common_files/i.minorperm_sparc
@@ -283,6 +283,8 @@ ntwdt:*
 dld:*
 aggr:*
 mdesc:*
+zfs:*
+zfs:zfs
 EOF
 }
 
diff --git a/usr/src/pkgdefs/common_files/i.rbac b/usr/src/pkgdefs/common_files/i.rbac
index 1abe037852a6..27d75eab59e1 100644
--- a/usr/src/pkgdefs/common_files/i.rbac
+++ b/usr/src/pkgdefs/common_files/i.rbac
@@ -25,7 +25,7 @@
 #
 # i.rbac
 #
-# Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # class action script for "rbac" class files
@@ -45,14 +45,17 @@
 
 tmp_dir=/tmp
 
-basename_cmd=/usr/bin/basename
-cp_cmd=/usr/bin/cp
-egrep_cmd=/usr/bin/egrep
-mv_cmd=/usr/bin/mv
-nawk_cmd=/usr/bin/nawk
-rm_cmd=/usr/bin/rm
-sed_cmd=/usr/bin/sed
-sort_cmd=/usr/bin/sort
+PATH="/usr/bin:/usr/sbin:${PATH}"
+export PATH
+
+basename_cmd=basename
+cp_cmd=cp
+egrep_cmd=egrep
+mv_cmd=mv
+nawk_cmd=nawk
+rm_cmd=rm
+sed_cmd=sed
+sort_cmd=sort
 
 # $1 is the type
 # $2 is the "old/existing file"
diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386
index 0333f2eacc5d..845ab53571a3 100644
--- a/usr/src/pkgdefs/etc/exception_list_i386
+++ b/usr/src/pkgdefs/etc/exception_list_i386
@@ -699,6 +699,12 @@ lib/amd64/llib-lcmdutils.ln		i386
 lib/amd64/libcmdutils.so		i386
 usr/lib/amd64/libcmdutils.so		i386
 usr/lib/amd64/llib-lcmdutils.ln		i386
+
+#
+# Private interfaces in libsec
+#
+usr/include/aclutils.h			i386
+
 #
 # User<->kernel interface used by cfgadm/IB only
 #
@@ -721,3 +727,19 @@ usr/include/sys/libdevid.h		i386
 #
 lib/libc_i18n.a				i386
 lib/amd64/libc_i18n.a			i386
+
+#
+# ZFS internal tools and lint libraries
+#
+usr/bin/ztest				i386
+usr/bin/i86/ztest			i386
+usr/bin/amd64/ztest			i386
+usr/lib/llib-lzfs.ln			i386
+usr/lib/llib-lzfs			i386
+usr/lib/llib-lzfs_jni			i386
+usr/lib/llib-lzfs_jni.ln		i386
+usr/lib/llib-lzpool			i386
+usr/lib/llib-lzpool.ln			i386
+usr/lib/amd64/llib-lzfs.ln		i386
+usr/lib/amd64/llib-lzfs_jni.ln		i386
+usr/lib/amd64/llib-lzpool.ln		i386
diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc
index 4be255ae8b9b..500564a3eb3a 100644
--- a/usr/src/pkgdefs/etc/exception_list_sparc
+++ b/usr/src/pkgdefs/etc/exception_list_sparc
@@ -733,6 +733,11 @@ usr/lib/llib-lcmdutils.ln			sparc
 usr/lib/libcmdutils.so				sparc
 usr/lib/sparcv9/llib-lcmdutils.ln		sparc
 usr/lib/sparcv9/libcmdutils.so			sparc
+#
+# Private interfaces in libsec
+#
+usr/include/aclutils.h				sparc
+
 #
 # User<->kernel interface used by cfgadm/IB only
 #
@@ -792,3 +797,16 @@ lib/sparcv9/libc_i18n.a				sparc
 # Only the shared object is shipped.
 #
 usr/platform/SUNW,Sun-Fire-T200/lib/llib-lpcp.ln	sparc
+#
+# ZFS internal tools and lint libraries
+#
+usr/bin/ztest				sparc
+usr/bin/sparcv9/ztest			sparc
+usr/lib/llib-lzfs			sparc
+usr/lib/llib-lzfs.ln			sparc
+usr/lib/llib-lzfs_jni			sparc
+usr/lib/llib-lzfs_jni.ln		sparc
+usr/lib/llib-lzpool			sparc
+usr/lib/sparcv9/llib-lzfs.ln		sparc
+usr/lib/sparcv9/llib-lzfs_jni.ln	sparc
+usr/lib/sparcv9/llib-lzpool.ln		sparc
diff --git a/usr/src/psm/promif/ieee1275/common/prom_boot.c b/usr/src/psm/promif/ieee1275/common/prom_boot.c
index bebe45bd0073..a300d4fd7157 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_boot.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_boot.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1994, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -33,7 +33,7 @@ char *
 prom_bootargs(void)
 {
 	int length;
-	dnode_t node;
+	pnode_t node;
 	static char *name = "bootargs";
 	static char bootargs[OBP_MAXPATHLEN];
 
@@ -65,7 +65,7 @@ prom_bootpath(void)
 {
 	static char bootpath[OBP_MAXPATHLEN];
 	int length;
-	dnode_t node;
+	pnode_t node;
 	static char *name = "bootpath";
 
 	if (bootpath[0] != (char)0)
diff --git a/usr/src/psm/promif/ieee1275/common/prom_devtype.c b/usr/src/psm/promif/ieee1275/common/prom_devtype.c
index 33103763372d..626da4695a34 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_devtype.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_devtype.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1994, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -30,7 +30,7 @@
 #include <sys/promimpl.h>
 
 int
-prom_devicetype(dnode_t id, char *type)
+prom_devicetype(pnode_t id, char *type)
 {
 	register int len;
 	char buf[OBP_MAXDRVNAME];
@@ -48,7 +48,7 @@ prom_devicetype(dnode_t id, char *type)
 }
 
 int
-prom_getnode_byname(dnode_t id, char *name)
+prom_getnode_byname(pnode_t id, char *name)
 {
 	int len;
 	char buf[OBP_MAXDRVNAME];
diff --git a/usr/src/psm/promif/ieee1275/common/prom_fb.c b/usr/src/psm/promif/ieee1275/common/prom_fb.c
index bc3cb5f8bfa5..a44f4f599a3a 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_fb.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_fb.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1995, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -35,7 +35,7 @@ prom_stdout_is_framebuffer(void)
 	static int remember = -1;
 
 	if (remember == -1)
-		remember = prom_devicetype((dnode_t) prom_stdout_node(),
+		remember = prom_devicetype((pnode_t)prom_stdout_node(),
 			OBP_DISPLAY);
 	return (remember);
 }
diff --git a/usr/src/psm/promif/ieee1275/common/prom_node.c b/usr/src/psm/promif/ieee1275/common/prom_node.c
index a0c6fe0af1a4..efcd18a3045e 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_node.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_node.c
@@ -32,8 +32,8 @@
 /*
  * Routines for walking the PROMs devinfo tree
  */
-dnode_t
-prom_nextnode(dnode_t nodeid)
+pnode_t
+prom_nextnode(pnode_t nodeid)
 {
 	cell_t ci[5];
 
@@ -50,8 +50,8 @@ prom_nextnode(dnode_t nodeid)
 	return (p1275_cell2dnode(ci[4]));	/* Res1: peer phandle */
 }
 
-dnode_t
-prom_childnode(dnode_t nodeid)
+pnode_t
+prom_childnode(pnode_t nodeid)
 {
 	cell_t ci[5];
 
@@ -74,10 +74,10 @@ prom_childnode(dnode_t nodeid)
  * overhead of a recursive implementation.
  */
 void
-prom_walk_devs(dnode_t node, int (*cb)(dnode_t, void *, void *), void *arg,
+prom_walk_devs(pnode_t node, int (*cb)(pnode_t, void *, void *), void *arg,
     void *result)
 {
-	dnode_t stack[OBP_STACKDEPTH];
+	pnode_t stack[OBP_STACKDEPTH];
 	int stackidx = 0;
 
 	if (node == OBP_NONODE || node == OBP_BADNODE) {
@@ -87,8 +87,8 @@ prom_walk_devs(dnode_t node, int (*cb)(dnode_t, void *, void *), void *arg,
 	stack[0] = node;
 
 	for (;;) {
-		dnode_t curnode = stack[stackidx];
-		dnode_t child;
+		pnode_t curnode = stack[stackidx];
+		pnode_t child;
 
 		/*
 		 * We're out of stuff to do at this level, bump back up a level
@@ -137,19 +137,19 @@ prom_walk_devs(dnode_t node, int (*cb)(dnode_t, void *, void *), void *arg,
  * supplied in 'devtype'.
  */
 static int
-bytype_cb(dnode_t node, void *arg, void *result)
+bytype_cb(pnode_t node, void *arg, void *result)
 {
 	if (prom_devicetype(node, (char *)arg)) {
-		*((dnode_t *)result) = node;
+		*((pnode_t *)result) = node;
 		return (PROM_WALK_TERMINATE);
 	}
 	return (PROM_WALK_CONTINUE);
 }
 
-dnode_t
-prom_findnode_bydevtype(dnode_t node, char *devtype)
+pnode_t
+prom_findnode_bydevtype(pnode_t node, char *devtype)
 {
-	dnode_t result = OBP_NONODE;
+	pnode_t result = OBP_NONODE;
 	prom_walk_devs(node, bytype_cb, devtype, &result);
 	return (result);
 }
@@ -160,19 +160,19 @@ prom_findnode_bydevtype(dnode_t node, char *devtype)
  * returns the first node whose name matches the name supplied in 'name'.
  */
 static int
-byname_cb(dnode_t node, void *arg, void *result)
+byname_cb(pnode_t node, void *arg, void *result)
 {
 	if (prom_getnode_byname(node, (char *)arg)) {
-		*((dnode_t *)result) = node;
+		*((pnode_t *)result) = node;
 		return (PROM_WALK_TERMINATE);
 	}
 	return (PROM_WALK_CONTINUE);
 }
 
-dnode_t
-prom_findnode_byname(dnode_t node, char *name)
+pnode_t
+prom_findnode_byname(pnode_t node, char *name)
 {
-	dnode_t result = OBP_NONODE;
+	pnode_t result = OBP_NONODE;
 	prom_walk_devs(node, byname_cb, name, &result);
 	return (result);
 }
@@ -181,16 +181,16 @@ prom_findnode_byname(dnode_t node, char *name)
  * Return the root nodeid.
  * Calling prom_nextnode(0) returns the root nodeid.
  */
-dnode_t
+pnode_t
 prom_rootnode(void)
 {
-	static dnode_t rootnode;
+	static pnode_t rootnode;
 
 	return (rootnode ? rootnode : (rootnode = prom_nextnode(OBP_NONODE)));
 }
 
-dnode_t
-prom_parentnode(dnode_t nodeid)
+pnode_t
+prom_parentnode(pnode_t nodeid)
 {
 	cell_t ci[5];
 
@@ -207,7 +207,7 @@ prom_parentnode(dnode_t nodeid)
 	return (p1275_cell2dnode(ci[4]));	/* Res1: parent phandle */
 }
 
-dnode_t
+pnode_t
 prom_finddevice(char *path)
 {
 	cell_t ci[5];
@@ -243,14 +243,14 @@ prom_finddevice(char *path)
 		promplat_free(path, len);
 #endif
 
-	return ((dnode_t)p1275_cell2dnode(ci[4])); /* Res1: phandle */
+	return ((pnode_t)p1275_cell2dnode(ci[4])); /* Res1: phandle */
 }
 
-dnode_t
+pnode_t
 prom_chosennode(void)
 {
-	static dnode_t chosen;
-	dnode_t	node;
+	static pnode_t chosen;
+	pnode_t	node;
 
 	if (chosen)
 		return (chosen);
@@ -267,7 +267,7 @@ prom_chosennode(void)
 	 * gcc doesn't recognize "NOTREACHED" and puts the warning.
 	 * To surpress it, returning an integer value is required.
 	 */
-	return ((dnode_t)0);
+	return ((pnode_t)0);
 }
 
 /*
@@ -275,10 +275,10 @@ prom_chosennode(void)
  * /aliases exists in OBP >= 2.4 and in Open Firmware.
  * Returns OBP_BADNODE if it doesn't exist.
  */
-dnode_t
+pnode_t
 prom_alias_node(void)
 {
-	static dnode_t node;
+	static pnode_t node;
 
 	if (node == 0)
 		node = prom_finddevice("/aliases");
@@ -289,10 +289,10 @@ prom_alias_node(void)
  * Returns the nodeid of /options.
  * Returns OBP_BADNODE if it doesn't exist.
  */
-dnode_t
+pnode_t
 prom_optionsnode(void)
 {
-	static dnode_t node;
+	static pnode_t node;
 
 	if (node == 0)
 		node = prom_finddevice("/options");
diff --git a/usr/src/psm/promif/ieee1275/common/prom_prop.c b/usr/src/psm/promif/ieee1275/common/prom_prop.c
index 5440be79e06b..723cc96c22a5 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_prop.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_prop.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -179,7 +179,7 @@ prom_setprop_null(void)
 }
 
 int
-prom_getproplen(dnode_t nodeid, caddr_t name)
+prom_getproplen(pnode_t nodeid, caddr_t name)
 {
 	cell_t ci[6];
 
@@ -199,7 +199,7 @@ prom_getproplen(dnode_t nodeid, caddr_t name)
 
 
 int
-prom_getprop(dnode_t nodeid, caddr_t name, caddr_t value)
+prom_getprop(pnode_t nodeid, caddr_t name, caddr_t value)
 {
 	int len, rv;
 	cell_t ci[8];
@@ -235,7 +235,7 @@ prom_getprop(dnode_t nodeid, caddr_t name, caddr_t value)
 }
 
 int
-prom_bounded_getprop(dnode_t nodeid, caddr_t name, caddr_t value, int len)
+prom_bounded_getprop(pnode_t nodeid, caddr_t name, caddr_t value, int len)
 {
 	cell_t ci[8];
 
@@ -256,7 +256,7 @@ prom_bounded_getprop(dnode_t nodeid, caddr_t name, caddr_t value, int len)
 }
 
 caddr_t
-prom_nextprop(dnode_t nodeid, caddr_t previous, caddr_t next)
+prom_nextprop(pnode_t nodeid, caddr_t previous, caddr_t next)
 {
 	cell_t ci[7];
 
@@ -277,7 +277,7 @@ prom_nextprop(dnode_t nodeid, caddr_t previous, caddr_t next)
 }
 
 int
-prom_setprop(dnode_t nodeid, caddr_t name, caddr_t value, int len)
+prom_setprop(pnode_t nodeid, caddr_t name, caddr_t value, int len)
 {
 	cell_t ci[8];
 #ifdef PROM_32BIT_ADDRS
diff --git a/usr/src/psm/promif/ieee1275/common/prom_stdin.c b/usr/src/psm/promif/ieee1275/common/prom_stdin.c
index f371d5e4ade0..65b5bd6a91b6 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_stdin.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_stdin.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1994, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -57,7 +57,7 @@ prom_stdin_ihandle(void)
 /*
  * Return phandle of stdin
  */
-dnode_t
+pnode_t
 prom_stdin_node(void)
 {
 	static phandle_t pstdin;
@@ -67,7 +67,7 @@ prom_stdin_node(void)
 		return (pstdin);
 
 	if ((istdin = prom_stdin_ihandle()) == (ihandle_t)-1)
-		return (pstdin = (dnode_t)OBP_BADNODE);
+		return (pstdin = (pnode_t)OBP_BADNODE);
 
 	return (pstdin = prom_getphandle(istdin));
 }
diff --git a/usr/src/psm/promif/ieee1275/common/prom_stdout.c b/usr/src/psm/promif/ieee1275/common/prom_stdout.c
index f30bc03119e2..37641d25b074 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_stdout.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_stdout.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1994, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -55,7 +55,7 @@ prom_stdout_ihandle(void)
 /*
  * Return phandle of stdout
  */
-dnode_t
+pnode_t
 prom_stdout_node(void)
 {
 	static phandle_t pstdout;
@@ -65,7 +65,7 @@ prom_stdout_node(void)
 		return (pstdout);
 
 	if ((istdout = prom_stdout_ihandle()) == (ihandle_t)-1)
-		return (pstdout = (dnode_t)OBP_BADNODE);
+		return (pstdout = (pnode_t)OBP_BADNODE);
 
 	return (pstdout = prom_getphandle(istdout));
 }
diff --git a/usr/src/psm/promif/ieee1275/common/prom_test.c b/usr/src/psm/promif/ieee1275/common/prom_test.c
index 3fe84be7de6c..cab00ef1b7e8 100644
--- a/usr/src/psm/promif/ieee1275/common/prom_test.c
+++ b/usr/src/psm/promif/ieee1275/common/prom_test.c
@@ -51,7 +51,7 @@ prom_test(char *service)
 }
 
 int
-prom_test_method(char *method, dnode_t node)
+prom_test_method(char *method, pnode_t node)
 {
 	cell_t ci[6];
 	int rv;
diff --git a/usr/src/psm/promif/ieee1275/sun4/prom_cpuctl.c b/usr/src/psm/promif/ieee1275/sun4/prom_cpuctl.c
index 515c88bffb8d..792e7742a38e 100644
--- a/usr/src/psm/promif/ieee1275/sun4/prom_cpuctl.c
+++ b/usr/src/psm/promif/ieee1275/sun4/prom_cpuctl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1994-2000, 2002 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,7 +48,7 @@ prom_stopcpu_bycpuid(int cpuid)
 
 
 int
-prom_startcpu(dnode_t node, caddr_t pc, int arg)
+prom_startcpu(pnode_t node, caddr_t pc, int arg)
 {
 	cell_t ci[6];
 
@@ -86,7 +86,7 @@ prom_startcpu_bycpuid(int cpuid, caddr_t pc, int arg)
 }
 
 int
-prom_wakeupcpu(dnode_t node)
+prom_wakeupcpu(pnode_t node)
 {
 	cell_t ci[5];
 	int	rv;
@@ -107,7 +107,7 @@ prom_wakeupcpu(dnode_t node)
 }
 
 int
-prom_cpuoff(dnode_t node)
+prom_cpuoff(pnode_t node)
 {
 	cell_t ci[5];
 	int rv;
diff --git a/usr/src/psm/promif/ieee1275/sun4/prom_macaddr.c b/usr/src/psm/promif/ieee1275/sun4/prom_macaddr.c
index 492b66869d0a..1440e390eb92 100644
--- a/usr/src/psm/promif/ieee1275/sun4/prom_macaddr.c
+++ b/usr/src/psm/promif/ieee1275/sun4/prom_macaddr.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1994, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -39,7 +39,7 @@ int
 prom_getmacaddr(ihandle_t hd, caddr_t ea)
 {
 	idprom_t idprom;
-	dnode_t macnodeid;
+	pnode_t macnodeid;
 
 	/*
 	 * Look for the 'mac-address' property in the device node
@@ -64,9 +64,9 @@ prom_getmacaddr(ihandle_t hd, caddr_t ea)
 	 * This code (idprom) is SMCC (and compatibles) platform-centric.
 	 * This code always returns the platform mac address.
 	 */
-	if (prom_getidprom((caddr_t) &idprom, sizeof (idprom)) == 0) {
-		register char *f = (char *) idprom.id_ether;
-		register char *t = ea;
+	if (prom_getidprom((caddr_t)&idprom, sizeof (idprom)) == 0) {
+		char *f = (char *)idprom.id_ether;
+		char *t = ea;
 		int i;
 
 		for (i = 0; i < sizeof (idprom.id_ether); ++i)
diff --git a/usr/src/psm/promif/ieee1275/sun4/prom_vername.c b/usr/src/psm/promif/ieee1275/sun4/prom_vername.c
index c56baf16924f..3d049d7803d5 100644
--- a/usr/src/psm/promif/ieee1275/sun4/prom_vername.c
+++ b/usr/src/psm/promif/ieee1275/sun4/prom_vername.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1995, by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -45,7 +45,7 @@
 int
 prom_version_name(char *buf, int buflen)
 {
-	dnode_t nodeid;
+	pnode_t nodeid;
 	int proplen;
 	char *unknown = "unknown";
 
@@ -59,7 +59,7 @@ prom_version_name(char *buf, int buflen)
 	 */
 
 	nodeid = prom_finddevice("/openprom");
-	if (nodeid == (dnode_t)-1)
+	if (nodeid == (pnode_t)-1)
 		return (-1);
 
 	proplen = prom_bounded_getprop(nodeid, "version", buf, buflen - 1);
diff --git a/usr/src/psm/promif/ieee1275/sun4u/prom_serengeti.c b/usr/src/psm/promif/ieee1275/sun4u/prom_serengeti.c
index 7e99303d7b69..fbdb0f556556 100644
--- a/usr/src/psm/promif/ieee1275/sun4u/prom_serengeti.c
+++ b/usr/src/psm/promif/ieee1275/sun4u/prom_serengeti.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2000 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -128,7 +128,7 @@ prom_serengeti_tunnel_switch(uint_t node, uint_t board)
 }
 
 int
-prom_serengeti_cpu_off(dnode_t node)
+prom_serengeti_cpu_off(pnode_t node)
 {
 	cell_t ci[5];
 	int rv;
@@ -197,7 +197,7 @@ prom_serengeti_get_ecacheunum(int cpuid, unsigned long long physaddr, char *buf,
 }
 
 int
-prom_serengeti_wakeupcpu(dnode_t node)
+prom_serengeti_wakeupcpu(pnode_t node)
 {
 	cell_t ci[5];
 	int	rv;
diff --git a/usr/src/psm/promif/ieee1275/sun4u/prom_vercheck.c b/usr/src/psm/promif/ieee1275/sun4u/prom_vercheck.c
index 734a53886b83..608b9156c724 100644
--- a/usr/src/psm/promif/ieee1275/sun4u/prom_vercheck.c
+++ b/usr/src/psm/promif/ieee1275/sun4u/prom_vercheck.c
@@ -164,7 +164,7 @@ obp_timestamp(char *v)
  */
 
 static struct obp_rev_table *flashprom_ortp;
-static dnode_t flashprom_node;
+static pnode_t flashprom_node;
 static int flashprom_checked;
 static int flashprom_return_code;
 
@@ -201,12 +201,12 @@ check_timestamp(char *model, int tstamp)
 	return (0);
 }
 
-static dnode_t
-visit(dnode_t node)
+static pnode_t
+visit(pnode_t node)
 {
 	int tstamp, plen, i;
 	char vers[512], model[64];
-	static dnode_t openprom_node;
+	static pnode_t openprom_node;
 	static char version[] = "version";
 	static char model_name[] = "model";
 	static char flashprom[] = "flashprom";
@@ -215,24 +215,24 @@ visit(dnode_t node)
 	 * if name isn't 'flashprom', continue.
 	 */
 	if (prom_getproplen(node, OBP_NAME) != sizeof (flashprom))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, OBP_NAME, model);
 	if (prom_strncmp(model, flashprom, sizeof (flashprom)) != 0)
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	plen = prom_getproplen(node, version);
 	if (plen <= 0 || plen > sizeof (vers))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, version, vers);
 	vers[plen] = '\0';
 
 	/* Make sure it's an OBP flashprom */
 	if (vers[0] != 'O' && vers[1] != 'B' && vers[2] != 'P')
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	plen = prom_getproplen(node, model_name);
 	if (plen <= 0 || plen > sizeof (model))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, model_name, model);
 	model[plen] = '\0';
 
@@ -241,13 +241,13 @@ visit(dnode_t node)
 		prom_printf("prom_version_check: node contains "
 		    "improperly formatted version property,\n"
 		    "\tnot checking prom version");
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	}
 
 	i = check_timestamp(model, tstamp);
 
 	if (i == 0)
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	/*
 	 * We know that "node"'s flashprom image contains downrev firmware,
@@ -300,19 +300,19 @@ visit(dnode_t node)
 /*
  * visit each node in the device tree, until we get a non-null answer
  */
-static dnode_t
-walk(dnode_t node)
+static pnode_t
+walk(pnode_t node)
 {
-	dnode_t id;
+	pnode_t id;
 
 	if (visit(node))
 		return (node);
 
 	for (node = prom_childnode(node); node; node = prom_nextnode(node))
-		if ((id = walk(node)) != (dnode_t)0)
+		if ((id = walk(node)) != (pnode_t)0)
 			return (id);
 
-	return ((dnode_t)0);
+	return ((pnode_t)0);
 }
 
 /*
@@ -327,10 +327,10 @@ walk(dnode_t node)
  * and a printable message in *buf, buflen.
  */
 int
-prom_version_check(char *buf, size_t buflen, dnode_t *nodeid)
+prom_version_check(char *buf, size_t buflen, pnode_t *nodeid)
 {
 	char *p;
-	dnode_t node = flashprom_node;
+	pnode_t node = flashprom_node;
 	size_t i;
 
 	/*
@@ -344,7 +344,7 @@ prom_version_check(char *buf, size_t buflen, dnode_t *nodeid)
 	if (nodeid)
 		*nodeid = node;
 
-	if (node == (dnode_t)0) {
+	if (node == (pnode_t)0) {
 		if (buf && buflen)
 			*buf = '\0';
 		return (PROM_VER64_OK);
diff --git a/usr/src/psm/stand/boot/sparc/common/boot_plat.c b/usr/src/psm/stand/boot/sparc/common/boot_plat.c
index 0a74ed0980f9..ff1f0fe23b93 100644
--- a/usr/src/psm/stand/boot/sparc/common/boot_plat.c
+++ b/usr/src/psm/stand/boot/sparc/common/boot_plat.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -429,7 +429,7 @@ init_bootargs(char *fname_buf, int fname_buf_sz, char *bargs_buf,
 boolean_t
 is_netdev(char *devpath)
 {
-	dnode_t	node = prom_finddevice(devpath);
+	pnode_t	node = prom_finddevice(devpath);
 	char *options;
 
 	if ((node == OBP_NONODE) || (node == OBP_BADNODE))
@@ -473,7 +473,7 @@ is_netdev(char *devpath)
 void
 mangle_os_bootpath(char *bpath)
 {
-	dnode_t node;
+	pnode_t node;
 	char *stripped_pathname;
 
 	node = prom_finddevice(bpath);
diff --git a/usr/src/psm/stand/boot/sparc/common/sun4dep.c b/usr/src/psm/stand/boot/sparc/common/sun4dep.c
index b18456a47b74..7a6cb4937ceb 100644
--- a/usr/src/psm/stand/boot/sparc/common/sun4dep.c
+++ b/usr/src/psm/stand/boot/sparc/common/sun4dep.c
@@ -46,7 +46,7 @@ fiximp(void)
 void
 setup_aux(void)
 {
-	dnode_t node;
+	pnode_t node;
 	/* big enough for OBP_NAME and for a reasonably sized OBP_COMPATIBLE. */
 	static char cpubuf[5 * OBP_MAXDRVNAME];
 	extern uint_t icache_flush;
diff --git a/usr/src/psm/stand/boot/sparc/common/sun4u_memlist.c b/usr/src/psm/stand/boot/sparc/common/sun4u_memlist.c
index e25ea32d71e2..aca63e393e64 100644
--- a/usr/src/psm/stand/boot/sparc/common/sun4u_memlist.c
+++ b/usr/src/psm/stand/boot/sparc/common/sun4u_memlist.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -77,14 +77,14 @@ init_memlists(void)
 struct memlist *
 fill_memlists(char *name, char *prop, struct memlist *old)
 {
-	static dnode_t pmem = 0;
-	static dnode_t pmmu = 0;
-	dnode_t node;
+	static pnode_t pmem = 0;
+	static pnode_t pmmu = 0;
+	pnode_t node;
 	size_t links;
 	struct memlist *al;
 	struct sun4u_prom_memlist *pm = scratch_memlist;
 
-	if (pmem == (dnode_t)0)  {
+	if (pmem == (pnode_t)0)  {
 
 		/*
 		 * Figure out the interesting phandles, one time
diff --git a/usr/src/psm/stand/boot/sparc/common/sun4x_standalloc.c b/usr/src/psm/stand/boot/sparc/common/sun4x_standalloc.c
index 5f4d25856529..707c7ff5c2d9 100644
--- a/usr/src/psm/stand/boot/sparc/common/sun4x_standalloc.c
+++ b/usr/src/psm/stand/boot/sparc/common/sun4x_standalloc.c
@@ -107,7 +107,7 @@ static caddr_t top_resvmem, scratchresvp;
 static int
 impl_name(char *buf, size_t bufsz)
 {
-	dnode_t n = prom_rootnode();
+	pnode_t n = prom_rootnode();
 	size_t len = prom_getproplen(n, "name");
 
 	if (len == 0 || len >= bufsz)
diff --git a/usr/src/psm/stand/boot/sparc/common/wanboot.c b/usr/src/psm/stand/boot/sparc/common/wanboot.c
index b1f3ae836fd8..ddcbadf7c559 100644
--- a/usr/src/psm/stand/boot/sparc/common/wanboot.c
+++ b/usr/src/psm/stand/boot/sparc/common/wanboot.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1556,7 +1556,7 @@ get_wanbootfs(const url_t *server_url)
 static boolean_t
 init_netdev(char *bpath)
 {
-	dnode_t		anode;
+	pnode_t		anode;
 	int		proplen;
 	static char	netalias[OBP_MAXPATHLEN];
 
diff --git a/usr/src/psm/stand/boot/sparcv9/sun4u/machdep.c b/usr/src/psm/stand/boot/sparcv9/sun4u/machdep.c
index 48b1bcadf8f1..59de73936ba0 100644
--- a/usr/src/psm/stand/boot/sparcv9/sun4u/machdep.c
+++ b/usr/src/psm/stand/boot/sparcv9/sun4u/machdep.c
@@ -52,8 +52,8 @@ int vac = 1;
  */
 #define	IMPL_US_I	0x10
 
-static dnode_t
-visit(dnode_t node)
+static pnode_t
+visit(pnode_t node)
 {
 	int impl, manu;
 	char name[32];
@@ -65,24 +65,24 @@ visit(dnode_t node)
 	 * if name isn't 'SUNW,UltraSPARC', continue.
 	 */
 	if (prom_getproplen(node, "name") != sizeof (ultrasparc))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, "name", name);
 	if (strncmp(name, ultrasparc, sizeof (ultrasparc)) != 0)
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	if (prom_getproplen(node, manufacturer) != sizeof (int))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, manufacturer, (caddr_t)&manu);
 
 	if ((manu != SUNW_JEDEC) && (manu != TI_JEDEC))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	if (prom_getproplen(node, implementation) != sizeof (int))
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 	(void) prom_getprop(node, implementation, (caddr_t)&impl);
 
 	if (impl != IMPL_US_I)
-		return ((dnode_t)0);
+		return ((pnode_t)0);
 
 	return (node);
 }
@@ -90,19 +90,19 @@ visit(dnode_t node)
 /*
  * visit each node in the device tree, until we get a non-null answer
  */
-static dnode_t
-walk(dnode_t node)
+static pnode_t
+walk(pnode_t node)
 {
-	dnode_t id;
+	pnode_t id;
 
 	if (visit(node))
 		return (node);
 
 	for (node = prom_childnode(node); node; node = prom_nextnode(node))
-		if ((id = walk(node)) != (dnode_t)0)
+		if ((id = walk(node)) != (pnode_t)0)
 			return (id);
 
-	return ((dnode_t)0);
+	return ((pnode_t)0);
 }
 
 /*
diff --git a/usr/src/psm/stand/bootblks/obp-c/common/romp.h b/usr/src/psm/stand/bootblks/obp-c/common/romp.h
index faa6ceca4a0d..999564996d62 100644
--- a/usr/src/psm/stand/bootblks/obp-c/common/romp.h
+++ b/usr/src/psm/stand/bootblks/obp-c/common/romp.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -42,17 +42,17 @@ extern "C" {
 
 struct config_ops {
 #ifdef	_KERNEL
-	dnode_t	(*devr_next)(/* dnode_t nodeid */);
-	dnode_t	(*devr_child)(/* dnode_t nodeid */);
+	pnode_t	(*devr_next)(/* pnode_t nodeid */);
+	pnode_t	(*devr_child)(/* pnode_t nodeid */);
 #else	/* _KERNEL */
-	int	(*devr_next)(/* dnode_t nodeid */);
-	int	(*devr_child)(/* dnode_t nodeid */);
+	int	(*devr_next)(/* pnode_t nodeid */);
+	int	(*devr_child)(/* pnode_t nodeid */);
 #endif	/* _KERNEL */
-	int	(*devr_getproplen)(/* dnode_t nodeid, char *name */);
-	int	(*devr_getprop)(/* dnode_t nodeid, char *name, caddr_t buf */);
-	int	(*devr_setprop)(/* dnode_t nodeid, char *name, caddr_t value,
+	int	(*devr_getproplen)(/* pnode_t nodeid, char *name */);
+	int	(*devr_getprop)(/* pnode_t nodeid, char *name, caddr_t buf */);
+	int	(*devr_setprop)(/* pnode_t nodeid, char *name, caddr_t value,
 	    uint_t size */);
-	caddr_t	(*devr_nextprop)(/* dnode_t nodeid, char *previous */);
+	caddr_t	(*devr_nextprop)(/* pnode_t nodeid, char *previous */);
 };
 
 struct romvec_obp {
@@ -204,13 +204,13 @@ struct romvec_obp {
 	 * V3 MP only functions: It's a fatal error to call these from a UP.
 	 */
 
-	int (*op3_startcpu)(/* dnode_t moduleid, dev_reg_t contextable,
+	int (*op3_startcpu)(/* pnode_t moduleid, dev_reg_t contextable,
 	    int whichcontext, caddr_t pc */);
 
-	int (*op3_stopcpu)(/* dnode_t */);
+	int (*op3_stopcpu)(/* pnode_t */);
 
-	int (*op3_idlecpu)(/* dnode_t */);
-	int (*op3_resumecpu)(/* dnode_t */);
+	int (*op3_idlecpu)(/* pnode_t */);
+	int (*op3_resumecpu)(/* pnode_t */);
 };
 
 union sunromvec {
diff --git a/usr/src/psm/stand/cpr/common/support.c b/usr/src/psm/stand/cpr/common/support.c
index 6201e223dd0e..268fcd444591 100644
--- a/usr/src/psm/stand/cpr/common/support.c
+++ b/usr/src/psm/stand/cpr/common/support.c
@@ -99,7 +99,7 @@ cpr_reset_properties(void)
 	int fd, len, rc, prop_errors;
 	cprop_t *prop, *tail;
 	cdef_t cdef;
-	dnode_t node;
+	pnode_t node;
 
 	str = "cpr_reset_properties";
 	default_path = CPR_DEFAULT;
diff --git a/usr/src/psm/stand/cpr/sparcv9/sun4u/bitmap.c b/usr/src/psm/stand/cpr/sparcv9/sun4u/bitmap.c
index d033ab70aa7a..c462ce0b5323 100644
--- a/usr/src/psm/stand/cpr/sparcv9/sun4u/bitmap.c
+++ b/usr/src/psm/stand/cpr/sparcv9/sun4u/bitmap.c
@@ -416,7 +416,7 @@ cb_get_physavail(void)
 {
 	int len, glen, scnt, need, space;
 	char *str, *pdev, *mem_prop;
-	dnode_t mem_node;
+	pnode_t mem_node;
 	physaddr_t phys;
 	pgcnt_t pages;
 	arange_t *arp;
diff --git a/usr/src/psm/stand/cpr/sparcv9/sun4u/util.c b/usr/src/psm/stand/cpr/sparcv9/sun4u/util.c
index e2e666792922..d59258c8dbf6 100644
--- a/usr/src/psm/stand/cpr/sparcv9/sun4u/util.c
+++ b/usr/src/psm/stand/cpr/sparcv9/sun4u/util.c
@@ -261,7 +261,7 @@ cb_alloc(size_t size, uint_t align, caddr_t *vap, physaddr_t *pap)
 
 
 static int
-get_intprop(dnode_t node, caddr_t prop, void *dst)
+get_intprop(pnode_t node, caddr_t prop, void *dst)
 {
 	int len, glen;
 
@@ -280,11 +280,11 @@ get_intprop(dnode_t node, caddr_t prop, void *dst)
  * sets globals:
  * 	cb_mid
  */
-static dnode_t
+static pnode_t
 get_cpu_node(void)
 {
 	static char *props[] = { "upa-portid", "portid", NULL };
-	dnode_t node;
+	pnode_t node;
 	char *str, *name, **propp;
 	uint_t cpu_id;
 	int err;
@@ -331,7 +331,7 @@ int
 cb_get_props(void)
 {
 	uint_t clock_mhz;
-	dnode_t node;
+	pnode_t node;
 	struct cb_props *cbp;
 	static struct cb_props cpu_data[] = {
 		"#dtlb-entries", &cb_dents,
diff --git a/usr/src/psm/stand/lib/names/sparc/common/mfgname.c b/usr/src/psm/stand/lib/names/sparc/common/mfgname.c
index 358c0ac38fe0..7cb0aed7c55f 100644
--- a/usr/src/psm/stand/lib/names/sparc/common/mfgname.c
+++ b/usr/src/psm/stand/lib/names/sparc/common/mfgname.c
@@ -43,7 +43,7 @@
 char *
 get_mfg_name(void)
 {
-	dnode_t n;
+	pnode_t n;
 	int len;
 
 	static char mfgname[MAXNMLEN];
diff --git a/usr/src/psm/stand/lib/names/sparc/common/uname-i.c b/usr/src/psm/stand/lib/names/sparc/common/uname-i.c
index 42f8a90bd4c4..6e7e9401f065 100644
--- a/usr/src/psm/stand/lib/names/sparc/common/uname-i.c
+++ b/usr/src/psm/stand/lib/names/sparc/common/uname-i.c
@@ -69,7 +69,7 @@ get_impl_arch_name(enum ia_state_mach *state, int use_default)
 	static int len;
 	static char *ia;
 
-	dnode_t n;
+	pnode_t n;
 	char *cp;
 	char *namename;
 
@@ -78,7 +78,7 @@ get_impl_arch_name(enum ia_state_mach *state, int use_default)
 	case STATE_NAME:
 		*state = STATE_COMPAT_INIT;
 		namename = OBP_NAME;
-		n = (dnode_t)prom_rootnode();
+		n = (pnode_t)prom_rootnode();
 		len = prom_getproplen(n, namename);
 		if (len <= 0 || len >= MAXNMLEN)
 			goto newstate;
@@ -90,7 +90,7 @@ get_impl_arch_name(enum ia_state_mach *state, int use_default)
 	case STATE_COMPAT_INIT:
 		*state = STATE_COMPAT;
 		namename = OBP_COMPATIBLE;
-		n = (dnode_t)prom_rootnode();
+		n = (pnode_t)prom_rootnode();
 		len = prom_getproplen(n, namename);
 		if (len <= 0 || len >= MAXNMLEN) {
 			*state = STATE_DEFAULT;
diff --git a/usr/src/stand/lib/inet/dhcpv4.c b/usr/src/stand/lib/inet/dhcpv4.c
index c09f8965b7f0..3ddd0389af02 100644
--- a/usr/src/stand/lib/inet/dhcpv4.c
+++ b/usr/src/stand/lib/inet/dhcpv4.c
@@ -1110,12 +1110,12 @@ prom_cached_reply(int cache_present)
 	if ((len = pxe_ack_cache(&ack)) <= 0)
 		return (B_FALSE);
 #else
-	dnode_t	chosen;
+	pnode_t	chosen;
 	char	*prop = PROM_BOOT_CACHED;
 
 	chosen = prom_finddevice("/chosen");
 	if (chosen == OBP_NONODE || chosen == OBP_BADNODE)
-		chosen = prom_nextnode((dnode_t)0);	/* root node */
+		chosen = prom_nextnode((pnode_t)0);	/* root node */
 
 	if ((len = prom_getproplen(chosen, prop)) <= 0)
 		return (B_FALSE);
diff --git a/usr/src/stand/lib/inet/ibd.c b/usr/src/stand/lib/inet/ibd.c
index f7ed38cdb9ed..3294ba46ffb8 100644
--- a/usr/src/stand/lib/inet/ibd.c
+++ b/usr/src/stand/lib/inet/ibd.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -564,7 +564,7 @@ ibd_output(int index, struct inetgram *ogp)
 void
 ibd_init(void)
 {
-	dnode_t	chosen;
+	pnode_t	chosen;
 	char	*mtuprop = "ipib-frame-size";
 	char	*bcastprop = "ipib-broadcast";
 	char	*addrprop = "ipib-address";
diff --git a/usr/src/stand/lib/inet/mac.c b/usr/src/stand/lib/inet/mac.c
index aff32286fecf..009523b5f9ba 100644
--- a/usr/src/stand/lib/inet/mac.c
+++ b/usr/src/stand/lib/inet/mac.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -83,7 +83,7 @@ mac_init(char *bootdevicename)
 	static char	*chosen_net = "chosen-network-type";
 	static char	*supported_net = "supported-network-types";
 	static char	*netiftype = "network-interface-type";
-	dnode_t		node;
+	pnode_t		node;
 	char		*wp, *media_type;
 	int		len = 0, i;
 #endif	/* !__i386 */
diff --git a/usr/src/stand/lib/sa/sparc/prom_misc.c b/usr/src/stand/lib/sa/sparc/prom_misc.c
index b3dba7e8bbfc..ac6baa9e0176 100644
--- a/usr/src/stand/lib/sa/sparc/prom_misc.c
+++ b/usr/src/stand/lib/sa/sparc/prom_misc.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,7 +44,7 @@ prom_create_encoded_prop(char *propname, void *prop_data, int prop_datalen,
 					" pop-package"
 					" r> to my-self";
 	char *command;
-	static dnode_t cn = OBP_NONODE;
+	static pnode_t cn = OBP_NONODE;
 
 	if (cn == OBP_NONODE) {
 		cn = prom_finddevice("/chosen");
diff --git a/usr/src/stand/lib/sock/socket.c b/usr/src/stand/lib/sock/socket.c
index 9d80b3a7381a..9db6dd7fd22d 100644
--- a/usr/src/stand/lib/sock/socket.c
+++ b/usr/src/stand/lib/sock/socket.c
@@ -1248,7 +1248,7 @@ get_netconfig_strategy(void)
 	char	lbootpath[OBP_MAXPATHLEN];
 	char	net_options[NCT_BUFSIZE];
 	char	*op, *nop, *sp;
-	dnode_t	cn;
+	pnode_t	cn;
 	int	proplen;
 
 	/* If the PROM DHCP cache exists, we're done */
diff --git a/usr/src/stand/lib/wanboot/bootinfo_aux.c b/usr/src/stand/lib/wanboot/bootinfo_aux.c
index 7e9f29fab5c4..10d5ae6494db 100644
--- a/usr/src/stand/lib/wanboot/bootinfo_aux.c
+++ b/usr/src/stand/lib/wanboot/bootinfo_aux.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,7 +56,7 @@ bi_end_bootinfo(void)
 boolean_t
 bi_get_chosen_prop(const char *name, void *valbuf, size_t *vallenp)
 {
-	static dnode_t	chosen;
+	static pnode_t	chosen;
 	int		len;
 
 	/*
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index 30882ca24be1..697dd2b7bd3d 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -4323,6 +4323,16 @@ mondo_loop() {
 	    $root/kernel/strmod/sparcv9/tcp6	\
 	    $root/kernel/strmod/sparcv9/udp6	\
 
+	#
+	# Remove old ZFS binaries (back when it was three modules)
+	#
+	find $root/kernel/drv -name zpool | xargs rm -f
+	rm -f $root/kernel/drv/zpool.conf
+	rm -f $root/kernel/drv/zpool.cache
+
+	find $root/kernel/drv -name zvol | xargs rm -f
+	rm -f $root/kernel/drv/zvol.conf
+
 	#
 	# Remove /usr/lib/old_libthread since support for it has
 	# been removed from the kernel in Solaris 10.  If this is
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index f0f018ac8ca2..a99cc15380d3 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -23,7 +23,7 @@
 # Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
+# ident	"%Z%%M%	%I%	%E% SMI"
 #
 #	This Makefile defines all file modules for the directory uts/common
 # and its children. These are the source files which may be considered
@@ -65,6 +65,7 @@ CORE_OBJS +=	$(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)
 GENUNIX_OBJS +=	\
 		access.o	\
 		acl.o		\
+		acl_common.o	\
 		adjtime.o	\
 		alarm.o		\
 		aio_subr.o	\
@@ -844,6 +845,71 @@ NOTIFY_OBJS += md_notify.o
 
 TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
 
+ZFS_COMMON_OBJS +=		\
+	arc.o			\
+	bplist.o		\
+	dbuf.o			\
+	dmu.o			\
+	dmu_object.o		\
+	dmu_objset.o		\
+	dmu_traverse.o		\
+	dmu_tx.o		\
+	dnode.o			\
+	dnode_sync.o		\
+	dsl_dir.o		\
+	dsl_dataset.o		\
+	dsl_pool.o		\
+	dmu_zfetch.o		\
+	dsl_prop.o		\
+	fletcher.o		\
+	lzjb.o			\
+	metaslab.o		\
+	refcount.o		\
+	sha256.o		\
+	spa.o			\
+	spa_config.o		\
+	spa_misc.o		\
+	space_map.o		\
+	txg.o			\
+	uberblock.o		\
+	unique.o		\
+	vdev.o			\
+	vdev_cache.o		\
+	vdev_file.o		\
+	vdev_label.o		\
+	vdev_mirror.o		\
+	vdev_missing.o		\
+	vdev_queue.o		\
+	vdev_raidz.o		\
+	vdev_root.o		\
+	zap.o			\
+	zap_leaf.o		\
+	zap_micro.o		\
+	zfs_byteswap.o		\
+	zil.o			\
+	zio.o			\
+	zio_checksum.o		\
+	zio_compress.o
+
+ZFS_SHARED_OBJS +=		\
+	zfs_namecheck.o		\
+	zfs_prop.o
+
+ZFS_OBJS +=			\
+	$(ZFS_COMMON_OBJS)	\
+	$(ZFS_SHARED_OBJS)	\
+	vdev_disk.o		\
+	zfs_acl.o		\
+	zfs_ctldir.o		\
+	zfs_dir.o		\
+	zfs_ioctl.o		\
+	zfs_log.o		\
+	zfs_replay.o		\
+	zfs_vfsops.o		\
+	zfs_vnops.o		\
+	zfs_znode.o		\
+	zvol.o
+
 #
 #			streams modules
 #
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 7b5cd4efa743..0f5c43f1d1dd 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -62,6 +62,10 @@ $(OBJS_DIR)/%.o:		$(COMMONBASE)/bignum/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o: 		$(COMMONBASE)/acl/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(COMMONBASE)/avl/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -246,6 +250,14 @@ $(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/ufs/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/common/fs/zfs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
+$(OBJS_DIR)/%.o:		$(COMMONBASE)/zfs/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 KMECHKRB5_BASE=$(UTSBASE)/common/gssapi/mechs/krb5
 
 KGSSDFLAGS=-I $(UTSBASE)/common/gssapi/include 
@@ -904,6 +916,9 @@ $(LINTS_DIR)/%.ln:		$(COMMONBASE)/crypto/rsa/%.c
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/bignum/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(COMMONBASE)/acl/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/avl/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
@@ -1045,6 +1060,12 @@ $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/ufs/%.c
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/ufs_log/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/common/fs/zfs/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
+$(LINTS_DIR)/%.ln:		$(COMMONBASE)/zfs/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln: 		$(UTSBASE)/common/gssapi/%.c
 	@($(LHEAD) $(LINT.c) $(KGSSDFLAGS) $< $(LTAIL))
 
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_all.c b/usr/src/uts/common/fs/ctfs/ctfs_all.c
index dd3eeb15b649..4933edd96061 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_all.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_all.c
@@ -99,7 +99,7 @@ ctfs_adir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
 	if (*nm != '\0')
 		return (ENOENT);
 
-	ct = contract_ptr(i, VTOZ(vp)->zone_uniqid);
+	ct = contract_ptr(i, VTOZONE(vp)->zone_uniqid);
 	if (ct == NULL)
 		return (ENOENT);
 
@@ -118,7 +118,7 @@ ctfs_adir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
 	uint64_t zuniqid;
 	ctid_t next;
 
-	zuniqid = VTOZ(vp)->zone_uniqid;
+	zuniqid = VTOZONE(vp)->zone_uniqid;
 	next = contract_lookup(zuniqid, *offp);
 
 	if (next == -1) {
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
index a13091826c81..f4980d4a97e8 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_ctl.c
@@ -249,11 +249,11 @@ ctfs_stat_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
 	detail = STRUCT_FGET(st, ctst_detail);
 	if (detail == CTD_COMMON) {
 		mutex_enter(&ct->ct_lock);
-		contract_status_common(ct, VTOZ(vp), STRUCT_BUF(st), mdl);
+		contract_status_common(ct, VTOZONE(vp), STRUCT_BUF(st), mdl);
 		mutex_exit(&ct->ct_lock);
 	} else if (detail <= CTD_ALL) {
 		VERIFY(nvlist_alloc(&foo, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		type->ct_type_ops->contop_status(ct, VTOZ(vp), detail, foo,
+		type->ct_type_ops->contop_status(ct, VTOZONE(vp), detail, foo,
 		    STRUCT_BUF(st), mdl);
 		VERIFY(nvlist_pack(foo, &bufp, &len, NV_ENCODE_NATIVE,
 		    KM_SLEEP) == 0);
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_event.c b/usr/src/uts/common/fs/ctfs/ctfs_event.c
index afb08a7cfc4b..7fa7cfb60840 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_event.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_event.c
@@ -287,7 +287,7 @@ ctfs_ev_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
 	ctfs_evnode_t *evnode = vp->v_data;
 
 	return (ctfs_endpoint_ioctl(&evnode->ctfs_ev_listener, cmd, arg, cr,
-	    VTOZ(vp), 0));
+	    VTOZONE(vp), 0));
 }
 
 /*
@@ -430,7 +430,7 @@ ctfs_bu_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
 	ctfs_bunode_t *bunode = vp->v_data;
 
 	return (ctfs_endpoint_ioctl(&bunode->ctfs_bu_listener, cmd, arg, cr,
-	    VTOZ(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
+	    VTOZONE(vp), bunode->ctfs_bu_queue->ctq_listno == CTEL_BUNDLE));
 }
 
 /*
diff --git a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
index 479f64b064c7..1f5dd4237028 100644
--- a/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
+++ b/usr/src/uts/common/fs/ctfs/ctfs_tdir.c
@@ -108,7 +108,7 @@ ctfs_tdir_do_readdir(vnode_t *vp, struct dirent64 *dp, int *eofp,
 	ctid_t next;
 	ct_type_t *ty = ct_types[gfs_file_index(vp)];
 
-	zuniqid = VTOZ(vp)->zone_uniqid;
+	zuniqid = VTOZONE(vp)->zone_uniqid;
 	next = contract_type_lookup(ty, zuniqid, *offp);
 
 	if (next == -1) {
@@ -135,7 +135,7 @@ ctfs_tdir_do_lookup(vnode_t *vp, const char *nm, vnode_t **vpp, ino64_t *inop)
 		return (ENOENT);
 
 	ct = contract_type_ptr(ct_types[gfs_file_index(vp)], i,
-	    VTOZ(vp)->zone_uniqid);
+	    VTOZONE(vp)->zone_uniqid);
 	if (ct == NULL)
 		return (ENOENT);
 
diff --git a/usr/src/uts/common/fs/devfs/devfs_subr.c b/usr/src/uts/common/fs/devfs/devfs_subr.c
index 0f53a24ca08d..864ed2ad6091 100644
--- a/usr/src/uts/common/fs/devfs/devfs_subr.c
+++ b/usr/src/uts/common/fs/devfs/devfs_subr.c
@@ -568,20 +568,6 @@ dv_vattr_merge(struct dv_node *dv, struct vattr *vap)
 	}
 }
 
-/*
- * Free a vsecattr
- */
-static void
-dv_free_vsa(struct vsecattr *vsap)
-{
-	if (vsap->vsa_aclcnt > 0 && vsap->vsa_aclentp)
-		kmem_free(vsap->vsa_aclentp,
-		    vsap->vsa_aclcnt * sizeof (aclent_t));
-	if (vsap->vsa_dfaclcnt > 0 && vsap->vsa_dfaclentp)
-		kmem_free(vsap->vsa_dfaclentp,
-		    vsap->vsa_dfaclcnt * sizeof (aclent_t));
-}
-
 /*
  * dv_shadow_node
  *
@@ -623,7 +609,6 @@ dv_shadow_node(
 	int		create_tried;
 	int		error;
 	mperm_t		mp;
-	struct vsecattr	vsa;
 
 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
 	dv = VTODV(vp);
@@ -678,19 +663,14 @@ dv_shadow_node(
 		dv->dv_attrvp = rvp;	/* with one hold */
 
 		/*
-		 * Determine if we have (non-trivial) ACLs on this node.
-		 * NB: This should be changed call fs_acl_nontrivial for
-		 * new ACE flavor ACLs.
+		 * Determine if we have non-trivial ACLs on this node.
+		 * It is not necessary to VOP_RWLOCK since fs_acl_nontrivial
+		 * only does VOP_GETSECATTR.
 		 */
-		vsa.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
-		error = VOP_GETSECATTR(rvp, &vsa, 0, cred);
 		dv->dv_flags &= ~DV_ACL;
-		if (error == 0) {
-			if (vsa.vsa_aclcnt > MIN_ACL_ENTRIES) {
-				dv->dv_flags |= DV_ACL;	/* non-trivial ACL */
-			}
-			dv_free_vsa(&vsa);
-		}
+
+		if (fs_acl_nontrivial(rvp, cred))
+			dv->dv_flags |= DV_ACL;
 
 		/*
 		 * If we have synced out the memory attributes, free
diff --git a/usr/src/uts/common/fs/devfs/devfs_vnops.c b/usr/src/uts/common/fs/devfs/devfs_vnops.c
index 7a3d4c1c04ed..b8dfce54487a 100644
--- a/usr/src/uts/common/fs/devfs/devfs_vnops.c
+++ b/usr/src/uts/common/fs/devfs/devfs_vnops.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -621,7 +621,6 @@ devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
 
 	error = VOP_GETSECATTR(avp, vsap, flags, cr);
 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
-
 	rw_exit(&dv->dv_contents);
 	return (error);
 }
@@ -678,10 +677,11 @@ devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
 
 	/*
-	 * NB: This code should call fs_acl_nontrivial when available so that
-	 * DV_ACL is only set on nontrivial ACLs.
+	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
+	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
+	 * VOP_GETSECATTR calls.
 	 */
-	if (error == 0)
+	if (fs_acl_nontrivial(avp, cr))
 		dv->dv_flags |= DV_ACL;
 	return (error);
 }
diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c
index 7fc9dc4277d5..3466db383200 100644
--- a/usr/src/uts/common/fs/fs_subr.c
+++ b/usr/src/uts/common/fs/fs_subr.c
@@ -24,7 +24,7 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -57,6 +57,7 @@
 #include <sys/kmem.h>
 #include <sys/file.h>
 #include <sys/nbmlock.h>
+#include <acl/acl_common.h>
 
 static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
 
@@ -632,3 +633,84 @@ fs_vnevent_support(vnode_t *vp, vnevent_t vnevent)
 	ASSERT(vp != NULL);
 	return (0);
 }
+
+/*
+ * return 1 for non-trivial ACL.
+ *
+ * NB: It is not necessary for the caller to VOP_RWLOCK since
+ *	we only issue VOP_GETSECATTR.
+ *
+ * Returns 0 == trivial
+ *         1 == NOT Trivial
+ *	   If no ACL flavor can be read, the ACL is assumed trivial (0).
+ */
+int
+fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
+{
+	ulong_t		acl_styles;
+	ulong_t		acl_flavor;
+	vsecattr_t 	vsecattr;
+	int 		error;
+	int		isnontrivial;
+
+	/* determine the forms of ACLs maintained */
+	error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr);
+
+	/* clear bits we don't understand and establish default acl_style */
+	acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
+	if (error || (acl_styles == 0))
+		acl_styles = _ACL_ACLENT_ENABLED;
+
+	vsecattr.vsa_aclentp = NULL;
+	vsecattr.vsa_dfaclentp = NULL;
+	vsecattr.vsa_aclcnt = 0;
+	vsecattr.vsa_dfaclcnt = 0;
+
+	while (acl_styles) {
+		/* select one of the styles as current flavor */
+		acl_flavor = 0;
+		if (acl_styles & _ACL_ACLENT_ENABLED) {
+			acl_flavor = _ACL_ACLENT_ENABLED;
+			vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
+		} else if (acl_styles & _ACL_ACE_ENABLED) {
+			acl_flavor = _ACL_ACE_ENABLED;
+			vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
+		}
+
+		ASSERT(vsecattr.vsa_mask && acl_flavor);
+		error = VOP_GETSECATTR(vp, &vsecattr, 0, cr);
+		if (error == 0)
+			break;
+
+		/* that flavor failed */
+		acl_styles &= ~acl_flavor;
+	}
+
+	/* if all styles fail then assume trivial */
+	if (acl_styles == 0)
+		return (0);
+
+	/* process the flavor that worked */
+	isnontrivial = 0;
+	if (acl_flavor & _ACL_ACLENT_ENABLED) {
+		if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
+			isnontrivial = 1;
+		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+			kmem_free(vsecattr.vsa_aclentp,
+			    vsecattr.vsa_aclcnt * sizeof (aclent_t));
+		if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
+			kmem_free(vsecattr.vsa_dfaclentp,
+			    vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
+	}
+	if (acl_flavor & _ACL_ACE_ENABLED) {
+
+		isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
+		    vsecattr.vsa_aclcnt);
+
+		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
+			kmem_free(vsecattr.vsa_aclentp,
+			    vsecattr.vsa_aclcnt * sizeof (ace_t));
+		/* ACE has no vsecattr.vsa_dfaclcnt */
+	}
+	return (isnontrivial);
+}
diff --git a/usr/src/uts/common/fs/fs_subr.h b/usr/src/uts/common/fs/fs_subr.h
index 27fc8457185e..8cd453edba19 100644
--- a/usr/src/uts/common/fs/fs_subr.h
+++ b/usr/src/uts/common/fs/fs_subr.h
@@ -23,7 +23,7 @@
 /*	  All Rights Reserved  	*/
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -81,6 +81,7 @@ extern int	fs_shrlock(struct vnode *, int, struct shrlock *, int,
 			cred_t *);
 extern int	fs_vnevent_nosupport(vnode_t *, vnevent_t);
 extern int	fs_vnevent_support(vnode_t *, vnevent_t);
+extern int	fs_acl_nontrivial(struct vnode *vp, struct cred *cr);
 
 #endif	/* _KERNEL */
 
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 7fd7f6651077..b7fdf996e2c7 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -789,7 +789,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 	size_t dbuflen;
 	struct iovec iov;
 	struct uio uio;
-	int err;
+	int error;
 	int eof;
 	vnode_t *cmpvp;
 	struct dirent64 *dp;
@@ -811,8 +811,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 	uio.uio_extflg = UIO_COPY_CACHED;
 	uio.uio_loffset = 0;
 
-	if ((err = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
-		return (err);
+	if ((error = VOP_ACCESS(dvp, VREAD, 0, cr)) != 0)
+		return (error);
 
 	while (!eof) {
 		uio.uio_resid = dlen;
@@ -820,12 +820,12 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 		iov.iov_len = dlen;
 
 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
-		err = VOP_READDIR(dvp, &uio, cr, &eof);
+		error = VOP_READDIR(dvp, &uio, cr, &eof);
 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
 
 		dbuflen = dlen - uio.uio_resid;
 
-		if (err || dbuflen == 0)
+		if (error || dbuflen == 0)
 			break;
 
 		dp = (dirent64_t *)dbuf;
@@ -840,7 +840,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 				continue;
 			}
 
-			err = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
+			error = VOP_LOOKUP(dvp, dp->d_name, &cmpvp, &pnp, 0,
 			    vrootp, cr);
 
 			/*
@@ -849,7 +849,7 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 			 * just removed an entry since the readdir() call, and
 			 * the entry we want is further on in the directory.
 			 */
-			if (err == 0) {
+			if (error == 0) {
 				if (vnode_match(tvp, cmpvp, cr)) {
 					VN_RELE(cmpvp);
 					*rdp = dp;
@@ -857,8 +857,8 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 				}
 
 				VN_RELE(cmpvp);
-			} else if (err != ENOENT) {
-				return (err);
+			} else if (error != ENOENT) {
+				return (error);
 			}
 
 			dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
@@ -868,13 +868,28 @@ dirfindvp(vnode_t *vrootp, vnode_t *dvp, vnode_t *tvp, cred_t *cr, char *dbuf,
 	/*
 	 * Something strange has happened, this directory does not contain the
 	 * specified vnode.  This should never happen in the normal case, since
-	 * we ensured that dvp is the parent of vp.  This may be possible in
-	 * some race conditions, so fail gracefully.
+	 * we ensured that dvp is the parent of vp.  This is possible in some
+	 * rare conditions (races and the special .zfs directory).
 	 */
-	if (err == 0)
-		err = ENOENT;
+	if (error == 0) {
+		error = VOP_LOOKUP(dvp, ".zfs", &cmpvp, &pnp, 0, vrootp, cr);
+		if (error == 0) {
+			if (vnode_match(tvp, cmpvp, cr)) {
+				/* dp may be unset or past the data; use dbuf */
+				dp = (dirent64_t *)dbuf;
+				(void) strcpy(dp->d_name, ".zfs");
+				dp->d_reclen = strlen(".zfs");
+				dp->d_off = 2;
+				dp->d_ino = 1;
+				*rdp = dp;
+			} else {
+				error = ENOENT;
+			}
+			VN_RELE(cmpvp);
+		}
+	}
 
-	return (err);
+	return (error);
 }
 
 /*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_acl.c b/usr/src/uts/common/fs/nfs/nfs4_acl.c
index 9b584f6256fc..96aa1756e940 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_acl.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_acl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -80,10 +80,15 @@ static int ace4_list_to_aent(ace4_list_t *, aclent_t **, int *, uid_t, gid_t,
 static int ln_ace4_to_aent(nfsace4 *ace4, int n, uid_t, gid_t,
     aclent_t **, int *, aclent_t **, int *, int, int, int);
 static int ace4_cmp(nfsace4 *, nfsace4 *);
-static int acet_to_ace4(ace_t *, nfsace4 *, int, int);
-static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int, int);
+static int acet_to_ace4(ace_t *, nfsace4 *, int);
+static int ace4_to_acet(nfsace4 *, ace_t *, uid_t, gid_t, int, int);
 static int validate_idmapping(utf8string *, uid_t, int, int, int);
 static int u8s_mapped_to_nobody(utf8string *, uid_t, int);
+static void ace4_mask_to_acet_mask(acemask4, uint32_t *);
+static void acet_mask_to_ace4_mask(uint32_t, acemask4 *);
+static void ace4_flags_to_acet_flags(aceflag4, uint16_t *);
+static void acet_flags_to_ace4_flags(uint16_t, aceflag4 *);
+
 /*
  * The following two functions check and set ACE4_SYNCRONIZE, ACE4_WRITE_OWNER,
  * ACE4_DELETE and ACE4_WRITE_ATTRIBUTES.
@@ -1651,7 +1656,7 @@ ln_ace4_cmp(nfsace4 *a, nfsace4* b, int n)
  * strings versus integer uid/gids.
  */
 static int
-acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
+acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isserver)
 {
 	int error = 0;
 
@@ -1669,44 +1674,45 @@ acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
 	}
 
 	switch (ace->a_type) {
-	case ALLOW:
+	case ACE_ACCESS_ALLOWED_ACE_TYPE:
 		nfsace4->type = ACE4_ACCESS_ALLOWED_ACE_TYPE;
 		break;
-	case DENY:
+	case ACE_ACCESS_DENIED_ACE_TYPE:
 		nfsace4->type = ACE4_ACCESS_DENIED_ACE_TYPE;
 		break;
 	default:
+		NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+		    "acet_to_ace4: unsupported type: %x", ace->a_type));
 		error = ENOTSUP;
 		break;
 	}
 	if (error != 0)
 		goto out;
 
-	nfsace4->access_mask = mode_to_ace4_access(ace->a_access_mask,
-	    isdir, ace->a_flags & ACE_OWNER, ace->a_type == ALLOW, isserver);
+	acet_mask_to_ace4_mask(ace->a_access_mask, &nfsace4->access_mask);
+	acet_flags_to_ace4_flags(ace->a_flags, &nfsace4->flag);
 
-	nfsace4->flag = (ace->a_flags & ACE_NFSV4_SUP_FLAGS);
-	if (ace->a_flags & ACE_GROUPS) {
+	if (ace->a_flags & ACE_GROUP) {
+		nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
+		(void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
+	} else if (ace->a_flags & ACE_IDENTIFIER_GROUP) {
 		nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
 		error = nfs_idmap_gid_str(ace->a_who, &nfsace4->who, isserver);
-	} else if (ace->a_flags & ACE_USER) {
-		error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+		if (error != 0)
+			NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+			    "acet_to_ace4: idmap failed with %d", error));
 	} else if (ace->a_flags & ACE_OWNER) {
 		(void) str_to_utf8(ACE4_WHO_OWNER, &nfsace4->who);
-	} else if (ace->a_flags & ACE_GROUP) {
-		nfsace4->flag |= ACE4_IDENTIFIER_GROUP;
-		(void) str_to_utf8(ACE4_WHO_GROUP, &nfsace4->who);
-	} else if (ace->a_flags & ACE_OTHER) {
+	} else if (ace->a_flags & ACE_EVERYONE) {
 		(void) str_to_utf8(ACE4_WHO_EVERYONE, &nfsace4->who);
+	} else {
+		error = nfs_idmap_uid_str(ace->a_who, &nfsace4->who, isserver);
+		if (error != 0)
+			NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+			    "acet_to_ace4: idmap failed with %d", error));
 	}
 
 out:
-#ifdef DEBUG
-	if (error != 0)
-	    NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
-		"acet_to_ace4: idmap failed with %d", error));
-#endif
-
 	return (error);
 }
 
@@ -1716,10 +1722,9 @@ acet_to_ace4(ace_t *ace, nfsace4 *nfsace4, int isdir, int isserver)
  */
 static int
 ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
-    int isdir, int isserver, int just_count)
+    int isserver, int just_count)
 {
 	int error = 0;
-	o_mode_t mode;
 
 	if (nfsace4 == NULL) {
 		NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
@@ -1734,12 +1739,14 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
 
 	switch (nfsace4->type) {
 	case ACE4_ACCESS_ALLOWED_ACE_TYPE:
-		ace->a_type = ALLOW;
+		ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
 		break;
 	case ACE4_ACCESS_DENIED_ACE_TYPE:
-		ace->a_type = DENY;
+		ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
 		break;
 	default:
+		NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+		    "ace4_to_acet: unsupported type: %x", nfsace4->type));
 		error = ENOTSUP;
 		break;
 	}
@@ -1761,16 +1768,15 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
 		goto out;
 	}
 
-	ace->a_access_mask = nfsace4->access_mask;
-	error = ace4_mask_to_mode(nfsace4->access_mask, &mode, isdir);
-	if (error != 0)
-		goto out;
-	ace->a_access_mask = mode;
-	if (nfsace4->flag & ~(ACE_NFSV4_SUP_FLAGS | ACE4_IDENTIFIER_GROUP)) {
+	ace4_mask_to_acet_mask(nfsace4->access_mask, &ace->a_access_mask);
+
+	if (nfsace4->flag & ~ACE_NFSV4_SUP_FLAGS) {
+		NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
+		    "ace4_to_acet: unsupported flags: %x", nfsace4->flag));
 		error = ENOTSUP;
 		goto out;
 	}
-	ace->a_flags = (nfsace4->flag & ACE_NFSV4_SUP_FLAGS);
+	ace4_flags_to_acet_flags(nfsace4->flag, &ace->a_flags);
 
 	if (nfsace4->flag & ACE4_IDENTIFIER_GROUP) {
 		if ((nfsace4->who.utf8string_len == 6) &&
@@ -1780,7 +1786,7 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
 			ace->a_flags |= ACE_GROUP;
 			error = 0;
 		} else {
-			ace->a_flags |= ACE_GROUPS;
+			ace->a_flags |= ACE_IDENTIFIER_GROUP;
 			error = nfs_idmap_str_gid(&nfsace4->who,
 			    &ace->a_who, isserver);
 			if (error != 0) {
@@ -1807,10 +1813,9 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
 		} else if ((nfsace4->who.utf8string_len == 9) &&
 		    (bcmp(ACE4_WHO_EVERYONE,
 		    nfsace4->who.utf8string_val, 9) == 0)) {
-			ace->a_flags |= ACE_OTHER;
+			ace->a_flags |= ACE_EVERYONE;
 			ace->a_who = 0;
 		} else {
-			ace->a_flags |= ACE_USER;
 			error = nfs_idmap_str_uid(&nfsace4->who,
 			    &ace->a_who, isserver);
 			if (error != 0) {
@@ -1830,18 +1835,124 @@ ace4_to_acet(nfsace4 *nfsace4, ace_t *ace, uid_t owner, gid_t group,
 	}
 
 out:
-#ifdef DEBUG
-	if (error != 0)
-		NFS4_DEBUG(nfs4_acl_debug, (CE_NOTE,
-		    "ace4_to_acet: idmap failed with %d", error));
-#endif
-
 	return (error);
 }
 
+static void
+ace4_mask_to_acet_mask(acemask4 ace4_mask, uint32_t *acet_mask)
+{
+	*acet_mask = 0;
+	/* Map ACE4_* mask bits to ACE_* bits; unlisted bits are dropped. */
+	if (ace4_mask & ACE4_READ_DATA)
+		*acet_mask |= ACE_READ_DATA;
+	if (ace4_mask & ACE4_WRITE_DATA)
+		*acet_mask |= ACE_WRITE_DATA;
+	if (ace4_mask & ACE4_APPEND_DATA)
+		*acet_mask |= ACE_APPEND_DATA;
+	if (ace4_mask & ACE4_READ_NAMED_ATTRS)
+		*acet_mask |= ACE_READ_NAMED_ATTRS;
+	if (ace4_mask & ACE4_WRITE_NAMED_ATTRS)
+		*acet_mask |= ACE_WRITE_NAMED_ATTRS;
+	if (ace4_mask & ACE4_EXECUTE)
+		*acet_mask |= ACE_EXECUTE;
+	if (ace4_mask & ACE4_DELETE_CHILD)
+		*acet_mask |= ACE_DELETE_CHILD;
+	if (ace4_mask & ACE4_READ_ATTRIBUTES)
+		*acet_mask |= ACE_READ_ATTRIBUTES;
+	if (ace4_mask & ACE4_WRITE_ATTRIBUTES)
+		*acet_mask |= ACE_WRITE_ATTRIBUTES;
+	if (ace4_mask & ACE4_DELETE)
+		*acet_mask |= ACE_DELETE;
+	if (ace4_mask & ACE4_READ_ACL)
+		*acet_mask |= ACE_READ_ACL;
+	if (ace4_mask & ACE4_WRITE_ACL)
+		*acet_mask |= ACE_WRITE_ACL;
+	if (ace4_mask & ACE4_WRITE_OWNER)
+		*acet_mask |= ACE_WRITE_OWNER;
+	if (ace4_mask & ACE4_SYNCHRONIZE)
+		*acet_mask |= ACE_SYNCHRONIZE;
+}
+
+static void
+acet_mask_to_ace4_mask(uint32_t acet_mask, acemask4 *ace4_mask)
+{
+	*ace4_mask = 0;
+	/* Test ACE_* bits, set ACE4_* bits; unlisted bits are dropped. */
+	if (acet_mask & ACE_READ_DATA)
+		*ace4_mask |= ACE4_READ_DATA;
+	if (acet_mask & ACE_WRITE_DATA)
+		*ace4_mask |= ACE4_WRITE_DATA;
+	if (acet_mask & ACE_APPEND_DATA)
+		*ace4_mask |= ACE4_APPEND_DATA;
+	if (acet_mask & ACE_READ_NAMED_ATTRS)
+		*ace4_mask |= ACE4_READ_NAMED_ATTRS;
+	if (acet_mask & ACE_WRITE_NAMED_ATTRS)
+		*ace4_mask |= ACE4_WRITE_NAMED_ATTRS;
+	if (acet_mask & ACE_EXECUTE)
+		*ace4_mask |= ACE4_EXECUTE;
+	if (acet_mask & ACE_DELETE_CHILD)
+		*ace4_mask |= ACE4_DELETE_CHILD;
+	if (acet_mask & ACE_READ_ATTRIBUTES)
+		*ace4_mask |= ACE4_READ_ATTRIBUTES;
+	if (acet_mask & ACE_WRITE_ATTRIBUTES)
+		*ace4_mask |= ACE4_WRITE_ATTRIBUTES;
+	if (acet_mask & ACE_DELETE)
+		*ace4_mask |= ACE4_DELETE;
+	if (acet_mask & ACE_READ_ACL)
+		*ace4_mask |= ACE4_READ_ACL;
+	if (acet_mask & ACE_WRITE_ACL)
+		*ace4_mask |= ACE4_WRITE_ACL;
+	if (acet_mask & ACE_WRITE_OWNER)
+		*ace4_mask |= ACE4_WRITE_OWNER;
+	if (acet_mask & ACE_SYNCHRONIZE)
+		*ace4_mask |= ACE4_SYNCHRONIZE;
+}
+
+static void
+ace4_flags_to_acet_flags(aceflag4 ace4_flags, uint16_t *acet_flags)
+{
+	*acet_flags = 0;
+	/* Map ACE4_* flags to ACE_* flags; unlisted bits are dropped. */
+	if (ace4_flags & ACE4_FILE_INHERIT_ACE)
+		*acet_flags |= ACE_FILE_INHERIT_ACE;
+	if (ace4_flags & ACE4_DIRECTORY_INHERIT_ACE)
+		*acet_flags |= ACE_DIRECTORY_INHERIT_ACE;
+	if (ace4_flags & ACE4_NO_PROPAGATE_INHERIT_ACE)
+		*acet_flags |= ACE_NO_PROPAGATE_INHERIT_ACE;
+	if (ace4_flags & ACE4_INHERIT_ONLY_ACE)
+		*acet_flags |= ACE_INHERIT_ONLY_ACE;
+	if (ace4_flags & ACE4_SUCCESSFUL_ACCESS_ACE_FLAG)
+		*acet_flags |= ACE_SUCCESSFUL_ACCESS_ACE_FLAG;
+	if (ace4_flags & ACE4_FAILED_ACCESS_ACE_FLAG)
+		*acet_flags |= ACE_FAILED_ACCESS_ACE_FLAG;
+	if (ace4_flags & ACE4_IDENTIFIER_GROUP)
+		*acet_flags |= ACE_IDENTIFIER_GROUP;
+}
+
+static void
+acet_flags_to_ace4_flags(uint16_t acet_flags, aceflag4 *ace4_flags)
+{
+	*ace4_flags = 0;
+	/* Map ACE_* flags to ACE4_* flags; unlisted bits are dropped. */
+	if (acet_flags & ACE_FILE_INHERIT_ACE)
+		*ace4_flags |= ACE4_FILE_INHERIT_ACE;
+	if (acet_flags & ACE_DIRECTORY_INHERIT_ACE)
+		*ace4_flags |= ACE4_DIRECTORY_INHERIT_ACE;
+	if (acet_flags & ACE_NO_PROPAGATE_INHERIT_ACE)
+		*ace4_flags |= ACE4_NO_PROPAGATE_INHERIT_ACE;
+	if (acet_flags & ACE_INHERIT_ONLY_ACE)
+		*ace4_flags |= ACE4_INHERIT_ONLY_ACE;
+	if (acet_flags & ACE_SUCCESSFUL_ACCESS_ACE_FLAG)
+		*ace4_flags |= ACE4_SUCCESSFUL_ACCESS_ACE_FLAG;
+	if (acet_flags & ACE_FAILED_ACCESS_ACE_FLAG)
+		*ace4_flags |= ACE4_FAILED_ACCESS_ACE_FLAG;
+	if (acet_flags & ACE_IDENTIFIER_GROUP)
+		*ace4_flags |= ACE4_IDENTIFIER_GROUP;
+}
+
 int
 vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
-    uid_t owner, gid_t group, int isdir, int isserver, int just_count)
+    uid_t owner, gid_t group, int isserver, int just_count)
 {
 	int error;
 	int i;
@@ -1865,7 +1976,7 @@ vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
 	for (i = 0; i < vs_ace4->vsa_aclcnt; i++) {
 		error = ace4_to_acet((nfsace4 *)(vs_ace4->vsa_aclentp) + i,
 		    (ace_t *)(vs_acet->vsa_aclentp) + i, owner, group,
-		    isdir, isserver, just_count);
+		    isserver, just_count);
 		if (error != 0)
 			goto out;
 	}
@@ -1879,7 +1990,7 @@ vs_ace4_to_acet(vsecattr_t *vs_ace4, vsecattr_t *vs_acet,
 
 int
 vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
-    int isdir, int isserver)
+    int isserver)
 {
 	int error = 0;
 	int i;
@@ -1900,7 +2011,7 @@ vs_acet_to_ace4(vsecattr_t *vs_acet, vsecattr_t *vs_ace4,
 
 	for (i = 0; i < vs_acet->vsa_aclcnt; i++) {
 		error = acet_to_ace4((ace_t *)(vs_acet->vsa_aclentp) + i,
-		    (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isdir, isserver);
+		    (nfsace4 *)(vs_ace4->vsa_aclentp) + i, isserver);
 		if (error != 0)
 			goto out;
 	}
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index 6ef0000ea352..6169621a73e3 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -887,8 +887,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
 		if (error != 0)
 			break;
 		if (whichacl & _ACL_ACE_ENABLED) {
-			error = vs_acet_to_ace4(&vs_native, &vs_ace4,
-			    vp->v_type == VDIR, TRUE);
+			error = vs_acet_to_ace4(&vs_native, &vs_ace4, TRUE);
 			vs_acet_destroy(&vs_native);
 		} else {
 			error = vs_aent_to_ace4(&vs_native, &vs_ace4,
@@ -968,8 +967,7 @@ rfs4_fattr4_acl(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
 
 		if (whichacl & _ACL_ACE_ENABLED) {
 			error = vs_ace4_to_acet(&vs_ace4, &vs_native,
-			    vap->va_uid, vap->va_gid, vp->v_type == VDIR, TRUE,
-			    FALSE);
+			    vap->va_uid, vap->va_gid, TRUE, FALSE);
 			if (error != 0)
 				break;
 			(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index d07cedb514c5..9ae1d0a56c86 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -11982,7 +11982,7 @@ nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
 			 * These are ace_t type entries.
 			 */
 			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
-			    vp->v_type == VDIR, FALSE);
+			    FALSE);
 			if (error)
 				return (error);
 		}
@@ -12151,7 +12151,7 @@ nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
 
 	if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
 		error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid,
-		    isdir, FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
+		    FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE));
 
 		if (error)
 			return (error);
diff --git a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
index 836297350a13..1242f94e109b 100644
--- a/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs_acl_srv.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.
+ * Copyright 2005 Sun Microsystems, Inc.
  * All rights reserved.
  * Use is subject to license terms.
  */
@@ -68,6 +68,8 @@
 #include <nfs/nfs_clnt.h>
 #include <nfs/nfs_acl.h>
 
+#include <fs/fs_subr.h>
+
 /*
  * These are the interface routines for the server side of the
  * NFS ACL server.  See the NFS ACL protocol specification
@@ -95,6 +97,25 @@ acl2_getacl(GETACL2args *args, GETACL2res *resp, struct exportinfo *exi,
 
 	error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
 
+	if (error == ENOSYS) {
+		/*
+		 * If the underlying file system doesn't support
+		 * aclent_t type acls, fabricate an acl.  This is
+		 * required in order to support existing clients
+		 * that require the call to VOP_GETSECATTR to
+		 * succeed while making the assumption that all
+		 * file systems support aclent_t type acls.  This
+		 * causes problems for servers exporting ZFS file
+		 * systems because ZFS supports ace_t type acls,
+		 * and fails (with ENOSYS) when asked for aclent_t
+		 * type acls.
+		 *
+		 * Note: if the fs_fab_acl() fails, we have other problems.
+		 * This error should be returned to the caller.
+		 */
+		error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+	}
+
 	if (error) {
 		VN_RELE(vp);
 		resp->status = puterrno(error);
@@ -454,6 +475,25 @@ acl3_getacl(GETACL3args *args, GETACL3res *resp, struct exportinfo *exi,
 
 	error = VOP_GETSECATTR(vp, &resp->resok.acl, 0, cr);
 
+	if (error == ENOSYS) {
+		/*
+		 * If the underlying file system doesn't support
+		 * aclent_t type acls, fabricate an acl.  This is
+		 * required in order to support existing clients
+		 * that require the call to VOP_GETSECATTR to
+		 * succeed while making the assumption that all
+		 * file systems support aclent_t type acls.  This
+		 * causes problems for servers exporting ZFS file
+		 * systems because ZFS supports ace_t type acls,
+		 * and fails (with ENOSYS) when asked for aclent_t
+		 * type acls.
+		 *
+		 * Note: if the fs_fab_acl() fails, we have other problems.
+		 * This error should be returned to the caller.
+		 */
+		error = fs_fab_acl(vp, &resp->resok.acl, 0, cr);
+	}
+
 	if (error)
 		goto out;
 
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c
index 79f486e9b1e1..844a3b7bb1a9 100644
--- a/usr/src/uts/common/fs/proc/prioctl.c
+++ b/usr/src/uts/common/fs/proc/prioctl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -498,7 +498,7 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
 			 */
 			t = pr_thread(pnp);	/* returns locked thread */
 			thread_unlock(t);
-			oprgetstatus(t, &un.prstat, VTOZ(vp));
+			oprgetstatus(t, &un.prstat, VTOZONE(vp));
 			prunlock(pnp);
 			if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
 				error = EFAULT;
@@ -835,7 +835,7 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
 		break;
 
 	case PIOCSTATUS:	/* get process/lwp status */
-		oprgetstatus(t, &un.prstat, VTOZ(vp));
+		oprgetstatus(t, &un.prstat, VTOZONE(vp));
 		prunlock(pnp);
 		if (copyout(&un.prstat, cmaddr, sizeof (un.prstat)))
 			error = EFAULT;
@@ -866,13 +866,13 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
 		Bprsp = thing;
 		thing = NULL;
 		prsp = Bprsp;
-		oprgetstatus(t, prsp, VTOZ(vp));
+		oprgetstatus(t, prsp, VTOZONE(vp));
 		t = p->p_tlist;
 		do {
 			ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
 			ASSERT(nlwp > 0);
 			--nlwp;
-			oprgetstatus(t, ++prsp, VTOZ(vp));
+			oprgetstatus(t, ++prsp, VTOZONE(vp));
 		} while ((t = t->t_forw) != p->p_tlist);
 		ASSERT(nlwp == 0);
 		prunlock(pnp);
@@ -2053,7 +2053,7 @@ prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag,
 			 */
 			t = pr_thread(pnp);	/* returns locked thread */
 			thread_unlock(t);
-			oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+			oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
 			prunlock(pnp);
 			if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
 				error = EFAULT;
@@ -2430,7 +2430,7 @@ prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag,
 			error = EOVERFLOW;
 			break;
 		}
-		oprgetstatus32(t, &un32.prstat, VTOZ(vp));
+		oprgetstatus32(t, &un32.prstat, VTOZONE(vp));
 		prunlock(pnp);
 		if (copyout(&un32.prstat, cmaddr, sizeof (un32.prstat)))
 			error = EFAULT;
@@ -2471,13 +2471,13 @@ prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag,
 		Bprsp = (prstatus32_t *)thing;
 		thing = NULL;
 		prsp = Bprsp;
-		oprgetstatus32(t, prsp, VTOZ(vp));
+		oprgetstatus32(t, prsp, VTOZONE(vp));
 		t = p->p_tlist;
 		do {
 			ASSERT(!(t->t_proc_flag & TP_LWPEXIT));
 			ASSERT(nlwp > 0);
 			--nlwp;
-			oprgetstatus32(t, ++prsp, VTOZ(vp));
+			oprgetstatus32(t, ++prsp, VTOZONE(vp));
 		} while ((t = t->t_forw) != p->p_tlist);
 		ASSERT(nlwp == 0);
 		prunlock(pnp);
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index dea54056c602..d12ee64e8c4e 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -709,7 +709,7 @@ pr_read_status(prnode_t *pnp, uio_t *uiop)
 	 */
 	sp = kmem_alloc(sizeof (*sp), KM_SLEEP);
 	if ((error = prlock(pnp, ZNO)) == 0) {
-		prgetstatus(pnp->pr_common->prc_proc, sp, VTOZ(PTOV(pnp)));
+		prgetstatus(pnp->pr_common->prc_proc, sp, VTOZONE(PTOV(pnp)));
 		prunlock(pnp);
 		error = pr_uioread(sp, sizeof (*sp), uiop);
 	}
@@ -753,7 +753,7 @@ pr_read_lstatus(prnode_t *pnp, uio_t *uiop)
 		if (ldp->ld_entry == NULL ||
 		    (t = ldp->ld_entry->le_thread) == NULL)
 			continue;
-		prgetlwpstatus(t, sp, VTOZ(PTOV(pnp)));
+		prgetlwpstatus(t, sp, VTOZONE(PTOV(pnp)));
 		sp = (lwpstatus_t *)((caddr_t)sp + LSPAN(lwpstatus_t));
 	}
 	prunlock(pnp);
@@ -1426,7 +1426,7 @@ pr_read_lwpstatus(prnode_t *pnp, uio_t *uiop)
 		goto out;
 	}
 
-	prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+	prgetlwpstatus(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
 	prunlock(pnp);
 
 	error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -1799,7 +1799,7 @@ pr_read_status_32(prnode_t *pnp, uio_t *uiop)
 			error = EOVERFLOW;
 		} else {
 			prgetstatus32(pnp->pr_common->prc_proc, sp,
-			    VTOZ(PTOV(pnp)));
+			    VTOZONE(PTOV(pnp)));
 			prunlock(pnp);
 			error = pr_uioread(sp, sizeof (*sp), uiop);
 		}
@@ -1852,7 +1852,7 @@ pr_read_lstatus_32(prnode_t *pnp, uio_t *uiop)
 		if (ldp->ld_entry == NULL ||
 		    (t = ldp->ld_entry->le_thread) == NULL)
 			continue;
-		prgetlwpstatus32(t, sp, VTOZ(PTOV(pnp)));
+		prgetlwpstatus32(t, sp, VTOZONE(PTOV(pnp)));
 		sp = (lwpstatus32_t *)((caddr_t)sp + LSPAN32(lwpstatus32_t));
 	}
 	prunlock(pnp);
@@ -2471,7 +2471,7 @@ pr_read_lwpstatus_32(prnode_t *pnp, uio_t *uiop)
 		goto out;
 	}
 
-	prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZ(PTOV(pnp)));
+	prgetlwpstatus32(pnp->pr_common->prc_thread, sp, VTOZONE(PTOV(pnp)));
 	prunlock(pnp);
 
 	error = pr_uioread(sp, sizeof (*sp), uiop);
@@ -4281,9 +4281,9 @@ pr_lookup_ctdir(vnode_t *dp, char *comp)
 	 * outside the zone.  (see logic in contract_status_common)
 	 */
 	if ((ct->ct_owner != p) &&
-	    !(p == VTOZ(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
-	    VTOZ(dp)->zone_uniqid == contract_getzuniqid(ct) &&
-	    VTOZ(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
+	    !(p == VTOZONE(dp)->zone_zsched && ct->ct_state < CTS_ORPHAN &&
+	    VTOZONE(dp)->zone_uniqid == contract_getzuniqid(ct) &&
+	    VTOZONE(dp)->zone_uniqid != GLOBAL_ZONEUNIQID &&
 	    ct->ct_czuniqid == GLOBAL_ZONEUNIQID)) {
 		prunlock(dpnp);
 		prfreenode(pnp);
@@ -4668,7 +4668,7 @@ pr_readdir_procdir(prnode_t *pnp, uio_t *uiop, int *eofp)
 
 	ASSERT(pnp->pr_type == PR_PROCDIR);
 
-	zoneid = VTOZ(PTOV(pnp))->zone_id;
+	zoneid = VTOZONE(PTOV(pnp))->zone_id;
 
 	if ((error = gfs_readdir_init(&gstate, PNSIZ, PRSDSIZE, uiop,
 	    PRROOTINO, PRROOTINO)) != 0)
@@ -5453,7 +5453,7 @@ pr_readdir_ctdir(prnode_t *pnp, uio_t *uiop, int *eofp)
 		return (error);
 	}
 
-	zid = VTOZ(pnp->pr_vnode)->zone_uniqid;
+	zid = VTOZONE(pnp->pr_vnode)->zone_uniqid;
 	while ((error = gfs_readdir_pred(&gstate, uiop, &n)) == 0) {
 		id_t next = contract_plookup(p, n, zid);
 		if (next == -1) {
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 1e7793ba39c7..4d562852afaa 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -720,28 +720,37 @@ vn_createat(
 		vsec.vsa_dfaclcnt = 0;
 		vsec.vsa_dfaclentp = NULL;
 		vsec.vsa_mask = VSA_DFACLCNT;
-		if (error = VOP_GETSECATTR(dvp, &vsec, 0, CRED())) {
+		error =  VOP_GETSECATTR(dvp, &vsec, 0, CRED());
+		/*
+		 * If error is ENOSYS then treat it as no error
+		 * Don't want to force all file systems to support
+		 * aclent_t style of ACL's.
+		 */
+		if (error == ENOSYS)
+			error = 0;
+		if (error) {
 			if (*vpp != NULL)
 				VN_RELE(*vpp);
 			goto out;
-		}
-
-		/*
-		 * Apply the umask if no default ACLs.
-		 */
-		if (vsec.vsa_dfaclcnt == 0)
-			vap->va_mode &= ~umask;
+		} else {
+			/*
+			 * Apply the umask if no default ACLs.
+			 */
+			if (vsec.vsa_dfaclcnt == 0)
+				vap->va_mode &= ~umask;
 
-		/*
-		 * VOP_GETSECATTR() may have allocated memory for ACLs we
-		 * didn't request, so double-check and free it if necessary.
-		 */
-		if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
-			kmem_free((caddr_t)vsec.vsa_aclentp,
-				vsec.vsa_aclcnt * sizeof (aclent_t));
-		if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
-			kmem_free((caddr_t)vsec.vsa_dfaclentp,
-				vsec.vsa_dfaclcnt * sizeof (aclent_t));
+			/*
+			 * VOP_GETSECATTR() may have allocated memory for
+			 * ACLs we didn't request, so double-check and
+			 * free it if necessary.
+			 */
+			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
+				kmem_free((caddr_t)vsec.vsa_aclentp,
+				    vsec.vsa_aclcnt * sizeof (aclent_t));
+			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
+				kmem_free((caddr_t)vsec.vsa_dfaclentp,
+				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
+		}
 	}
 
 	/*
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
new file mode 100644
index 000000000000..0a6cc7b65812
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -0,0 +1,1998 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation and algorithms used here
+ * are based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory.  This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about.  Our cache is not so simple.  At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them.  Blocks are only evictable
+ * when there are no external references active.  This makes
+ * eviction far more problematic:  we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space.  In these circumstances we are unable to adjust the cache
+ * size.  To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss.  Our model has a variable sized cache.  It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size.  So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict.  In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes).  We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists.  The arc_read() interface
+ * uses method 1, while the internal arc algorithms for
+ * adjusting the cache use method 2.  We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * arc list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table.  It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each arc state also has a mutex which is used to protect the
+ * buffer list associated with the state.  When attempting to
+ * obtain a hash table lock while holding an arc list lock you
+ * must use: mutex_tryenter() to avoid deadlock.  Also note that
+ * the "top" state mutex must be held before the "bot" state mutex.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <vm/anon.h>
+#include <sys/fs/swapnode.h>
+#endif
+#include <sys/callb.h>
+
+static kmutex_t		arc_reclaim_thr_lock;
+static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
+static uint8_t		arc_thread_exit;
+
+typedef enum arc_reclaim_strategy {
+	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
+	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
+} arc_reclaim_strategy_t;
+
+/* number of seconds before growing cache again */
+static int		arc_grow_retry = 60;
+
+static kmutex_t arc_reclaim_lock;
+static int arc_dead;
+
+/*
+ * Note that buffers can be in one of 5 states:
+ *	ARC_anon	- anonymous (discussed below)
+ *	ARC_mru_top	- recently used, currently cached
+ *	ARC_mru_bot	- recently used, no longer in cache
+ *	ARC_mfu_top	- frequently used, currently cached
+ *	ARC_mfu_bot	- frequently used, no longer in cache
+ * When there are no active references to the buffer, they
+ * are linked onto one of the lists in arc.  These are the
+ * only buffers that can be evicted or deleted.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA.  These are buffers that hold dirty block copies
+ * before they are written to stable storage.  By definition,
+ * they are "ref'd" and are considered part of arc_mru_top
+ * that cannot be freed.  Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru_top list.
+ */
+
+typedef struct arc_state {
+	list_t	list;	/* linked list of evictable buffers in state */
+	uint64_t lsize;	/* total size of buffers in the linked list */
+	uint64_t size;	/* total size of all buffers in this state */
+	uint64_t hits;
+	kmutex_t mtx;	/* protects list; lsize is updated under it */
+} arc_state_t;
+
+/* The 5 states: */
+static arc_state_t ARC_anon;
+static arc_state_t ARC_mru_top;
+static arc_state_t ARC_mru_bot;
+static arc_state_t ARC_mfu_top;
+static arc_state_t ARC_mfu_bot;
+
+static struct arc {
+	arc_state_t 	*anon;
+	arc_state_t	*mru_top;
+	arc_state_t	*mru_bot;
+	arc_state_t	*mfu_top;
+	arc_state_t	*mfu_bot;
+	uint64_t	size;		/* Actual total arc size */
+	uint64_t	p;		/* Target size (in bytes) of mru_top */
+	uint64_t	c;		/* Target size of cache (in bytes) */
+	uint64_t	c_min;		/* Minimum target cache size */
+	uint64_t	c_max;		/* Maximum target cache size */
+	uint64_t	incr;		/* Size by which to increment arc.c */
+	int64_t		size_check;
+
+	/* performance stats */
+	uint64_t	hits;
+	uint64_t	misses;
+	uint64_t	deleted;
+	uint64_t	skipped;
+	uint64_t	hash_elements;	/* entries now in hash table */
+	uint64_t	hash_elements_max;	/* approx. peak of above */
+	uint64_t	hash_collisions;	/* inserts into used bucket */
+	uint64_t	hash_chains;	/* buckets holding > 1 entry */
+	uint32_t	hash_chain_max;	/* longest chain walked on insert */
+
+	int		no_grow;	/* Don't try to grow cache size */
+} arc;
+
+/* Default amount to grow arc.incr */
+static int64_t arc_incr_size = 1024;
+
+/* > 0 ==> time to increment arc.c */
+static int64_t arc_size_check_default = -1000;
+
+static uint64_t arc_tempreserve;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+	arc_done_func_t		*acb_done;
+	void			*acb_private;
+	arc_byteswap_func_t	*acb_byteswap;
+	arc_buf_t		*acb_buf;
+	zio_t			*acb_zio_dummy;
+	arc_callback_t		*acb_next;
+};
+
+struct arc_buf_hdr {
+	/* immutable */
+	uint64_t		b_size;
+	spa_t			*b_spa;
+
+	/* protected by hash lock */
+	dva_t			b_dva;
+	uint64_t		b_birth;
+	uint64_t		b_cksum0;
+
+	arc_buf_hdr_t		*b_hash_next;
+	arc_buf_t		*b_buf;
+	uint32_t		b_flags;
+
+	kcondvar_t		b_cv;
+	arc_callback_t		*b_acb;
+
+	/* protected by arc state mutex */
+	arc_state_t		*b_state;
+	list_node_t		b_arc_node;
+
+	/* updated atomically */
+	clock_t			b_arc_access;
+
+	/* self protecting */
+	refcount_t		b_refcnt;
+};
+
+/*
+ * Private ARC flags.  These flags are private ARC only flags that will show up
+ * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
+ * be passed in as arc_flags in things like arc_read.  However, these flags
+ * should never be passed and should only be set by ARC code.  When adding new
+ * public flags, make sure not to smash the private ones.
+ */
+
+#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
+#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
+#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
+
+#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
+#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
+#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
+
+/*
+ * Hash table routines
+ */
+
+#define	HT_LOCK_PAD	64
+
+struct ht_lock {
+	kmutex_t	ht_lock;
+#ifdef _KERNEL
+	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+#define	BUF_LOCKS 256
+typedef struct buf_hash_table {
+	uint64_t ht_mask;
+	arc_buf_hdr_t **ht_table;
+	struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define	BUF_HASH_INDEX(spa, dva, birth) \
+	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define	HDR_LOCK(buf) \
+	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+static uint64_t
+buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+{
+	uintptr_t spav = (uintptr_t)spa;
+	uint8_t *vdva = (uint8_t *)dva;
+	uint64_t crc = -1ULL;
+	int i;
+
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);	/* table built? */
+	/* CRC-64 over the raw bytes of the DVA, one table lookup per byte. */
+	for (i = 0; i < sizeof (dva_t); i++)
+		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
+	/* Fold in the spa pointer (shifted right 8) and the birth value. */
+	crc ^= (spav>>8) ^ birth;
+
+	return (crc);
+}
+
+#define	BUF_EMPTY(buf)						\
+	((buf)->b_dva.dva_word[0] == 0 &&			\
+	(buf)->b_dva.dva_word[1] == 0 &&			\
+	(buf)->b_birth == 0)
+/* True iff buf's (spa, dva, birth) identity matches the arguments. */
+#define	BUF_EQUAL(spa, dva, birth, buf)				\
+	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
+	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
+	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
+
+static arc_buf_hdr_t *
+buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+{
+	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+	arc_buf_hdr_t *buf;
+
+	mutex_enter(hash_lock);
+	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+	    buf = buf->b_hash_next) {
+		if (BUF_EQUAL(spa, dva, birth, buf)) {
+			*lockp = hash_lock;	/* hit: lock stays held */
+			return (buf);
+		}
+	}
+	mutex_exit(hash_lock);		/* miss: nothing left locked */
+	*lockp = NULL;
+	return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table.  If an element equal to buf is
+ * already in the table, that element is returned and buf is not
+ * inserted; otherwise NULL is returned.  Either way *lockp is set to
+ * the bucket's hash lock, which is still held on return.
+ */
+static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
+static kthread_t *fbufs_lastthread;
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+{
+	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+	arc_buf_hdr_t *fbuf;
+	uint32_t max, i;
+
+	fbufs_lastthread = curthread;	/* written before the lock is taken */
+	*lockp = hash_lock;
+	mutex_enter(hash_lock);
+	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+	    fbuf = fbuf->b_hash_next, i++) {
+		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
+			fbufs[i] = fbuf;	/* record first few visited */
+		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
+			return (fbuf);
+	}
+	/* No match; i is the old chain length.  Push buf onto the head. */
+	buf->b_hash_next = buf_hash_table.ht_table[idx];
+	buf_hash_table.ht_table[idx] = buf;
+
+	/* collect some hash table performance data */
+	if (i > 0) {
+		atomic_add_64(&arc.hash_collisions, 1);
+		if (i == 1)
+			atomic_add_64(&arc.hash_chains, 1);
+	}
+	while (i > (max = arc.hash_chain_max) &&
+	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
+		continue;	/* lost a race on hash_chain_max; retry */
+	}
+	atomic_add_64(&arc.hash_elements, 1);
+	if (arc.hash_elements > arc.hash_elements_max)
+		atomic_add_64(&arc.hash_elements_max, 1);
+
+	return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *buf)
+{
+	arc_buf_hdr_t *fbuf, **bufp;
+	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+
+	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+	/* Find the link that points at buf, then splice buf out. */
+	bufp = &buf_hash_table.ht_table[idx];
+	while ((fbuf = *bufp) != buf) {
+		ASSERT(fbuf != NULL);	/* buf must be in this bucket */
+		bufp = &fbuf->b_hash_next;
+	}
+	*bufp = buf->b_hash_next;
+	buf->b_hash_next = NULL;
+
+	/* stats: one fewer element; one fewer chain if one entry is left */
+	atomic_add_64(&arc.hash_elements, -1);
+	if (buf_hash_table.ht_table[idx] &&
+	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+		atomic_add_64(&arc.hash_chains, -1);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+static kmem_cache_t *hdr_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+	int i;
+
+	kmem_free(buf_hash_table.ht_table,	/* ht_mask + 1 slots */
+	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+	for (i = 0; i < BUF_LOCKS; i++)
+		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+	kmem_cache_destroy(hdr_cache);	/* both created in buf_init() */
+	kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty and a new buf is
+ * requested.  Zeroes the header, sets up its refcount and cv; returns 0.
+ */
+/* ARGSUSED */
+static int
+hdr_cons(void *vbuf, void *unused, int kmflag)
+{
+	arc_buf_hdr_t *buf = vbuf;
+
+	bzero(buf, sizeof (arc_buf_hdr_t));
+	refcount_create(&buf->b_refcnt);
+	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
+	return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is no longer
+ * required.  Tears down the refcount and cv set up by hdr_cons().
+ */
+/* ARGSUSED */
+static void
+hdr_dest(void *vbuf, void *unused)
+{
+	arc_buf_hdr_t *buf = vbuf;
+
+	refcount_destroy(&buf->b_refcnt);
+	cv_destroy(&buf->b_cv);
+}
+
+void arc_kmem_reclaim(void);
+
+/*
+ * Reclaim callback -- invoked when memory is low.
+ */
+/* ARGSUSED */
+static void
+hdr_recl(void *unused)
+{
+	dprintf("hdr_recl called\n");
+	arc_kmem_reclaim();
+}
+
+static void
+buf_init(void)
+{
+	uint64_t *ct;
+	uint64_t hsize = 1ULL << 10;
+	int i, j;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average 4k block size.  The table will take up
+	 * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
+	 * pointers).
+	 */
+	while (hsize * 4096 < physmem * PAGESIZE)
+		hsize <<= 1;
+
+	buf_hash_table.ht_mask = hsize - 1;
+	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+
+	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
+	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	for (i = 0; i < 256; i++)
+		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+	for (i = 0; i < BUF_LOCKS; i++) {
+		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+		    NULL, MUTEX_DEFAULT, NULL);
+	}
+}
+
+#define	ARC_MINTIME	(hz>>4) /* 62 ms */
+
+#define	ARC_TAG		(void *)0x05201962
+
+static void
+add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+	ASSERT(MUTEX_HELD(hash_lock));
+
+	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
+	    (ab->b_state != arc.anon)) {
+
+		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+		mutex_enter(&ab->b_state->mtx);
+		ASSERT(!refcount_is_zero(&ab->b_refcnt));
+		ASSERT(list_link_active(&ab->b_arc_node));
+		list_remove(&ab->b_state->list, ab);
+		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+		ab->b_state->lsize -= ab->b_size;
+		mutex_exit(&ab->b_state->mtx);
+	}
+}
+
+static int
+remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+{
+	int cnt;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+
+	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+	    (ab->b_state != arc.anon)) {
+
+		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
+		mutex_enter(&ab->b_state->mtx);
+		ASSERT(!list_link_active(&ab->b_arc_node));
+		list_insert_head(&ab->b_state->list, ab);
+		ASSERT(ab->b_buf != NULL);
+		ab->b_state->lsize += ab->b_size;
+		mutex_exit(&ab->b_state->mtx);
+	}
+	return (cnt);
+}
+
+/*
+ * Move the supplied buffer to the indicated state.  The mutex
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
+    kmutex_t *hash_lock)
+{
+	arc_buf_t *buf;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+
+	/*
+	 * If this buffer is evictable, transfer it from the
+	 * old state list to the new state list.
+	 */
+	if (refcount_is_zero(&ab->b_refcnt)) {
+		if (ab->b_state != arc.anon) {
+			int drop_mutex = FALSE;
+
+			if (!MUTEX_HELD(&ab->b_state->mtx)) {
+				mutex_enter(&ab->b_state->mtx);
+				drop_mutex = TRUE;
+			}
+			ASSERT(list_link_active(&ab->b_arc_node));
+			list_remove(&ab->b_state->list, ab);
+			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
+			ab->b_state->lsize -= ab->b_size;
+			if (drop_mutex)
+				mutex_exit(&ab->b_state->mtx);
+		}
+		if (new_state != arc.anon) {
+			int drop_mutex = FALSE;
+
+			if (!MUTEX_HELD(&new_state->mtx)) {
+				mutex_enter(&new_state->mtx);
+				drop_mutex = TRUE;
+			}
+			list_insert_head(&new_state->list, ab);
+			ASSERT(ab->b_buf != NULL);
+			new_state->lsize += ab->b_size;
+			if (drop_mutex)
+				mutex_exit(&new_state->mtx);
+		}
+	}
+
+	ASSERT(!BUF_EMPTY(ab));
+	if (new_state == arc.anon && ab->b_state != arc.anon) {
+		buf_hash_remove(ab);
+	}
+
+	/*
+	 * If this buffer isn't being transferred to one of the MRU
+	 * states (mru_top or mru_bot), it's safe to clear its prefetch flag.
+	 */
+	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+		ab->b_flags &= ~ARC_PREFETCH;
+	}
+
+	buf = ab->b_buf;
+	if (buf == NULL) {
+		ASSERT3U(ab->b_state->size, >=, ab->b_size);
+		atomic_add_64(&ab->b_state->size, -ab->b_size);
+		/* we should only be here if we are deleting state */
+		ASSERT(new_state == arc.anon &&
+		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
+	} else while (buf) {
+		ASSERT3U(ab->b_state->size, >=, ab->b_size);
+		atomic_add_64(&ab->b_state->size, -ab->b_size);
+		atomic_add_64(&new_state->size, ab->b_size);
+		buf = buf->b_next;
+	}
+	ab->b_state = new_state;
+}
+
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag)
+{
+	arc_buf_hdr_t *hdr;
+	arc_buf_t *buf;
+
+	ASSERT3U(size, >, 0);
+	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+	ASSERT(BUF_EMPTY(hdr));
+	hdr->b_size = size;
+	hdr->b_spa = spa;
+	hdr->b_state = arc.anon;
+	hdr->b_arc_access = 0;
+	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+	buf->b_hdr = hdr;
+	buf->b_next = NULL;
+	buf->b_data = zio_buf_alloc(size);
+	hdr->b_buf = buf;
+	hdr->b_flags = 0;
+	ASSERT(refcount_is_zero(&hdr->b_refcnt));
+	(void) refcount_add(&hdr->b_refcnt, tag);
+
+	atomic_add_64(&arc.size, size);
+	atomic_add_64(&arc.anon->size, size);
+
+	return (buf);
+}
+
+static void
+arc_hdr_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(refcount_is_zero(&hdr->b_refcnt));
+	ASSERT3P(hdr->b_state, ==, arc.anon);
+
+	if (!BUF_EMPTY(hdr)) {
+		/*
+		 * We can be called with an arc state lock held,
+		 * so we can't hold a hash lock here.
+		 * ASSERT(not in hash table)
+		 */
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		bzero(&hdr->b_dva, sizeof (dva_t));
+		hdr->b_birth = 0;
+		hdr->b_cksum0 = 0;
+	}
+	if (hdr->b_buf) {
+		arc_buf_t *buf = hdr->b_buf;
+
+		ASSERT3U(hdr->b_size, >, 0);
+		zio_buf_free(buf->b_data, hdr->b_size);
+		atomic_add_64(&arc.size, -hdr->b_size);
+		ASSERT3U(arc.anon->size, >=, hdr->b_size);
+		atomic_add_64(&arc.anon->size, -hdr->b_size);
+		ASSERT3P(buf->b_next, ==, NULL);
+		kmem_cache_free(buf_cache, buf);
+		hdr->b_buf = NULL;
+	}
+	ASSERT(!list_link_active(&hdr->b_arc_node));
+	ASSERT3P(hdr->b_hash_next, ==, NULL);
+	ASSERT3P(hdr->b_acb, ==, NULL);
+	kmem_cache_free(hdr_cache, hdr);
+}
+
+void
+arc_buf_free(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+	int freeable;
+
+	mutex_enter(hash_lock);
+	if (remove_reference(hdr, hash_lock, tag) > 0) {
+		arc_buf_t **bufp = &hdr->b_buf;
+		arc_state_t *state = hdr->b_state;
+		uint64_t size = hdr->b_size;
+
+		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
+		while (*bufp != buf) {
+			ASSERT(*bufp);
+			bufp = &(*bufp)->b_next;
+		}
+		*bufp = buf->b_next;
+		mutex_exit(hash_lock);
+		zio_buf_free(buf->b_data, size);
+		atomic_add_64(&arc.size, -size);
+		kmem_cache_free(buf_cache, buf);
+		ASSERT3U(state->size, >=, size);
+		atomic_add_64(&state->size, -size);
+		return;
+	}
+
+	/* don't free buffers that are in the middle of an async write */
+	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
+	mutex_exit(hash_lock);
+
+	if (freeable)
+		arc_hdr_free(hdr);
+}
+
+int
+arc_buf_size(arc_buf_t *buf)
+{
+	return (buf->b_hdr->b_size);
+}
+
+/*
+ * Evict up to `bytes' of data from the tail of `state' (MRU-top or MFU-top),
+ * moving each header to the matching *_bot state.  Returns bytes evicted.
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, int64_t bytes)
+{
+	arc_state_t *evicted_state;
+	uint64_t bytes_evicted = 0;
+	arc_buf_hdr_t *ab, *ab_prev;
+	kmutex_t *hash_lock;
+
+	ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+	if (state == arc.mru_top)
+		evicted_state = arc.mru_bot;
+	else
+		evicted_state = arc.mfu_bot;
+
+	mutex_enter(&state->mtx);
+	mutex_enter(&evicted_state->mtx);
+	/* walk from the tail, skipping headers whose hash lock is busy */
+	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+		ab_prev = list_prev(&state->list, ab);
+		hash_lock = HDR_LOCK(ab);
+		if (mutex_tryenter(hash_lock)) {
+			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+			arc_change_state(evicted_state, ab, hash_lock);
+			zio_buf_free(ab->b_buf->b_data, ab->b_size);
+			atomic_add_64(&arc.size, -ab->b_size);
+			ASSERT3P(ab->b_buf->b_next, ==, NULL);
+			kmem_cache_free(buf_cache, ab->b_buf);
+			ab->b_buf = NULL;
+			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
+			bytes_evicted += ab->b_size;
+			mutex_exit(hash_lock);
+			if (bytes_evicted >= bytes)
+				break;
+		} else {
+			atomic_add_64(&arc.skipped, 1);
+		}
+	}
+	mutex_exit(&evicted_state->mtx);
+	mutex_exit(&state->mtx);
+
+	if (bytes_evicted < bytes)
+		dprintf("only evicted %lld bytes from %p",
+		    (longlong_t)bytes_evicted, state);
+
+	return (bytes_evicted);
+}
+
+/*
+ * Remove buffers from list until we've removed the specified number of
+ * bytes (a negative count removes everything).  Destroy removed buffers.
+ */
+static void
+arc_delete_state(arc_state_t *state, int64_t bytes)
+{
+	uint_t bufs_skipped = 0;
+	uint64_t bytes_deleted = 0;
+	arc_buf_hdr_t *ab, *ab_prev;
+	kmutex_t *hash_lock;
+
+top:
+	mutex_enter(&state->mtx);
+	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
+		ab_prev = list_prev(&state->list, ab);
+		hash_lock = HDR_LOCK(ab);
+		if (mutex_tryenter(hash_lock)) {
+			arc_change_state(arc.anon, ab, hash_lock);
+			mutex_exit(hash_lock);
+			atomic_add_64(&arc.deleted, 1);
+			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
+			bytes_deleted += ab->b_size;
+			arc_hdr_free(ab);
+			if (bytes >= 0 && bytes_deleted >= bytes)
+				break;
+		} else {
+			if (bytes < 0) {
+				mutex_exit(&state->mtx);
+				mutex_enter(hash_lock);	/* wait for owner */
+				mutex_exit(hash_lock);
+				goto top;
+			}
+			bufs_skipped += 1;
+		}
+	}
+	mutex_exit(&state->mtx);
+
+	if (bufs_skipped) {
+		atomic_add_64(&arc.skipped, bufs_skipped);
+		ASSERT(bytes >= 0);
+	}
+	/* a negative count means "delete all": no shortfall to report */
+	if (bytes >= 0 && bytes_deleted < bytes)
+		dprintf("only deleted %lld bytes from %p",
+		    (longlong_t)bytes_deleted, state);
+}
+
+static void
+arc_adjust(void)
+{
+	int64_t top_sz, mru_over, arc_over;
+
+	top_sz = arc.anon->size + arc.mru_top->size;
+
+	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
+		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
+		(void) arc_evict_state(arc.mru_top, toevict);
+		top_sz = arc.anon->size + arc.mru_top->size;
+	}
+
+	mru_over = top_sz + arc.mru_bot->size - arc.c;
+
+	if (mru_over > 0) {
+		if (arc.mru_bot->lsize > 0) {
+			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
+			arc_delete_state(arc.mru_bot, todelete);
+		}
+	}
+
+	if ((arc_over = arc.size - arc.c) > 0) {
+		int64_t table_over;
+
+		if (arc.mfu_top->lsize > 0) {
+			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
+			(void) arc_evict_state(arc.mfu_top, toevict);
+		}
+
+		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
+		    - arc.c*2;
+
+		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
+			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
+			arc_delete_state(arc.mfu_bot, todelete);
+		}
+	}
+}
+
+/*
+ * Flush all *evictable* data from the cache.
+ * NOTE: this will not touch "active" (i.e. referenced) data.
+ */
+void
+arc_flush(void)
+{
+	arc_delete_state(arc.mru_top, -1);
+	arc_delete_state(arc.mfu_top, -1);
+
+	arc_delete_state(arc.mru_bot, -1);
+	arc_delete_state(arc.mfu_bot, -1);
+}
+
+void
+arc_kmem_reclaim(void)
+{
+	/* Remove 6.25% */
+	/*
+	 * We need arc_reclaim_lock because we don't want multiple
+	 * threads trying to reclaim concurrently.
+	 */
+
+	/*
+	 * umem calls the reclaim func when we destroy the buf cache,
+	 * which is after we do arc_fini().  So we set a flag to prevent
+	 * accessing the destroyed mutexes and lists.
+	 */
+	if (arc_dead)
+		return;
+
+	mutex_enter(&arc_reclaim_lock);
+
+	atomic_add_64(&arc.c, -(arc.c >> 4));
+	if (arc.c < arc.c_min)
+		arc.c = arc.c_min;
+	atomic_add_64(&arc.p, -(arc.p >> 4));
+
+	arc_adjust();
+
+	/* Cool it for a while */
+	arc.incr = 0;
+	arc.size_check = arc_size_check_default << 3;
+
+	mutex_exit(&arc_reclaim_lock);
+}
+
+static int
+arc_reclaim_needed(void)
+{
+	uint64_t extra;
+
+#ifdef _KERNEL
+	/*
+	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
+	 */
+	extra = desfree;
+
+	/*
+	 * check that we're out of range of the pageout scanner.  It starts to
+	 * schedule paging if freemem is less than lotsfree and needfree.
+	 * lotsfree is the high-water mark for pageout, and needfree is the
+	 * number of needed free pages.  We add extra pages here to make sure
+	 * the scanner doesn't start up while we're freeing memory.
+	 */
+	if (freemem < lotsfree + needfree + extra)
+		return (1);
+
+	/*
+	 * check to make sure that swapfs has enough space so that anon
+	 * reservations can still succeed. anon_resvmem() checks that the
+	 * availrmem is greater than swapfs_minfree, and the number of reserved
+	 * swap pages.  We also add a bit of extra here just to prevent
+	 * circumstances from getting really dire.
+	 */
+	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
+		return (1);
+
+	/*
+	 * If we're on an i386 platform, it's possible that we'll exhaust the
+	 * kernel heap space before we ever run out of available physical
+	 * memory.  Most checks of the size of the heap_area compare against
+	 * tune.t_minarmem, which is the minimum available real memory that we
+	 * can have in the system.  However, this is generally fixed at 25 pages
+	 * which is so low that it's useless.  In this comparison, we seek to
+	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
+	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
+	 * free)
+	 */
+#if defined(__i386)
+	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
+	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+		return (1);
+#endif
+
+#else
+	if (spa_get_random(100) == 0)
+		return (1);
+#endif
+	return (0);
+}
+
+static void
+arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+{
+	size_t			i;
+	kmem_cache_t		*prev_cache = NULL;
+	extern kmem_cache_t	*zio_buf_cache[];
+
+	/*
+	 * An aggressive reclamation will shrink the cache size as well as reap
+	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
+	 * header-cache is reaped, so we only reap the header cache if we're
+	 * performing an aggressive reclaim.  If we're not, just clean the kmem
+	 * buffer caches.
+	 */
+	if (strat == ARC_RECLAIM_AGGR)
+		kmem_cache_reap_now(hdr_cache);
+
+	kmem_cache_reap_now(buf_cache);
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+	}
+}
+
+static void
+arc_reclaim_thread(void)
+{
+	clock_t			growtime = 0;
+	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
+	callb_cpr_t		cpr;
+
+	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+
+	mutex_enter(&arc_reclaim_thr_lock);
+	while (arc_thread_exit == 0) {
+		if (arc_reclaim_needed()) {
+
+			if (arc.no_grow) {
+				if (last_reclaim == ARC_RECLAIM_CONS) {
+					last_reclaim = ARC_RECLAIM_AGGR;
+				} else {
+					last_reclaim = ARC_RECLAIM_CONS;
+				}
+			} else {
+				arc.no_grow = TRUE;
+				last_reclaim = ARC_RECLAIM_AGGR;
+				membar_producer();
+			}
+
+			/* reset the growth delay for every reclaim */
+			growtime = lbolt + (arc_grow_retry * hz);
+
+			arc_kmem_reap_now(last_reclaim);
+
+		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
+			arc.no_grow = FALSE;
+		}
+
+		/* block until needed, or one second, whichever is shorter */
+		CALLB_CPR_SAFE_BEGIN(&cpr);
+		(void) cv_timedwait(&arc_reclaim_thr_cv,
+		    &arc_reclaim_thr_lock, (lbolt + hz));
+		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+	}
+
+	arc_thread_exit = 0;
+	cv_broadcast(&arc_reclaim_thr_cv);
+	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
+	thread_exit();
+}
+
+/*
+ * Decide whether the target cache size, arc.c (and with it the MRU target,
+ * arc.p), may be raised by up to `bytes'.  arc_access() calls this on its
+ * insertion paths and then consults arc_evict_needed(); nothing is returned
+ * here.  If arc_reclaim_needed() reports pressure, the reclaim thread is
+ * signalled and the targets are left alone.
+ */
+static void
+arc_try_grow(int64_t bytes)
+{
+	/* bump the check counter; it is reset below once it turns positive */
+	atomic_add_64((uint64_t *)&arc.size_check, 1);
+
+	/* memory is tight: wake the reclaim thread instead of growing */
+	if (arc_reclaim_needed()) {
+		cv_signal(&arc_reclaim_thr_cv);
+		return;
+	}
+
+	/* arc_reclaim_thread() sets no_grow for a while after a reclaim */
+	if (arc.no_grow)
+		return;
+
+	/*
+	 * Grow when the cache is within (2 * maxblocksize) bytes of the
+	 * target size or has already exceeded it.  The second test is
+	 * needed on its own: the subtraction in the first presumably wraps
+	 * (unsigned operands) once arc.size is larger than arc.c.
+	 */
+	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT) ||
+	    arc.size > arc.c) {
+		/* restart the check counter and enlarge the growth step */
+		if (arc.size_check > 0) {
+			arc.size_check = arc_size_check_default;
+			atomic_add_64(&arc.incr, arc_incr_size);
+		}
+		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+		/* clamp c to c_max; p follows c only when c was not clamped */
+		if (arc.c > arc.c_max)
+			arc.c = arc.c_max;
+		else
+			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+	}
+}
+
+/*
+ * Check if the cache has reached its limits and eviction is required prior to
+ * insert.  We want to evict if memory needs reclaiming, if no_grow is set, or
+ * if the cache exceeds its target size.  Otherwise, the cache is either big
+ * enough that we can insert, or an arc_try_grow() will result in more space
+ * being made available.  Returns 1 when eviction is needed, 0 otherwise.
+ */
+static int
+arc_evict_needed(void)
+{
+	if (arc_reclaim_needed())
+		return (1);
+
+	/* growth disabled, target above its cap, or cache over target */
+	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
+		return (1);
+
+	return (0);
+}
+
+/*
+ * The state, supplied as the first argument, is going to have something
+ * inserted on its behalf. So, determine which cache must be victimized to
+ * satisfy an insertion for this state.  We have the following cases:
+ *
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * In this situation if we're out of space, but the resident size of the MFU is
+ * under the limit, victimize the MFU cache to satisfy this insertion request.
+ *
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * Here, we've used up all of the available space for the MRU, so we need to
+ * evict from our own cache instead.  Evict from the set of resident MRU
+ * entries.
+ *
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * c minus p represents the MFU space in the cache, since p is the size of the
+ * cache that is dedicated to the MRU.  In this situation there's still space on
+ * the MFU side, so the MRU side needs to be victimized.
+ *
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * MFU's resident set is consuming more space than it has been allotted.  In
+ * this situation, we must victimize our own cache, the MFU, for this insertion.
+ */
+static void
+arc_evict_for_state(arc_state_t *state, uint64_t bytes)
+{
+	arc_state_t	*victim;
+	uint64_t	in_use;
+	uint64_t	allotted;
+	uint64_t	freed;
+
+	ASSERT(state == arc.mru_top || state == arc.mfu_top);
+
+	if (state == arc.mru_top) {
+		/* bytes currently counted against the MRU target, arc.p */
+		in_use = arc.anon->size + arc.mru_top->size;
+		if (arc.p > in_use) {
+			/* case 1: MRU is below its target, take from MFU */
+			victim = arc.mfu_top;
+		} else {
+			/* case 2: MRU has used its target, take from MRU */
+			victim = arc.mru_top;
+		}
+	} else {
+		/* whatever is not dedicated to the MRU belongs to the MFU */
+		allotted = arc.c - arc.p;
+		if (allotted > arc.mfu_top->size) {
+			/* case 3: MFU still has room, take from MRU */
+			victim = arc.mru_top;
+		} else {
+			/* case 4: MFU has used its share, take from MFU */
+			victim = arc.mfu_top;
+		}
+	}
+
+	/*
+	 * Every case evicts from the chosen list in the same way.  The
+	 * eviction can fall short of the request (arc_evict_state() skips
+	 * buffers whose hash lock is held, and the list may simply be too
+	 * small); in that case follow up with an arc_adjust() pass.
+	 */
+	freed = arc_evict_state(victim, bytes);
+	if (freed < bytes)
+		arc_adjust();
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ */
+static void
+arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+{
+	int		blksz, mult;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+
+	blksz = buf->b_size;
+
+	if (buf->b_state == arc.anon) {
+		/*
+		 * This buffer is not in the cache, and does not
+		 * appear in our "ghost" list.  Add the new buffer
+		 * to the MRU state.
+		 */
+
+		arc_try_grow(blksz);
+		if (arc_evict_needed()) {
+			arc_evict_for_state(arc.mru_top, blksz);
+		}
+
+		ASSERT(buf->b_arc_access == 0);
+		buf->b_arc_access = lbolt;
+		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
+		    buf);
+		arc_change_state(arc.mru_top, buf, hash_lock);
+
+		/*
+		 * If we are using less than 2/3 of our total target
+		 * cache size, bump up the target size for the MRU
+		 * list.
+		 */
+		if (arc.size < arc.c*2/3) {
+			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
+		}
+
+	} else if (buf->b_state == arc.mru_top) {
+		/*
+		 * If this buffer is in the MRU-top state and has the prefetch
+		 * flag, the first read was actually part of a prefetch.  In
+		 * this situation, we simply want to clear the flag and return.
+		 * A subsequent access should bump this into the MFU state.
+		 */
+		if ((buf->b_flags & ARC_PREFETCH) != 0) {
+			buf->b_flags &= ~ARC_PREFETCH;
+			atomic_add_64(&arc.mru_top->hits, 1);
+			return;
+		}
+
+		/*
+		 * This buffer has been "accessed" only once so far,
+		 * but it is still in the cache. Move it to the MFU
+		 * state.
+		 */
+		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
+			/*
+			 * More than ARC_MINTIME (hz>>4 ticks, ~62ms) has
+			 * passed since we instantiated this buffer.  Move
+			 * it to the most frequently used state.
+			 */
+			buf->b_arc_access = lbolt;
+			DTRACE_PROBE1(new_state__mfu_top,
+			    arc_buf_hdr_t *, buf);
+			arc_change_state(arc.mfu_top, buf, hash_lock);
+		}
+		atomic_add_64(&arc.mru_top->hits, 1);
+	} else if (buf->b_state == arc.mru_bot) {
+		arc_state_t	*new_state;
+		/*
+		 * This buffer was accessed recently but has since been
+		 * evicted.  Move it to the MFU state, or back to the MRU
+		 * state if it carries the prefetch flag.
+		 */
+
+		if (buf->b_flags & ARC_PREFETCH) {
+			new_state = arc.mru_top;
+			DTRACE_PROBE1(new_state__mru_top,
+			    arc_buf_hdr_t *, buf);
+		} else {
+			new_state = arc.mfu_top;
+			DTRACE_PROBE1(new_state__mfu_top,
+			    arc_buf_hdr_t *, buf);
+		}
+
+		arc_try_grow(blksz);
+		if (arc_evict_needed()) {
+			arc_evict_for_state(new_state, blksz);
+		}
+
+		/* Bump up the target size of the MRU list */
+		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
+		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
+		arc.p = MIN(arc.c, arc.p + blksz * mult);
+
+		buf->b_arc_access = lbolt;
+		arc_change_state(new_state, buf, hash_lock);
+
+		atomic_add_64(&arc.mru_bot->hits, 1);
+	} else if (buf->b_state == arc.mfu_top) {
+		/*
+		 * This buffer has been accessed more than once and is
+		 * still in the cache.  Keep it in the MFU state.
+		 *
+		 * NOTE: the add_reference() that occurred when we did
+		 * the arc_read() should have kicked this off the list,
+		 * so even if it was a prefetch, it will be put back at
+		 * the head of the list when we remove_reference().
+		 */
+		atomic_add_64(&arc.mfu_top->hits, 1);
+	} else if (buf->b_state == arc.mfu_bot) {
+		/*
+		 * This buffer has been accessed more than once but has
+		 * been evicted from the cache.  Move it back to the
+		 * MFU state.
+		 */
+
+		arc_try_grow(blksz);
+		if (arc_evict_needed()) {
+			arc_evict_for_state(arc.mfu_top, blksz);
+		}
+
+		/* Bump up the target size for the MFU list */
+		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
+		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
+		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+
+		buf->b_arc_access = lbolt;
+		DTRACE_PROBE1(new_state__mfu_top,
+		    arc_buf_hdr_t *, buf);
+		arc_change_state(arc.mfu_top, buf, hash_lock);
+
+		atomic_add_64(&arc.mfu_bot->hits, 1);
+	} else {
+		ASSERT(!"invalid arc state");
+	}
+
+}
+
+/* a generic arc_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
+	arc_buf_free(buf, arg);
+}
+
+/* a generic arc_done_func_t which you can use */
+void
+arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	arc_buf_t **bufp = arg;
+	if (zio && zio->io_error) {
+		arc_buf_free(buf, arg);
+		*bufp = NULL;
+	} else {
+		*bufp = buf;
+	}
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+	arc_buf_hdr_t	*hdr;
+	arc_buf_t	*buf;
+	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
+	kmutex_t	*hash_lock;
+	arc_callback_t	*callback_list, *acb;
+	int		freeable = FALSE;
+
+	buf = zio->io_private;
+	hdr = buf->b_hdr;
+
+	if (!HDR_FREED_IN_READ(hdr)) {
+		arc_buf_hdr_t *found;
+
+		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
+		    &hash_lock);
+
+		/*
+		 * Buffer was inserted into hash-table and removed from lists
+		 * prior to starting I/O.  We should find this header, since
+		 * it's in the hash table, and it should be legit since it's
+		 * not possible to evict it during the I/O.
+		 */
+
+		ASSERT(found);
+		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
+	}
+
+	/* byteswap if necessary */
+	callback_list = hdr->b_acb;
+	ASSERT(callback_list != NULL);
+	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
+		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+
+	/* create copies of the data buffer for the callers */
+	abuf = buf;
+	for (acb = callback_list; acb; acb = acb->acb_next) {
+		if (acb->acb_done) {
+			if (abuf == NULL) {
+				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+				abuf->b_data = zio_buf_alloc(hdr->b_size);
+				atomic_add_64(&arc.size, hdr->b_size);
+				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+				abuf->b_hdr = hdr;
+				abuf->b_next = hdr->b_buf;
+				hdr->b_buf = abuf;
+				atomic_add_64(&hdr->b_state->size, hdr->b_size);
+			}
+			acb->acb_buf = abuf;
+			abuf = NULL;
+		} else {
+			/*
+			 * The caller did not provide a callback function.
+			 * In this case, we should just remove the reference.
+			 */
+			if (HDR_FREED_IN_READ(hdr)) {
+				ASSERT3P(hdr->b_state, ==, arc.anon);
+				(void) refcount_remove(&hdr->b_refcnt,
+				    acb->acb_private);
+			} else {
+				(void) remove_reference(hdr, hash_lock,
+				    acb->acb_private);
+			}
+		}
+	}
+	hdr->b_acb = NULL;
+	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+
+	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+
+	if (zio->io_error != 0) {
+		hdr->b_flags |= ARC_IO_ERROR;
+		if (hdr->b_state != arc.anon)
+			arc_change_state(arc.anon, hdr, hash_lock);
+		freeable = refcount_is_zero(&hdr->b_refcnt);
+	}
+
+	if (!HDR_FREED_IN_READ(hdr)) {
+		/*
+		 * Only call arc_access on anonymous buffers.  This is because
+		 * if we've issued an I/O for an evicted buffer, we've already
+		 * called arc_access (to prevent any simultaneous readers from
+		 * getting confused).
+		 */
+		if (zio->io_error == 0 && hdr->b_state == arc.anon)
+			arc_access(hdr, hash_lock);
+		mutex_exit(hash_lock);
+	} else {
+		/*
+		 * This block was freed while we waited for the read to
+		 * complete.  It has been removed from the hash table and
+		 * moved to the anonymous state (so that it won't show up
+		 * in the cache).
+		 */
+		ASSERT3P(hdr->b_state, ==, arc.anon);
+		freeable = refcount_is_zero(&hdr->b_refcnt);
+	}
+
+	cv_broadcast(&hdr->b_cv);
+
+	/* execute each callback and free its structure */
+	while ((acb = callback_list) != NULL) {
+		if (acb->acb_done)
+			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+
+		if (acb->acb_zio_dummy != NULL) {
+			acb->acb_zio_dummy->io_error = zio->io_error;
+			zio_nowait(acb->acb_zio_dummy);
+		}
+
+		callback_list = acb->acb_next;
+		kmem_free(acb, sizeof (arc_callback_t));
+	}
+
+	if (freeable)
+		arc_hdr_free(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache.  If the block is found in the cache, invoke the provided
+ * callback immediately and return.  Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required.  If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags)
+{
+	arc_buf_hdr_t *hdr;
+	arc_buf_t *buf;
+	kmutex_t *hash_lock;
+	zio_t	*rzio;
+
+top:
+	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	if (hdr && hdr->b_buf) {
+
+		ASSERT((hdr->b_state == arc.mru_top) ||
+		    (hdr->b_state == arc.mfu_top) ||
+		    ((hdr->b_state == arc.anon) &&
+		    (HDR_IO_IN_PROGRESS(hdr))));
+
+		if (HDR_IO_IN_PROGRESS(hdr)) {
+
+			if ((arc_flags & ARC_NOWAIT) && done) {
+				arc_callback_t	*acb = NULL;
+
+				acb = kmem_zalloc(sizeof (arc_callback_t),
+				    KM_SLEEP);
+				acb->acb_done = done;
+				acb->acb_private = private;
+				acb->acb_byteswap = swap;
+				if (pio != NULL)
+					acb->acb_zio_dummy = zio_null(pio,
+					    spa, NULL, NULL, flags);
+
+				ASSERT(acb->acb_done != NULL);
+				acb->acb_next = hdr->b_acb;
+				hdr->b_acb = acb;
+				add_reference(hdr, hash_lock, private);
+				mutex_exit(hash_lock);
+				return (0);
+			} else if (arc_flags & ARC_WAIT) {
+				cv_wait(&hdr->b_cv, hash_lock);
+				mutex_exit(hash_lock);
+				goto top;
+			}
+
+			mutex_exit(hash_lock);
+			return (0);
+		}
+
+		/*
+		 * If there is already a reference on this block, create
+		 * a new copy of the data so that we will be guaranteed
+		 * that arc_release() will always succeed.
+		 */
+
+		if (done)
+			add_reference(hdr, hash_lock, private);
+		if (done && refcount_count(&hdr->b_refcnt) > 1) {
+			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+			buf->b_data = zio_buf_alloc(hdr->b_size);
+			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
+			atomic_add_64(&arc.size, hdr->b_size);
+			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
+			buf->b_hdr = hdr;
+			buf->b_next = hdr->b_buf;
+			hdr->b_buf = buf;
+			atomic_add_64(&hdr->b_state->size, hdr->b_size);
+		} else {
+			buf = hdr->b_buf;
+		}
+		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+		arc_access(hdr, hash_lock);
+		mutex_exit(hash_lock);
+		atomic_add_64(&arc.hits, 1);
+		if (done)
+			done(NULL, buf, private);
+	} else {
+		uint64_t size = BP_GET_LSIZE(bp);
+		arc_callback_t	*acb;
+
+		if (hdr == NULL) {
+			/* this block is not in the cache */
+			arc_buf_hdr_t	*exists;
+
+			buf = arc_buf_alloc(spa, size, private);
+			hdr = buf->b_hdr;
+			hdr->b_dva = *BP_IDENTITY(bp);
+			hdr->b_birth = bp->blk_birth;
+			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+			exists = buf_hash_insert(hdr, &hash_lock);
+			if (exists) {
+				/* somebody beat us to the hash insert */
+				mutex_exit(hash_lock);
+				bzero(&hdr->b_dva, sizeof (dva_t));
+				hdr->b_birth = 0;
+				hdr->b_cksum0 = 0;
+				arc_buf_free(buf, private);
+				goto top; /* restart the IO request */
+			}
+
+		} else {
+			/* this block is in the ghost cache */
+			ASSERT((hdr->b_state == arc.mru_bot) ||
+			    (hdr->b_state == arc.mfu_bot));
+			add_reference(hdr, hash_lock, private);
+
+			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+			buf->b_data = zio_buf_alloc(hdr->b_size);
+			atomic_add_64(&arc.size, hdr->b_size);
+			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+			buf->b_hdr = hdr;
+			buf->b_next = NULL;
+			hdr->b_buf = buf;
+		}
+
+		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+		acb->acb_done = done;
+		acb->acb_private = private;
+		acb->acb_byteswap = swap;
+
+		ASSERT(hdr->b_acb == NULL);
+		hdr->b_acb = acb;
+
+		/*
+		 * If this DVA is part of a prefetch, mark the buf
+		 * header with the prefetch flag
+		 */
+		if (arc_flags & ARC_PREFETCH)
+			hdr->b_flags |= ARC_PREFETCH;
+		hdr->b_flags |= ARC_IO_IN_PROGRESS;
+
+		/*
+		 * If the buffer has been evicted, migrate it to a present state
+		 * before issuing the I/O.  Once we drop the hash-table lock,
+		 * the header will be marked as I/O in progress and have an
+		 * attached buffer.  At this point, anybody who finds this
+		 * buffer ought to notice that it's legit but has a pending I/O.
+		 */
+
+		if ((hdr->b_state == arc.mru_bot) ||
+		    (hdr->b_state == arc.mfu_bot))
+			arc_access(hdr, hash_lock);
+
+		mutex_exit(hash_lock);
+
+		ASSERT3U(hdr->b_size, ==, size);
+		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
+		    uint64_t, size);
+		atomic_add_64(&arc.misses, 1);
+		rzio = zio_read(pio, spa, bp, buf->b_data, size,
+		    arc_read_done, buf, priority, flags);
+
+		if (arc_flags & ARC_WAIT)
+			return (zio_wait(rzio));
+
+		ASSERT(arc_flags & ARC_NOWAIT);
+		zio_nowait(rzio);
+	}
+	return (0);
+}
+
+/*
+ * arc_read() variant to support pool traversal.  If the block is already
+ * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
+ * The idea is that we don't want pool traversal filling up memory, but
+ * if the ARC already has the data anyway, we shouldn't pay for the I/O.
+ */
+int
+arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
+{
+	kmutex_t *lock;
+	arc_buf_hdr_t *ab;
+	int error = ENOENT;	/* a miss unless a settled copy is found below */
+
+	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &lock);
+
+	if (ab != NULL && ab->b_buf != NULL && !HDR_IO_IN_PROGRESS(ab)) {
+		bcopy(ab->b_buf->b_data, data, ab->b_size);
+		error = 0;
+	}
+	/* release the hash lock if the lookup left one for us to drop */
+	if (lock != NULL)
+		mutex_exit(lock);
+
+	return (error);
+}
+
+/*
+ * Release this buffer from the cache.  This must be done
+ * after a read and prior to modifying the buffer contents.
+ * If the header has more than one reference, the buffer is moved
+ * onto a new anonymous hdr; otherwise hdr itself becomes anonymous.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+	/* this buffer is not on any list */
+	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+
+	if (hdr->b_state == arc.anon) {
+		/* this buffer is already released */
+		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+		ASSERT(BUF_EMPTY(hdr));
+		return;
+	}
+
+	mutex_enter(hash_lock);
+
+	if (refcount_count(&hdr->b_refcnt) > 1) {
+		arc_buf_hdr_t *nhdr;
+		arc_buf_t **bufp;
+		uint64_t blksz = hdr->b_size;
+		spa_t *spa = hdr->b_spa;
+
+		/*
+		 * Pull the data off of this buf and attach it to
+		 * a new anonymous buf.
+		 */
+		bufp = &hdr->b_buf;
+		while (*bufp != buf) {
+			ASSERT(*bufp);
+			bufp = &(*bufp)->b_next;
+		}
+		*bufp = (*bufp)->b_next;	/* unlink buf from hdr's chain */
+		(void) refcount_remove(&hdr->b_refcnt, tag);
+		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
+		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+		mutex_exit(hash_lock);
+		/* the new header is filled in after the hash lock is dropped */
+		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+		nhdr->b_size = blksz;
+		nhdr->b_spa = spa;
+		nhdr->b_buf = buf;
+		nhdr->b_state = arc.anon;
+		nhdr->b_arc_access = 0;
+		nhdr->b_flags = 0;
+		buf->b_hdr = nhdr;
+		buf->b_next = NULL;
+		(void) refcount_add(&nhdr->b_refcnt, tag);
+		atomic_add_64(&arc.anon->size, blksz);
+
+		hdr = nhdr;
+	} else {
+		/* sole reference: convert hdr in place and wipe its identity */
+		ASSERT(!list_link_active(&hdr->b_arc_node));
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		arc_change_state(arc.anon, hdr, hash_lock);
+		hdr->b_arc_access = 0;
+		mutex_exit(hash_lock);
+		bzero(&hdr->b_dva, sizeof (dva_t));
+		hdr->b_birth = 0;
+		hdr->b_cksum0 = 0;
+	}
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+	return (arc.anon == buf->b_hdr->b_state);	/* anon means released */
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+	arc_buf_t *buf;
+	arc_buf_hdr_t *hdr;
+	arc_callback_t *acb;
+	/* completion side of arc_write(): take back the buf and callback */
+	buf = zio->io_private;
+	hdr = buf->b_hdr;
+	acb = hdr->b_acb;
+	hdr->b_acb = NULL;
+
+	/* this buffer is on no lists and is not in the hash table */
+	ASSERT3P(hdr->b_state, ==, arc.anon);
+	/* give the header the identity of the block that was just written */
+	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+	hdr->b_birth = zio->io_bp->blk_birth;
+	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+	/* clear the "in-write" flag */
+	hdr->b_hash_next = NULL;
+	/* This write may be all-zero */
+	if (!BUF_EMPTY(hdr)) {
+		arc_buf_hdr_t *exists;
+		kmutex_t *hash_lock;
+
+		exists = buf_hash_insert(hdr, &hash_lock);
+		if (exists) {
+			/*
+			 * This can only happen if we overwrite for
+			 * sync-to-convergence, because we remove
+			 * buffers from the hash table when we arc_free().
+			 */
+			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+			    BP_IDENTITY(zio->io_bp)));
+			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
+			    zio->io_bp->blk_birth);
+			/* evict the stale, unreferenced header and retry */
+			ASSERT(refcount_is_zero(&exists->b_refcnt));
+			arc_change_state(arc.anon, exists, hash_lock);
+			mutex_exit(hash_lock);
+			arc_hdr_free(exists);
+			exists = buf_hash_insert(hdr, &hash_lock);
+			ASSERT3P(exists, ==, NULL);
+		}
+		arc_access(hdr, hash_lock);
+		mutex_exit(hash_lock);
+	}
+	if (acb && acb->acb_done) {
+		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+		acb->acb_done(zio, buf, acb->acb_private);
+	}
+	/* acb was allocated by arc_write() with kmem_zalloc() */
+	if (acb)
+		kmem_free(acb, sizeof (arc_callback_t));
+}
+
+int
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags)
+{
+	arc_buf_hdr_t *ab = buf->b_hdr;
+	arc_callback_t *cb;
+	zio_t *wzio;
+
+	/* the caller owns an anonymous, identity-less buffer; no lock taken */
+	ASSERT3P(ab->b_state, ==, arc.anon);
+	ASSERT(BUF_EMPTY(ab));
+	ASSERT(!HDR_IO_ERROR(ab));
+	cb = kmem_zalloc(sizeof (*cb), KM_SLEEP);
+	cb->acb_byteswap = (arc_byteswap_func_t *)-1;
+	cb->acb_private = private;
+	cb->acb_done = done;
+	ab->b_acb = cb;		/* picked up again in arc_write_done() */
+	wzio = zio_write(pio, spa, checksum, compress, txg, bp,
+	    buf->b_data, ab->b_size, arc_write_done, buf, priority, flags);
+
+	if ((arc_flags & ARC_WAIT) == 0) {
+		ASSERT(arc_flags & ARC_NOWAIT);
+		zio_nowait(wzio);
+		return (0);
+	}
+
+	return (zio_wait(wzio));
+}
+
+int
+arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+	arc_buf_hdr_t *ab;
+	kmutex_t *hash_lock;
+	zio_t	*zio;
+
+	/*
+	 * If this buffer is in the cache, release it, so it
+	 * can be re-used.
+	 */
+	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+	if (ab != NULL) {
+		/*
+		 * The checksum of blocks to free is not always
+		 * preserved (eg. on the deadlist).  However, if it is
+		 * nonzero, it should match what we have in the cache.
+		 */
+		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
+		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
+		arc_change_state(arc.anon, ab, hash_lock);
+		if (refcount_is_zero(&ab->b_refcnt)) {
+			/* nobody holds it: destroy the header outright */
+			mutex_exit(hash_lock);
+			arc_hdr_free(ab);
+			atomic_add_64(&arc.deleted, 1);
+		} else {
+			/* still held once: keep it, but strip its identity */
+			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+			if (HDR_IO_IN_PROGRESS(ab))
+				ab->b_flags |= ARC_FREED_IN_READ;
+			ab->b_arc_access = 0;
+			bzero(&ab->b_dva, sizeof (dva_t));
+			ab->b_birth = 0;
+			ab->b_cksum0 = 0;
+			mutex_exit(hash_lock);
+		}
+	}
+	/* the free itself is always issued, cached or not */
+	zio = zio_free(pio, spa, txg, bp, done, private);
+
+	if (arc_flags & ARC_WAIT)
+		return (zio_wait(zio));
+
+	ASSERT(arc_flags & ARC_NOWAIT);
+	zio_nowait(zio);
+
+	return (0);
+}
+
+void
+arc_tempreserve_clear(uint64_t tempreserve)
+{
+	atomic_add_64(&arc_tempreserve, -tempreserve);	/* give it back */
+	ASSERT((int64_t)arc_tempreserve >= 0);	/* catch over-release */
+}
+
+int
+arc_tempreserve_space(uint64_t tempreserve)
+{
+#ifdef ZFS_DEBUG
+	/*
+	 * Once in a while, fail for no reason.  Everything should cope.
+	 */
+	if (spa_get_random(10000) == 0) {
+		dprintf("forcing random failure\n");
+		return (ERESTART);
+	}
+#endif
+	/*
+	 * Returns 0 after adding tempreserve to arc_tempreserve, or ERESTART
+	 * if that would push reserved plus anonymous data past arc.c / 4.
+	 * XXX This is kind of hacky.  The limit should be adjusted
+	 * dynamically to keep the time to sync a dataset fixed (around
+	 * 1-5 seconds?).  The check and the add are not atomic, so two
+	 * concurrent requests may both succeed when one should fail.
+	 */
+
+	ASSERT3U(tempreserve, <, arc.c/4); /* otherwise we'll loop forever */
+
+	if (arc_tempreserve + tempreserve + arc.anon->size > arc.c / 4) {
+		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
+		    "tempreserve=%lluK arc.c=%lluK\n",
+		    arc_tempreserve>>10, arc.anon->size>>10,
+		    tempreserve>>10, arc.c>>10);
+		return (ERESTART);
+	}
+	atomic_add_64(&arc_tempreserve, tempreserve);
+	return (0);
+}
+
+void
+arc_init(void)
+{
+	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+
+	/* Start out with 1/8 of all memory */
+	arc.c = physmem * PAGESIZE / 8;
+
+#ifdef _KERNEL
+	/*
+	 * On architectures where the physical memory can be larger
+	 * than the addressable space (intel in 32-bit mode), we may
+	 * need to limit the cache to 1/8 of VM size.
+	 */
+	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+#endif
+
+	/* use at least 1/32 of all memory, or 64MB, whichever is more */
+	arc.c_min = MAX(arc.c / 4, 64<<20);
+	/* use at most 3/4 of all memory, or all but 1GB, whichever is more */
+	if (arc.c * 8 >= 1<<30)
+		arc.c_max = (arc.c * 8) - (1<<30);
+	else
+		arc.c_max = arc.c_min;
+	arc.c_max = MAX(arc.c * 6, arc.c_max);
+	arc.c = arc.c_max;
+	arc.p = (arc.c >> 1);
+
+	/* if kmem_flags are set, let's try to use less memory */
+	if (kmem_debugging())
+		arc.c = arc.c / 2;
+	if (arc.c < arc.c_min)
+		arc.c = arc.c_min;
+	/* bind the five state pointers to their backing objects */
+	arc.anon = &ARC_anon;
+	arc.mru_top = &ARC_mru_top;
+	arc.mru_bot = &ARC_mru_bot;
+	arc.mfu_top = &ARC_mfu_top;
+	arc.mfu_bot = &ARC_mfu_bot;
+	/* every state except anon gets a list linked through b_arc_node */
+	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_arc_node));
+	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_arc_node));
+	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_arc_node));
+	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_arc_node));
+
+	buf_init();
+
+	arc_thread_exit = 0;
+	/* start arc_reclaim_thread; arc_fini() later asks it to exit */
+	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
+	    TS_RUN, minclsyspri);
+}
+
+void
+arc_fini(void)
+{
+	mutex_enter(&arc_reclaim_thr_lock);
+	arc_thread_exit = 1;	/* request exit ... */
+	while (arc_thread_exit != 0)	/* ... and wait for the flag to clear */
+		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
+	mutex_exit(&arc_reclaim_thr_lock);
+	/* NOTE(review): presumably arc_reclaim_thread clears the flag; confirm */
+	arc_flush();
+
+	arc_dead = TRUE;
+	/* tear down in roughly the reverse order of arc_init() */
+	mutex_destroy(&arc_reclaim_lock);
+	mutex_destroy(&arc_reclaim_thr_lock);
+	cv_destroy(&arc_reclaim_thr_cv);
+
+	list_destroy(&arc.mru_top->list);
+	list_destroy(&arc.mru_bot->list);
+	list_destroy(&arc.mfu_top->list);
+	list_destroy(&arc.mfu_bot->list);
+
+	buf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
new file mode 100644
index 000000000000..68f79ac5a2e5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -0,0 +1,239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+static void
+bplist_hold(bplist_t *bpl)
+{
+	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
+	if (bpl->bpl_dbuf != NULL)
+		return;		/* already held; bpl_phys is set */
+	bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
+	    bpl->bpl_object, bpl);
+	dmu_buf_read(bpl->bpl_dbuf);
+	bpl->bpl_phys = bpl->bpl_dbuf->db_data;
+}
+
+uint64_t
+bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
+{
+	/*
+	 * Allocate the backing DMU_OT_BPLIST object in mos and hand the
+	 * value dmu_object_alloc() returns straight back to the caller.
+	 */
+	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
+	    DMU_OT_BPLIST_HDR, sizeof (bplist_phys_t), tx));
+}
+
+void
+bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+	VERIFY(0 == dmu_object_free(mos, object, tx));	/* must succeed */
+}
+
+void
+bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+{
+	dmu_object_info_t info;
+
+	VERIFY(dmu_object_info(mos, object, &info) == 0);
+
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_dbuf == NULL);
+	ASSERT(bpl->bpl_phys == NULL);
+	ASSERT(bpl->bpl_cached_dbuf == NULL);
+	ASSERT(bpl->bpl_queue == NULL);
+	ASSERT(object != 0);
+	/* derive both shifts from the object's data block size */
+	bpl->bpl_blockshift = highbit(info.doi_data_block_size - 1);
+	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
+	bpl->bpl_object = object;
+	bpl->bpl_mos = mos;
+
+	mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_close(bplist_t *bpl)
+{
+	mutex_enter(&bpl->bpl_lock);
+
+	ASSERT(bpl->bpl_queue == NULL);
+	/* drop the data-block hold cached by iterate/enqueue, if any */
+	if (bpl->bpl_cached_dbuf != NULL) {
+		dmu_buf_rele(bpl->bpl_cached_dbuf);
+		bpl->bpl_cached_dbuf = NULL;
+	}
+	/* drop bplist_hold()'s hold; bpl_phys pointed into that buffer */
+	if (bpl->bpl_dbuf != NULL) {
+		dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+		bpl->bpl_phys = NULL;
+		bpl->bpl_dbuf = NULL;
+	}
+	mutex_exit(&bpl->bpl_lock);
+}
+
+boolean_t
+bplist_empty(bplist_t *bpl)
+{
+	boolean_t no_entries = B_TRUE;
+
+	/* a bplist with no backing object is reported as empty */
+	if (bpl->bpl_object != 0) {
+		mutex_enter(&bpl->bpl_lock);
+		bplist_hold(bpl);
+		no_entries = (bpl->bpl_phys->bpl_entries == 0);
+		mutex_exit(&bpl->bpl_lock);
+	}
+
+	return (no_entries);
+}
+
+int
+bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+	/* Copy entry *itorp into *bp and advance *itorp; ENOENT at the end. */
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+
+	if (*itorp >= bpl->bpl_phys->bpl_entries) {
+		mutex_exit(&bpl->bpl_lock);
+		return (ENOENT);
+	}
+	/* split the index into a data block number and a slot within it */
+	blk = *itorp >> bpl->bpl_bpshift;
+	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+	/* reuse the cached hold only if it is for the block we need */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_read(db);
+	bparray = db->db_data;
+	*bp = bparray[off];
+	(*itorp)++;
+	mutex_exit(&bpl->bpl_lock);
+	return (0);
+}
+
+void
+bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+{
+	uint64_t blk, off;
+	blkptr_t *bparray;
+	dmu_buf_t *db;
+	/* Append *bp as the last entry and update the entry/byte totals. */
+	ASSERT(!BP_IS_HOLE(bp));
+	mutex_enter(&bpl->bpl_lock);
+	bplist_hold(bpl);
+	/* the slot for the new entry comes from the current entry count */
+	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
+	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
+	db = bpl->bpl_cached_dbuf;
+	/* reuse the cached hold only if it is for the block we need */
+	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
+		if (db != NULL)
+			dmu_buf_rele(db);
+		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	}
+
+	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
+
+	dmu_buf_will_dirty(db, tx);
+	bparray = db->db_data;
+	bparray[off] = *bp;
+
+	/* We never need the fill count. */
+	bparray[off].blk_fill = 0;
+
+	/* The bplist will compress better if we can leave off the checksum */
+	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+	/* bpl_phys lives in bpl_dbuf (see bplist_hold), so dirty that too */
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	bpl->bpl_phys->bpl_entries++;
+	bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
+	mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * Queue a copy of *bp in memory; bplist_sync() writes queued entries later.
+ */
+void
+bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+{
+	bplist_q_t *node = kmem_alloc(sizeof (bplist_q_t), KM_SLEEP);
+
+	ASSERT(!BP_IS_HOLE(bp));
+	node->bpq_blk = *bp;
+	mutex_enter(&bpl->bpl_lock);
+	node->bpq_next = bpl->bpl_queue;	/* push onto the list head */
+	bpl->bpl_queue = node;
+	mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+{
+	bplist_q_t *bpq;
+	/* Drain the deferred queue, writing each entry with bplist_enqueue(). */
+	mutex_enter(&bpl->bpl_lock);
+	while ((bpq = bpl->bpl_queue) != NULL) {
+		bpl->bpl_queue = bpq->bpq_next;
+		mutex_exit(&bpl->bpl_lock);	/* bplist_enqueue() takes it */
+		bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+		kmem_free(bpq, sizeof (*bpq));
+		mutex_enter(&bpl->bpl_lock);
+	}
+	mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
+{
+	mutex_enter(&bpl->bpl_lock);
+	ASSERT3P(bpl->bpl_queue, ==, NULL);	/* nothing may be deferred */
+	bplist_hold(bpl);
+	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
+	dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+	bpl->bpl_phys->bpl_entries = 0;	/* reset both persistent totals */
+	bpl->bpl_phys->bpl_bytes = 0;
+	mutex_exit(&bpl->bpl_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
new file mode 100644
index 000000000000..e4b2d7f9e68d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -0,0 +1,2022 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static void dbuf_verify(dmu_buf_impl_t *db);
+static void dbuf_evict_user(dmu_buf_impl_t *db);
+static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static arc_done_func_t dbuf_read_done;
+static arc_done_func_t dbuf_write_done;
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+taskq_t *dbuf_tq;
+static kmem_cache_t *dbuf_cache;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+	dmu_buf_impl_t *dbuf = vdb;
+	/* start from all-zero, then set up the embedded sync objects */
+	bzero(dbuf, sizeof (*dbuf));
+	refcount_create(&dbuf->db_holds);
+	cv_init(&dbuf->db_changed, NULL, CV_DEFAULT, NULL);
+	mutex_init(&dbuf->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+	dmu_buf_impl_t *dbuf = vdb;	/* undo exactly what dbuf_cons() set up */
+	refcount_destroy(&dbuf->db_holds);
+	cv_destroy(&dbuf->db_changed);
+	mutex_destroy(&dbuf->db_mtx);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+	uintptr_t osv = (uintptr_t)os;
+	uint64_t crc = -1ULL;
+	/* Mix lvl, the objset address, obj and blkid via the CRC64 table. */
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+	/* the higher bits not fed through the table are folded in directly */
+	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+	return (crc);
+}
+
+/* Hash of, and identity test on, (objset, object, level, blkid). */
+#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
+#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
+	((dbuf)->db.db_object == (obj) &&		\
+	(dbuf)->db_objset == (os) &&			\
+	(dbuf)->db_level == (level) &&			\
+	(dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t obj = dn->dn_object;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *db;
+	/* Return the matching dbuf that has holds, with its db_mtx held. */
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+			mutex_enter(&db->db_mtx);
+			if (!refcount_is_zero(&db->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (db);	/* db_mtx stays held */
+			}
+			mutex_exit(&db->db_mtx);	/* no holds: skip it */
+		}
+	}
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table.  If there is already a held
+ * element equal to elem in the hash table, it is returned with its
+ * db_mtx held and the new element is not inserted.  Otherwise db is
+ * inserted and NULL is returned, with db->db_mtx left held.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_impl_t *os = db->db_objset;
+	uint64_t obj = db->db.db_object;
+	int level = db->db_level;
+	uint64_t blkid = db->db_blkid;
+	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+			mutex_enter(&dbf->db_mtx);
+			if (!refcount_is_zero(&dbf->db_holds)) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (dbf);	/* dbf->db_mtx stays held */
+			}
+			mutex_exit(&dbf->db_mtx);	/* no holds: ignore it */
+		}
+	}
+	/* no held duplicate: link db at the head of its bucket */
+	mutex_enter(&db->db_mtx);
+	db->db_hash_next = h->hash_table[idx];
+	h->hash_table[idx] = db;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, 1);
+
+	return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table.  This operation will
+ * fail if there are any existing holds on the db.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid);
+	uint64_t idx = hv & h->hash_table_mask;
+	dmu_buf_impl_t *dbf, **dbp;
+
+	/*
+	 * We mustn't hold db_mtx to maintain lock ordering:
+	 * DBUF_HASH_MUTEX > db_mtx.
+	 */
+	ASSERT(refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_dnode != NULL);
+	ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	dbp = &h->hash_table[idx];
+	while ((dbf = *dbp) != db) {
+		ASSERT(dbf != NULL);	/* db must be on this chain */
+		dbp = &dbf->db_hash_next;
+	}
+	*dbp = db->db_hash_next;
+	db->db_hash_next = NULL;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static int dbuf_evictable(dmu_buf_impl_t *db);
+static void dbuf_clear(dmu_buf_impl_t *db);
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	int evictable;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	evictable = dbuf_evictable(db);	/* callers may only pass evictable dbufs */
+	ASSERT(evictable == TRUE);
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	/* only level-0 dbufs with a registered evict function have user state */
+	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
+		return;
+	/* point the user's data pointer at db_data, call back, then clear */
+	if (db->db_d.db_user_data_ptr_ptr)
+		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
+	db->db_d.db_user_ptr = NULL;
+	db->db_d.db_user_data_ptr_ptr = NULL;
+	db->db_d.db_evict_func = NULL;
+}
+
+void
+dbuf_init(void)
+{
+	dbuf_hash_table_t *tbl = &dbuf_hash_table;
+	uint64_t nbuckets;
+	int m;
+
+	/*
+	 * Size the table as the smallest power of two with at least one
+	 * bucket per 64k of physical memory, so it stays adequate even if
+	 * all of memory were filled with dbufs averaging 64k.  That costs
+	 * totalmem*sizeof(void*)/64k bytes (128KB/GB with 8-byte pointers).
+	 */
+	for (nbuckets = 1; nbuckets * 65536 < physmem * PAGESIZE; )
+		nbuckets <<= 1;
+
+	tbl->hash_table = kmem_zalloc(nbuckets * sizeof (void *), KM_SLEEP);
+	tbl->hash_table_mask = nbuckets - 1;
+
+	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	    sizeof (dmu_buf_impl_t),
+	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
+	    TASKQ_PREPOPULATE);
+
+	for (m = 0; m < DBUF_MUTEXES; m++)
+		mutex_init(&tbl->hash_mutexes[m], NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+dbuf_fini(void)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+	/* Release everything dbuf_init() created. */
+	taskq_destroy(dbuf_tq);
+	dbuf_tq = NULL;
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_destroy(&h->hash_mutexes[i]);
+	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+	kmem_cache_destroy(dbuf_cache);
+}
+
+/*
+ * Other stuff.
+ */
+
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+	int i;
+	dnode_t *dn = db->db_dnode;
+	/* Debug-only consistency checks; a no-op unless ZFS_DEBUG is defined. */
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	/* and only when the DBUF_VERIFY bit is set in zfs_flags */
+	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+		return;
+
+	ASSERT(db->db_objset != NULL);
+	if (dn == NULL) {
+		ASSERT(db->db_parent == NULL);
+		ASSERT(db->db_blkptr == NULL);
+	} else {
+		ASSERT3U(db->db.db_object, ==, dn->dn_object);
+		ASSERT3P(db->db_objset, ==, dn->dn_objset);
+		ASSERT(list_head(&dn->dn_dbufs));
+		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+	}
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+	} else {
+		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+	}
+
+	if (db->db_level == 0) {
+		void **udpp = db->db_d.db_user_data_ptr_ptr;
+		/* we can be momentarily larger in dnode_set_blksz() */
+		if (db->db_blkid != DB_BONUS_BLKID && dn) {
+			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
+		}
+		if (udpp) {
+			ASSERT((refcount_is_zero(&db->db_holds) &&
+			    *udpp == NULL) ||
+			    (!refcount_is_zero(&db->db_holds) &&
+			    *udpp == db->db.db_data));
+		}
+
+		if (IS_DNODE_DNODE(db->db.db_object)) {
+			for (i = 0; i < TXG_SIZE; i++) {
+				/*
+				 * it should only be modified in syncing
+				 * context, so make sure we only have
+				 * one copy of the data.
+				 */
+				ASSERT(db->db_d.db_data_old[i] == NULL ||
+				    db->db_d.db_data_old[i] == db->db_buf);
+			}
+		}
+	}
+
+	/* verify db->db_blkptr */
+	if (db->db_blkptr) {
+		if (db->db_parent == dn->dn_dbuf) {
+			/* db is pointed to by the dnode */
+			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+			if (IS_DNODE_DNODE(db->db.db_object))
+				ASSERT(db->db_parent == NULL);
+			else
+				ASSERT(db->db_parent != NULL);
+			ASSERT3P(db->db_blkptr, ==,
+			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		} else {
+			/* db is pointed to by an indirect block */
+			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+			ASSERT3U(db->db_parent->db.db_object, ==,
+			    db->db.db_object);
+			/*
+			 * dnode_grow_indblksz() can make this fail if we don't
+			 * have the struct_rwlock.  XXX indblksz no longer
+			 * grows.  safe to do this now?
+			 */
+			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+				ASSERT3P(db->db_blkptr, ==,
+				    ((blkptr_t *)db->db_parent->db.db_data +
+				    db->db_blkid % epb));
+			}
+		}
+	}
+	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+	    db->db_state != DB_FILL && !dn->dn_free_txg) {
+		/*
+		 * If the blkptr isn't set but they have nonzero data,
+		 * it had better be dirty, otherwise we'll lose that
+		 * data when we evict this buffer.  NOTE(review): dn is read
+		 * above without a NULL check, and i below shadows the outer i.
+		 */
+		if (db->db_dirtycnt == 0) {
+			uint64_t *buf = db->db.db_data;
+			int i;
+
+			for (i = 0; i < db->db.db_size >> 3; i++) {
+				ASSERT(buf[i] == 0);
+			}
+		}
+	}
+#endif
+}
+
+static void
+dbuf_update_data(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_level != 0 || db->db_d.db_user_data_ptr_ptr == NULL)
+		return;		/* no user data pointer to keep in sync */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(buf->b_data != NULL);
+	db->db.db_data = buf->b_data;	/* public data pointer */
+	db->db_buf = buf;		/* backing ARC buffer */
+	dbuf_update_data(db);		/* refresh any user data pointer */
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+	/* with a nonzero shift the block id is the offset scaled down by it */
+	if (dn->dn_datablkshift != 0)
+		return (offset >> dn->dn_datablkshift);
+	/* no shift: the offset is asserted to fall inside block 0 */
+	ASSERT3U(offset, <, dn->dn_datablksz);
+	return (0);
+}
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	/* arc_read() completion callback passed by dbuf_read_impl() */
+	mutex_enter(&db->db_mtx);
+	ASSERT3U(db->db_state, ==, DB_READ);
+	/*
+	 * All reads are synchronous, so we must have a hold on the dbuf
+	 */
+	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db.db_data == NULL);
+	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+		/* we were freed in flight; disregard any error */
+		arc_release(buf, db);
+		bzero(buf->b_data, db->db.db_size);
+		db->db_d.db_freed_in_flight = FALSE;
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else if (zio == NULL || zio->io_error == 0) {
+		dbuf_set_data(db, buf);	/* success, or no zio was involved */
+		db->db_state = DB_CACHED;
+	} else {
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		arc_buf_free(buf, db);	/* read failed: back to uncached */
+		db->db_state = DB_UNCACHED;
+		ASSERT3P(db->db_buf, ==, NULL);
+	}
+	cv_broadcast(&db->db_changed);	/* wake waiters on db_changed */
+	mutex_exit(&db->db_mtx);
+}
+
+void
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+	arc_buf_t *buf;
+	blkptr_t *bp;
+	/* Fill an uncached dbuf in place, or start an ARC_NOWAIT read for it. */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	/* We need the struct_rwlock to prevent db_blkptr from changing. */
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	/*
+	 * prefetch only data blocks (level 0) -- don't prefetch indirect
+	 * blocks
+	 */
+	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
+		flags |= DB_RF_NOPREFETCH;
+	}
+	/* NOTE(review): the ASSERT above already dereferenced db_dnode */
+	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
+		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+		    db->db.db_size);
+	}
+	/* unlocked fast path for an already cached dbuf */
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db.db_data != NULL);
+		return;
+	}
+
+	mutex_enter(&db->db_mtx);
+	/* any state other than DB_UNCACHED is left for its owner to finish */
+	if (db->db_state != DB_UNCACHED) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	/* the bonus buffer is copied out of the dnode; no I/O is issued */
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    DN_MAX_BONUSLEN, db);
+		if (db->db.db_size < DN_MAX_BONUSLEN)
+			bzero(buf->b_data, DN_MAX_BONUSLEN);
+		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+		    db->db.db_size);
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+	/* a level-0 block the dnode reports as freed is treated as a hole */
+	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
+		bp = NULL;
+	else
+		bp = db->db_blkptr;
+
+	if (bp == NULL)
+		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
+	else
+		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
+	/* holes are satisfied with a zero-filled buffer, again without I/O */
+	if (bp == NULL || BP_IS_HOLE(bp)) {
+		ASSERT(bp == NULL || BP_IS_HOLE(bp));	/* same as the if */
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    db->db.db_size, db));
+		bzero(db->db.db_data, db->db.db_size);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	db->db_state = DB_READ;
+	mutex_exit(&db->db_mtx);
+
+	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
+	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
+	    db->db_level > 0 ? byteswap_uint64_array :
+	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    ARC_NOWAIT);
+}
+
+static int
+dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+{
+	zio_t *zio;
+	int err;
+
+	/*
+	 * We don't have to hold the mutex to check db_state because it
+	 * can't be freed while we have a hold on the buffer.
+	 */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	if (db->db_state == DB_CACHED)
+		return (0);
+	/* uncached: issue the read under a root zio and wait for it */
+	if (db->db_state == DB_UNCACHED) {
+		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		dbuf_read_impl(db, zio, flags);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+		err = zio_wait(zio);
+		if (err)
+			return (err);
+	}
+	/* wait out any read or fill that is still in flight */
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
+		ASSERT(db->db_state == DB_READ ||
+		    (flags & DB_RF_HAVESTRUCT) == 0);
+		cv_wait(&db->db_changed, &db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	mutex_exit(&db->db_mtx);
+
+	return (0);
+}
+
+#pragma weak dmu_buf_read = dbuf_read
+void
+dbuf_read(dmu_buf_impl_t *db)
+{
+	int err;
+
+	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
+	ASSERT(err == 0);
+}
+
+#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
+int
+dbuf_read_canfail(dmu_buf_impl_t *db)
+{
+	return (dbuf_read_generic(db, DB_RF_CANFAIL));
+}
+
+void
+dbuf_read_havestruct(dmu_buf_impl_t *db)
+{
+	int err;
+
+	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
+	ASSERT(err == 0);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+	if (db->db_state == DB_UNCACHED) {
+		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+		ASSERT(db->db.db_data == NULL);
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    blksz, db));
+		db->db_state = DB_FILL;
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function.  It makes a copy of
+ * buffers that have been modified in a previous transaction
+ * group before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	arc_buf_t **quiescing, **syncing;
+	int size = (db->db_blkid == DB_BONUS_BLKID) ?
+	    DN_MAX_BONUSLEN : db->db.db_size;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+	/*
+	 * If this buffer is referenced from the current quiescing
+	 * transaction group: either make a copy and reset the reference
+	 * to point to the copy, or (if there are no active holders) just
+	 * null out the current db_data pointer.
+	 */
+	if (*quiescing == db->db_buf) {
+		/*
+		 * If the quiescing txg is "dirty", then we better not
+		 * be referencing the same buffer from the syncing txg.
+		 */
+		ASSERT(*syncing != db->db_buf);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			*quiescing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*quiescing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+		return;
+	}
+
+	/*
+	 * If this buffer is referenced from the current syncing
+	 * transaction group: either
+	 *	1 - make a copy and reset the reference, or
+	 *	2 - if there are no holders, just null the current db_data.
+	 */
+	if (*syncing == db->db_buf) {
+		ASSERT3P(*quiescing, ==, NULL);
+		ASSERT3U(db->db_dirtycnt, ==, 1);
+		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			/* we can't copy if we have already started a write */
+			ASSERT(*syncing != db->db_data_pending);
+			*syncing = arc_buf_alloc(
+			    db->db_dnode->dn_objset->os_spa, size, db);
+			bcopy(db->db.db_data, (*syncing)->b_data, size);
+		} else {
+			db->db.db_data = NULL;
+			db->db_buf = NULL;
+			db->db_state = DB_UNCACHED;
+		}
+	}
+}
+
+void
+dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		/* free this block */
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
+		    db->db_dnode->dn_free_txg == txg);
+		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
+			/* XXX can get silent EIO here */
+			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
+			    NULL, NULL, ARC_WAIT);
+		}
+		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
+		    sizeof (blkptr_t));
+		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
+		/* release the already-written buffer */
+		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+	}
+}
+
+void
+dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	uint64_t txg = tx->tx_txg;
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		db_next = list_next(&dn->dn_dbufs, db);
+		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+			continue;
+		dprintf_dbuf(db, "found buf %s\n", "");
+		if (db->db_blkid < blkid ||
+		    db->db_blkid >= blkid+nblks)
+			continue;
+
+		/* found a level 0 buffer in the range */
+		if (dbuf_undirty(db, tx))
+			continue;
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state == DB_UNCACHED) {
+			ASSERT(db->db.db_data == NULL);
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_READ) {
+			/* this will be handled in dbuf_read_done() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_FILL) {
+			/* this will be handled in dbuf_fill_done() */
+			db->db_d.db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+
+		/* make a copy of the data if necessary */
+		dbuf_fix_old_data(db, txg);
+
+		if (db->db.db_data) {
+			/* fill in with appropriate data */
+			arc_release(db->db_buf, db);
+			bzero(db->db.db_data, db->db.db_size);
+		}
+		mutex_exit(&db->db_mtx);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+static int
+dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+	uint64_t birth_txg = 0;
+
+	/* Don't count meta-objects */
+	if (ds == NULL)
+		return (FALSE);
+
+	/*
+	 * We don't need any locking to protect db_blkptr:
+	 * If it's syncing, then db_dirtied will be set so we'll
+	 * ignore db_blkptr.
+	 */
+	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
+	/* If we have been dirtied since the last snapshot, it's not new */
+	if (db->db_dirtied)
+		birth_txg = db->db_dirtied;
+	else if (db->db_blkptr)
+		birth_txg = db->db_blkptr->blk_birth;
+
+	if (birth_txg)
+		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+	else
+		return (TRUE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+	arc_buf_t *buf, *obuf;
+	int osize = db->db.db_size;
+
+	/* XXX does *this* func really need the lock? */
+	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+
+	ASSERT3U(osize, <=, size);
+	if (osize == size)
+		return;
+
+	/*
+	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+	 * is OK, because there can be no other references to the db
+	 * when we are changing its size, so no concurrent DB_FILL can
+	 * be happening.
+	 */
+	/* Make a copy of the data if necessary */
+	dbuf_will_dirty(db, tx);
+
+	/* create the data buffer for the new block */
+	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
+
+	/* copy old block data to the new block */
+	obuf = db->db_buf;
+	bcopy(obuf->b_data, buf->b_data, osize);
+	/* zero the remainder */
+	bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+	mutex_enter(&db->db_mtx);
+	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
+	dbuf_set_data(db, buf);
+	arc_buf_free(obuf, db);
+	db->db.db_size = size;
+
+	/* fix up the dirty info */
+	if (db->db_level == 0)
+		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
+	mutex_exit(&db->db_mtx);
+
+	dnode_willuse_space(db->db_dnode, size-osize, tx);
+}
+
+void
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	int drop_struct_lock = FALSE;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	dmu_tx_dirty_buf(tx, db);
+
+	/*
+	 * Shouldn't dirty a regular buffer in syncing context.  Private
+	 * objects may be dirtied in syncing context, but only if they
+	 * were already pre-dirtied in open context.
+	 * XXX We may want to prohibit dirtying in syncing context even
+	 * if they did pre-dirty.
+	 */
+	ASSERT(!(dmu_tx_is_syncing(tx) &&
+	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
+	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    dn->dn_objset->os_dsl_dataset != NULL &&
+	    !dsl_dir_is_private(
+	    dn->dn_objset->os_dsl_dataset->ds_dir)));
+
+	/*
+	 * We make this assert for private objects as well, but after we
+	 * check if we're already dirty.  They are allowed to re-dirty
+	 * in syncing context.
+	 */
+	ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+	    dn->dn_dirtyctx == DN_UNDIRTIED ||
+	    dn->dn_dirtyctx ==
+	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+	mutex_enter(&db->db_mtx);
+	/* XXX make this true for indirects too? */
+	ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
+	    db->db_state == DB_FILL);
+
+	/*
+	 * If this buffer is currently part of an "overridden" region,
+	 * we now need to remove it from that region.
+	 */
+	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+	    db->db_d.db_overridden_by[txgoff] != NULL) {
+		dbuf_unoverride(db, tx->tx_txg);
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	/*
+	 * Don't set dirtyctx to SYNC if we're just modifying this as we
+	 * initialize the objset.
+	 */
+	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
+		dn->dn_dirtyctx =
+		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+		ASSERT(dn->dn_dirtyctx_firstset == NULL);
+		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If this buffer is already dirty, we're done.
+	 */
+	if (list_link_active(&db->db_dirty_node[txgoff])) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	/*
+	 * Only valid if not already dirty.
+	 */
+	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+	ASSERT3U(dn->dn_nlevels, >, db->db_level);
+	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+	    dn->dn_phys->dn_nlevels > db->db_level ||
+	    dn->dn_next_nlevels[txgoff] > db->db_level ||
+	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+	/*
+	 * We should only be dirtying in syncing context if it's the
+	 * mos, a spa os, or we're initializing the os.  However, we are
+	 * allowed to dirty in syncing context provided we already
+	 * dirtied it in open context.  Hence we must make this
+	 * assertion only if we're not already dirty.
+	 */
+	ASSERT(!dmu_tx_is_syncing(tx) ||
+	    os->os_dsl_dataset == NULL ||
+	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
+	    !BP_IS_HOLE(&os->os_rootbp));
+	ASSERT(db->db.db_size != 0);
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	if (db->db_level == 0) {
+		/*
+		 * Release the data buffer from the cache so that we
+		 * can modify it without impacting possible other users
+		 * of this cached data block.  Note that indirect blocks
+		 * and private objects are not released until the syncing
+		 * state (since they are only modified then).
+		 *
+		 * If this buffer is dirty in an old transaction group we need
+		 * to make a copy of it so that the changes we make in this
+		 * transaction group won't leak out when we sync the older txg.
+		 */
+		ASSERT(db->db_buf != NULL);
+		ASSERT(db->db.db_data != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+		if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+			arc_release(db->db_buf, db);
+			dbuf_fix_old_data(db, tx->tx_txg);
+			ASSERT(db->db_buf != NULL);
+		}
+		db->db_d.db_data_old[txgoff] = db->db_buf;
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	/*
+	 * We could have been freed_in_flight between the dbuf_noread
+	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
+	 * happened after the free.
+	 */
+	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		db->db_d.db_freed_in_flight = FALSE;
+	}
+
+	db->db_dirtied = tx->tx_txg;
+	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If writing this buffer will consume a new block on disk,
+	 * then update the accounting.
+	 */
+	if (db->db_blkid != DB_BONUS_BLKID) {
+		if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+			/*
+			 * This is only a guess -- if the dbuf is dirty
+			 * in a previous txg, we don't know how much
+			 * space it will use on disk yet.  We should
+			 * really have the struct_rwlock to access
+			 * db_blkptr, but since this is just a guess,
+			 * it's OK if we get an odd answer.
+			 */
+			dnode_willuse_space(dn,
+			    -BP_GET_ASIZE(db->db_blkptr), tx);
+		}
+		dnode_willuse_space(dn, db->db.db_size, tx);
+	}
+
+	/*
+	 * This buffer is now part of this txg
+	 */
+	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+	db->db_dirtycnt += 1;
+	ASSERT3U(db->db_dirtycnt, <=, 3);
+
+	mutex_exit(&db->db_mtx);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		dnode_setdirty(dn, tx);
+		return;
+	}
+
+	if (db->db_level == 0)
+		dnode_new_blkid(dn, db->db_blkid, tx);
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (db->db_level < dn->dn_nlevels-1) {
+		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+		dmu_buf_impl_t *parent;
+		parent = dbuf_hold_level(dn, db->db_level+1,
+		    db->db_blkid >> epbs, FTAG);
+		if (drop_struct_lock)
+			rw_exit(&dn->dn_struct_rwlock);
+		dbuf_dirty(parent, tx);
+		dbuf_remove_ref(parent, FTAG);
+	} else {
+		if (drop_struct_lock)
+			rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	dnode_setdirty(dn, tx);
+}
+
+static int
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(tx->tx_txg != 0);
+
+	mutex_enter(&db->db_mtx);
+
+	/*
+	 * If this buffer is not dirty, we're done.
+	 */
+	if (!list_link_active(&db->db_dirty_node[txgoff])) {
+		mutex_exit(&db->db_mtx);
+		return (0);
+	}
+
+	/*
+	 * If this buffer is currently held, we cannot undirty
+	 * it, since one of the current holders may be in the
+	 * middle of an update.  Note that users of dbuf_undirty()
+	 * should not place a hold on the dbuf before the call.
+	 * XXX - this check assumes we are being called from
+	 * dbuf_free_range(), perhaps we should move it there?
+	 */
+	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		mutex_exit(&db->db_mtx);
+		mutex_enter(&dn->dn_mtx);
+		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		mutex_exit(&dn->dn_mtx);
+		return (0);
+	}
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	dbuf_unoverride(db, tx->tx_txg);
+
+	ASSERT(db->db.db_size != 0);
+	if (db->db_level == 0) {
+		ASSERT(db->db_buf != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
+		if (db->db_d.db_data_old[txgoff] != db->db_buf)
+			arc_buf_free(db->db_d.db_data_old[txgoff], db);
+		db->db_d.db_data_old[txgoff] = NULL;
+	}
+
+	/* XXX would be nice to fix up dn_towrite_space[] */
+	/* XXX undo db_dirtied? but how? */
+	/* db->db_dirtied = tx->tx_txg; */
+
+	mutex_enter(&dn->dn_mtx);
+	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+
+	if (refcount_remove(&db->db_holds,
+	    (void *)(uintptr_t)tx->tx_txg) == 0) {
+		/* make dbuf_verify() happy */
+		if (db->db.db_data)
+			bzero(db->db.db_data, db->db.db_size);
+
+		dbuf_evict(db);
+		return (1);
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (0);
+}
+
+#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
+void
+dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	int rf = DB_RF_MUST_SUCCEED;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+		rf |= DB_RF_HAVESTRUCT;
+	(void) dbuf_read_generic(db, rf);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_will_fill = dbuf_will_fill
+void
+dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_level == 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+	    dmu_tx_private_ok(tx));
+
+	dbuf_noread(db);
+	dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	if (db->db_state == DB_FILL) {
+		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+			/* we were freed while filling */
+			/* XXX dbuf_undirty? */
+			bzero(db->db.db_data, db->db.db_size);
+			db->db_d.db_freed_in_flight = FALSE;
+		}
+		db->db_state = DB_CACHED;
+		cv_broadcast(&db->db_changed);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+
+static void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db_buf != NULL);
+		arc_buf_free(db->db_buf, db);
+		db->db.db_data = NULL;
+		db->db_buf = NULL;
+		db->db_state = DB_UNCACHED;
+	}
+
+	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * If this dbuf is referenced from an indirect dbuf,
+	 * decrement the ref count on the indirect dbuf.
+	 */
+	if (db->db_parent && db->db_parent != dn->dn_dbuf)
+		dbuf_remove_ref(db->db_parent, db);
+
+	/* remove from dn_dbufs */
+	list_remove(&dn->dn_dbufs, db);
+
+	dnode_rele(dn, db);
+
+	dbuf_hash_remove(db);
+
+	db->db_dnode = NULL;
+	db->db_parent = NULL;
+	db->db_blkptr = NULL;
+}
+
+static int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+    dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+	int nlevels, epbs;
+
+	if (dn->dn_phys->dn_nlevels == 0)
+		nlevels = 1;
+	else
+		nlevels = dn->dn_phys->dn_nlevels;
+
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	ASSERT3U(level * epbs, <, 64);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	if (blkid == DB_BONUS_BLKID) {
+		/* this is the bonus buffer */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (0);
+	} else if (level >= nlevels ||
+	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+		/* the buffer has no parent yet */
+		*parentp = NULL;
+		*bpp = NULL;
+		return (ENOENT);
+	} else if (level < nlevels-1) {
+		/* this block is referenced from an indirect block */
+		int err = dbuf_hold_impl(dn, level+1,
+		    blkid >> epbs, fail_sparse, NULL, parentp);
+		if (err)
+			return (err);
+		dbuf_read_havestruct(*parentp);
+		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+		    (blkid & ((1ULL << epbs) - 1));
+		return (0);
+	} else {
+		/* the block is referenced from the dnode */
+		ASSERT3U(level, ==, nlevels-1);
+		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+		    blkid < dn->dn_phys->dn_nblkptr);
+		*parentp = dn->dn_dbuf;
+		*bpp = &dn->dn_phys->dn_blkptr[blkid];
+		return (0);
+	}
+}
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dmu_buf_impl_t *db, *odb;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+	db->db_objset = os;
+	db->db.db_object = dn->dn_object;
+	db->db_level = level;
+	db->db_blkid = blkid;
+	db->db_state = DB_UNCACHED;
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		db->db.db_size = dn->dn_bonuslen;
+		db->db.db_offset = DB_BONUS_BLKID;
+	} else {
+		int blocksize =
+		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
+		db->db.db_size = blocksize;
+		db->db.db_offset = db->db_blkid * blocksize;
+	}
+
+	db->db_dirtied = 0;
+	db->db_dirtycnt = 0;
+
+	bzero(&db->db_d, sizeof (db->db_d));
+
+	/*
+	 * Hold the dn_dbufs_mtx while we get the new dbuf
+	 * in the hash table *and* added to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before it's added to the
+	 * dn_dbufs list.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	if ((odb = dbuf_hash_insert(db)) != NULL) {
+		/* someone else inserted it first */
+		kmem_cache_free(dbuf_cache, db);
+		mutex_exit(&dn->dn_dbufs_mtx);
+		return (odb);
+	}
+	list_insert_head(&dn->dn_dbufs, db);
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_add_ref(parent, db);
+
+	(void) refcount_add(&dn->dn_holds, db);
+
+	db->db_dnode = dn;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
+
+	dprintf_dbuf(db, "db=%p\n", db);
+
+	return (db);
+}
+
+static int
+dbuf_evictable(dmu_buf_impl_t *db)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	dbuf_verify(db);
+
+	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
+		return (FALSE);
+
+	if (!refcount_is_zero(&db->db_holds))
+		return (FALSE);
+
+#ifdef ZFS_DEBUG
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&db->db_dirty_node[i]));
+		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	}
+#endif
+
+	/*
+	 * Now we know we want to free it.
+	 * This call must be done last, since it has side effects -
+	 * calling the db_evict_func().
+	 */
+	dbuf_evict_user(db);
+	return (TRUE);
+}
+
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_dnode == NULL);
+	ASSERT(db->db_parent == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	kmem_cache_free(dbuf_cache, db);
+}
+
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+	blkptr_t *bp = NULL;
+
+	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	if (dnode_block_freed(dn, blkid))
+		return;		/* freed block: nothing to prefetch */
+
+	/* dbuf_find() returns with db_mtx held */
+	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
+		/*
+		 * This dbuf is already in the cache.  We assume that
+		 * it is already CACHED, or else about to be either
+		 * read or filled.
+		 */
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
+		if (bp && !BP_IS_HOLE(bp)) {
+			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
+			    dmu_ot[dn->dn_type].ot_byteswap,
+			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    (ARC_NOWAIT | ARC_PREFETCH));
+		}
+		if (parent && parent != dn->dn_dbuf)
+			dbuf_rele(parent); /* drop dbuf_findbp()'s hold */
+	}
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	dmu_buf_impl_t *db, *parent = NULL;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT3U(dn->dn_nlevels, >, level);
+
+	*dbp = NULL;
+
+	/* dbuf_find() returns with db_mtx held */
+	db = dbuf_find(dn, level, blkid);
+
+	if (db == NULL) {
+		blkptr_t *bp = NULL;
+		int err;
+
+		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+		if (fail_sparse) {
+			if (err == 0 && bp && BP_IS_HOLE(bp))
+				err = ENOENT;
+			if (err) {
+				if (parent && parent != dn->dn_dbuf)
+					dbuf_rele(parent);
+				return (err);
+			}
+		}
+		db = dbuf_create(dn, level, blkid, parent, bp);
+	}
+
+	/*
+	 * If this buffer is currently syncing out, and we
+	 * are still referencing it from db_data, we need to make
+	 * a copy of it in case we decide we want to dirty it
+	 * again in this txg.
+	 */
+	if (db->db_level == 0 && db->db_state == DB_CACHED &&
+	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    db->db_data_pending == db->db_buf) {
+		int size = (db->db_blkid == DB_BONUS_BLKID) ?
+		    DN_MAX_BONUSLEN : db->db.db_size;
+
+		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		    size, db));
+		bcopy(db->db_data_pending->b_data, db->db.db_data,
+		    db->db.db_size);
+	}
+
+	dbuf_add_ref(db, tag);
+	dbuf_update_data(db);
+	dbuf_verify(db);
+	mutex_exit(&db->db_mtx);
+
+	/* NOTE: we can't rele the parent until after we drop the db_mtx */
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_rele(parent);
+
+	ASSERT3P(db->db_dnode, ==, dn);
+	ASSERT3U(db->db_blkid, ==, blkid);
+	ASSERT3U(db->db_level, ==, level);
+	*dbp = db;
+
+	return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
+	return (db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *db;
+	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_bonus(dnode_t *dn, void *tag)
+{
+	dmu_buf_impl_t *db;
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
+	rw_exit(&dn->dn_struct_rwlock);
+	return (db);
+}
+
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+	(void) refcount_add(&db->db_holds, tag);
+	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+}
+
+void
+dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+{
+	int64_t holds;
+	dnode_t *dn = db->db_dnode;
+	int need_mutex;
+
+	ASSERT(dn != NULL);
+	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+
+	if (need_mutex) {
+		dnode_add_ref(dn, FTAG);
+		mutex_enter(&dn->dn_dbufs_mtx);
+	}
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify(db);
+
+	holds = refcount_remove(&db->db_holds, tag);
+
+	if (holds == 0) {
+		ASSERT3U(db->db_state, !=, DB_FILL);
+		if (db->db_level == 0 &&
+		    db->db_d.db_user_data_ptr_ptr != NULL)
+			*db->db_d.db_user_data_ptr_ptr = NULL;
+		dbuf_evict(db);
+	} else {
+		if (holds == db->db_dirtycnt &&
+		    db->db_level == 0 && db->db_d.db_immediate_evict)
+			dbuf_evict_user(db);
+		mutex_exit(&db->db_mtx);
+	}
+
+	if (need_mutex) {
+		mutex_exit(&dn->dn_dbufs_mtx);
+		dnode_rele(dn, FTAG);
+	}
+}
+
+void
+dbuf_rele(dmu_buf_impl_t *db)
+{
+	dbuf_remove_ref(db, NULL);
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+	return (refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_d.db_immediate_evict = TRUE;
+	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
+	    user_data_ptr_ptr, evict_func));
+}
+
+void *
+dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
+    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(db->db_level == 0);
+
+	ASSERT((user_ptr == NULL) == (evict_func == NULL));
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_d.db_user_ptr == old_user_ptr) {
+		db->db_d.db_user_ptr = user_ptr;
+		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
+		db->db_d.db_evict_func = evict_func;
+
+		dbuf_update_data(db);
+	} else {
+		old_user_ptr = db->db_d.db_user_ptr;
+	}
+
+	mutex_exit(&db->db_mtx);
+	return (old_user_ptr);
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	return (db->db_d.db_user_ptr);
+}
+
+void
+dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
+{
+	arc_buf_t **data;
+	uint64_t txg = tx->tx_txg;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	int blksz;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * To be synced, we must be dirtied.  But we
+	 * might have been freed after the dirty.
+	 */
+	if (db->db_state == DB_UNCACHED) {
+		/* This buffer has been freed since it was dirtied */
+		ASSERT(db->db.db_data == NULL);
+	} else if (db->db_state == DB_FILL) {
+		/* This buffer was freed and is now being re-filled */
+		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	dbuf_verify(db);
+
+	/*
+	 * Don't need a lock on db_dirty (dn_mtx), because it can't
+	 * be modified yet.
+	 */
+
+	if (db->db_level == 0) {
+		data = &db->db_d.db_data_old[txg&TXG_MASK];
+		blksz = arc_buf_size(*data);
+		/*
+		 * If this buffer is currently "in use" (i.e., there are
+		 * active holds and db_data still references it), then make
+		 * a copy before we start the write so that any modifications
+		 * from the open txg will not leak into this write.
+		 *
+		 * NOTE: this copy does not need to be made for objects only
+		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
+		 * or if there is no actual write involved (bonus blocks).
+		 */
+		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
+		    db->db_blkid != DB_BONUS_BLKID) {
+			if (refcount_count(&db->db_holds) > 1 &&
+			    *data == db->db_buf) {
+				*data = arc_buf_alloc(
+				    db->db_dnode->dn_objset->os_spa, blksz, db);
+				bcopy(db->db.db_data, (*data)->b_data, blksz);
+			}
+			db->db_data_pending = *data;
+		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+			/*
+			 * Private object buffers are released here rather
+			 * than in dbuf_dirty() since they are only modified
+			 * in the syncing context and we don't want the
+			 * overhead of making multiple copies of the data.
+			 */
+			arc_release(db->db_buf, db);
+		}
+	} else {
+		data = &db->db_buf;
+		if (*data == NULL) {
+			/*
+			 * This can happen if we dirty and then free
+			 * the level-0 data blocks in the same txg. So
+			 * this indirect remains unchanged.
+			 */
+			if (db->db_dirtied == txg)
+				db->db_dirtied = 0;
+			ASSERT(db->db_dirtycnt > 0);
+			db->db_dirtycnt -= 1;
+			mutex_exit(&db->db_mtx);
+			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+			return;
+		}
+		blksz = db->db.db_size;
+		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
+	}
+
+	ASSERT(*data != NULL);
+
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/*
+		 * Simply copy the bonus data into the dnode.  It will
+		 * be written out when the dnode is synced (and it will
+		 * be synced, since it must have been dirty for dbuf_sync
+		 * to be called).  The bonus data will be byte swapped
+		 * in dnode_byteswap.
+		 */
+		/*
+		 * Use dn_phys->dn_bonuslen since db.db_size is the length
+		 * of the bonus buffer in the open transaction rather than
+		 * the syncing transaction.
+		 */
+		ASSERT3U(db->db_level, ==, 0);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
+		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
+		    dn->dn_phys->dn_bonuslen);
+		if (*data != db->db_buf)
+			arc_buf_free(*data, db);
+		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+		/*
+		 * This indirect buffer was marked dirty, but
+		 * never modified (if it had been modified, then
+		 * we would have released the buffer).  There is
+		 * no reason to write anything.
+		 */
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		return;
+	} else if (db->db_blkptr == NULL &&
+	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
+	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
+		/*
+		 * This buffer was allocated at a time when there was
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mis-match).
+		 */
+		ASSERT(db->db_blkptr == NULL);
+		ASSERT(db->db_parent == NULL);
+		db->db_parent = dn->dn_dbuf;
+		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	} else if (db->db_blkptr == NULL) {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_exit(&db->db_mtx);
+		ASSERT(dn->dn_phys->dn_nlevels > 1);
+		if (parent == NULL) {
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			(void) dbuf_hold_impl(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FALSE, NULL, &parent);
+			rw_exit(&dn->dn_struct_rwlock);
+			dbuf_add_ref(parent, db);
+			db->db_parent = parent;
+			dbuf_rele(parent);
+		}
+		dbuf_read(parent);
+	} else {
+		mutex_exit(&db->db_mtx);
+	}
+
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+
+	if (db->db_parent != dn->dn_dbuf) {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		mutex_enter(&db->db_mtx);
+		ASSERT(db->db_level == parent->db_level-1);
+		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
+		/*
+		 * We may have read this block after we dirtied it,
+		 * so never released it from the cache.
+		 */
+		arc_release(parent->db_buf, parent);
+
+		db->db_blkptr = (blkptr_t *)parent->db.db_data +
+		    (db->db_blkid & ((1ULL << epbs) - 1));
+		dbuf_verify(db);
+		mutex_exit(&db->db_mtx);
+	}
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+#ifdef ZFS_DEBUG
+	if (db->db_parent == dn->dn_dbuf) {
+		/*
+		 * We don't need to dnode_setdirty(dn) because if we got
+		 * here then the parent is already dirty.
+		 */
+		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+		ASSERT3P(db->db_blkptr, ==,
+		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+	}
+#endif
+	if (db->db_level == 0 &&
+	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
+		int old_size = BP_GET_ASIZE(db->db_blkptr);
+		int new_size = BP_GET_ASIZE(*bpp);
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		dnode_diduse_space(dn, new_size-old_size);
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
+		if (!BP_IS_HOLE(db->db_blkptr))
+			dsl_dataset_block_kill(os->os_dsl_dataset,
+			    db->db_blkptr, os->os_synctx);
+
+		mutex_enter(&db->db_mtx);
+		*db->db_blkptr = **bpp;
+		kmem_free(*bpp, sizeof (blkptr_t));
+		*bpp = NULL;
+
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		cv_broadcast(&db->db_changed);
+
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+	} else {
+		int checksum, compress;
+
+		if (db->db_level > 0) {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+			compress = ZIO_COMPRESS_LZJB;
+		} else {
+			/*
+			 * Allow dnode settings to override objset settings,
+			 * except for metadata checksums.
+			 */
+			if (dmu_ot[dn->dn_type].ot_metadata) {
+				checksum = os->os_md_checksum;
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_md_compress);
+			} else {
+				checksum = zio_checksum_select(dn->dn_checksum,
+				    os->os_checksum);
+				compress = zio_compress_select(dn->dn_compress,
+				    os->os_compress);
+			}
+		}
+#ifdef ZFS_DEBUG
+		if (db->db_parent) {
+			ASSERT(list_link_active(
+			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+			ASSERT(db->db_parent == dn->dn_dbuf ||
+			    db->db_parent->db_level > 0);
+			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
+			    db->db_level > 0)
+				ASSERT(*data == db->db_buf);
+		}
+#endif
+		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+		    db->db_blkptr, *data, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
+		/*
+		 * We can't access db after arc_write, since it could finish
+		 * and be freed, and we have no locks on it.
+		 */
+	}
+}
+
+/*
+ * Argument for the dbuf_do_born()/dbuf_do_kill() taskq callbacks.  It is
+ * allocated and filled in by dbuf_write_done() and freed by the callback
+ * that receives it.
+ */
+struct dbuf_arg {
+	objset_impl_t *os;	/* supplies os_dsl_dataset and os_synctx */
+	blkptr_t bp;		/* by-value copy of the block pointer */
+};
+
+/*
+ * Taskq callback: report the block pointer carried in arg (a struct
+ * dbuf_arg) as born to the dataset of its objset, then free arg.
+ */
+static void
+dbuf_do_born(void *arg)
+{
+	struct dbuf_arg *born = arg;
+	objset_impl_t *osi = born->os;
+
+	dsl_dataset_block_born(osi->os_dsl_dataset, &born->bp,
+	    osi->os_synctx);
+	kmem_free(born, sizeof (*born));
+}
+
+/*
+ * Taskq callback: report the block pointer carried in arg (a struct
+ * dbuf_arg) as killed to the dataset of its objset, then free arg.
+ */
+static void
+dbuf_do_kill(void *arg)
+{
+	struct dbuf_arg *dead = arg;
+	objset_impl_t *osi = dead->os;
+
+	dsl_dataset_block_kill(osi->os_dsl_dataset, &dead->bp,
+	    osi->os_synctx);
+	kmem_free(dead, sizeof (*dead));
+}
+
+/*
+ * Completion callback passed to arc_write() when a dirty dbuf is written.
+ *
+ *	zio	the completed write; io_txg, io_bp and io_bp_orig are used
+ *	buf	unused
+ *	vdb	the dmu_buf_impl_t that was written
+ *
+ * Charges the change in allocated size to the dnode, retires the dirty
+ * state kept for this txg, computes the fill count and stamps it (with
+ * the type and level) into the block pointer, queues dataset born/kill
+ * accounting if the block moved, and finally drops the reference that
+ * was tagged with the txg.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = zio->io_txg;
+	uint64_t fill = 0;
+	int i;
+	int old_size, new_size;
+
+	ASSERT3U(zio->io_error, ==, 0);
+
+	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
+
+	/* account for the difference between the old and new allocated size */
+	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
+	new_size = BP_GET_ASIZE(zio->io_bp);
+
+	dnode_diduse_space(dn, new_size-old_size);
+
+	mutex_enter(&db->db_mtx);
+
+	if (db->db_dirtied == txg)
+		db->db_dirtied = 0;
+
+	if (db->db_level == 0) {
+		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+		/* free this txg's copy of the data unless it is the live buf */
+		if (*old != db->db_buf)
+			arc_buf_free(*old, db);
+		*old = NULL;
+		db->db_data_pending = NULL;
+
+		/* grow dn_maxblkid if a real block was written past it */
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    !BP_IS_HOLE(db->db_blkptr))
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			/* a block of dnodes: fill is the number in use */
+			dnode_phys_t *dnp = db->db.db_data;
+			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+			    i--, dnp++) {
+				if (dnp->dn_type != DMU_OT_NONE)
+					fill++;
+			}
+		} else {
+			/* any other level-0 block counts as one if not a hole */
+			if (!BP_IS_HOLE(db->db_blkptr))
+				fill = 1;
+		}
+	} else {
+		/* indirect block: fill is the sum over its non-hole children */
+		blkptr_t *bp = db->db.db_data;
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+		}
+		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			ASSERT3U(BP_GET_LSIZE(bp), ==,
+			    db->db_level == 1 ? dn->dn_datablksz :
+			    (1<<dn->dn_phys->dn_indblkshift));
+			fill += bp->blk_fill;
+		}
+	}
+
+	if (!BP_IS_HOLE(db->db_blkptr)) {
+		db->db_blkptr->blk_fill = fill;
+		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+		BP_SET_LEVEL(db->db_blkptr, db->db_level);
+	} else {
+		ASSERT3U(fill, ==, 0);
+		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+	}
+
+	dprintf_dbuf_bp(db, db->db_blkptr,
+	    "wrote %llu bytes to blkptr:", zio->io_size);
+
+	ASSERT(db->db_parent == NULL ||
+	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
+	/* wake anyone waiting on this dbuf's state and retire the dirty count */
+	cv_broadcast(&db->db_changed);
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+	mutex_exit(&db->db_mtx);
+
+	/* We must do this after we've set the bp's type and level */
+	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+	    BP_IDENTITY(&zio->io_bp_orig))) {
+		/*
+		 * The block landed at a new location: hand the born (and,
+		 * if the original was not a hole, the kill) accounting to
+		 * dbuf_tq.  Each task gets its own copy of the bp.
+		 */
+		struct dbuf_arg *da;
+		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+		da->os = os;
+		da->bp = *zio->io_bp;
+		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
+		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
+			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
+			da->os = os;
+			da->bp = zio->io_bp_orig;
+			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
+		}
+	}
+
+	/* drop the reference that was tagged with this txg */
+	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
new file mode 100644
index 000000000000..14fab6d42073
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -0,0 +1,1761 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+/*
+ * Per-object-type properties, indexed by DMU object type (code in this
+ * file indexes it with dn_type).  Each entry gives the byteswap routine
+ * for the type's data (ot_byteswap), a TRUE/FALSE flag (presumably
+ * ot_metadata, which is consulted when choosing checksum and compression
+ * settings), and a human-readable name.  The entry order must match the
+ * numbering of the object type enumeration.
+ */
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
+	{	zap_byteswap,		TRUE,	"object directory"	},
+	{	byteswap_uint64_array,	TRUE,	"object array"		},
+	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
+	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
+	{	byteswap_uint64_array,	TRUE,	"bplist"		},
+	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
+	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
+	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
+	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
+	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
+	{	zap_byteswap,		TRUE,	"DSL directory child map"},
+	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
+	{	zap_byteswap,		TRUE,	"DSL props"		},
+	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
+	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
+	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
+	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
+	{	zap_byteswap,		TRUE,	"ZFS directory"		},
+	{	zap_byteswap,		TRUE,	"ZFS master node"	},
+	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
+	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
+	{	zap_byteswap,		TRUE,	"zvol prop"		},
+	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
+	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
+	{	zap_byteswap,		TRUE,	"other ZAP"		},
+};
+
+/*
+ * Read in every buffer of dbp[0 .. numbufs-1] that is still uncached and
+ * wait for any read or fill already in progress on the others.  All reads
+ * are issued under a single root zio, which is created with canfail set
+ * only when DB_RF_CANFAIL is present in flags.
+ *
+ * Returns 0 on success, or the error reported by zio_wait().
+ */
+static int
+dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
+{
+	dnode_t *dn;
+	zio_t *rootzio;
+	uint64_t rd_sz;
+	int i, error;
+
+	if (numbufs == 0)
+		return (0);
+
+	rd_sz = numbufs * dbp[0]->db.db_size;
+	ASSERT(rd_sz <= DMU_MAX_ACCESS);
+
+	dn = dbp[0]->db_dnode;
+	rootzio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+	    (flags & DB_RF_CANFAIL) ? 1 : 0);
+
+	/* don't prefetch if the read is large */
+	if (rd_sz >= zfetch_array_rd_sz)
+		flags |= DB_RF_NOPREFETCH;
+
+	/* start asynchronous reads for the buffers that are not cached */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_impl_t *db = dbp[i];
+
+		if (db->db_state == DB_UNCACHED)
+			dbuf_read_impl(db, rootzio, flags);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	error = zio_wait(rootzio);
+	if (error)
+		return (error);
+
+	/* wait out reads and fills that somebody else started */
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_impl_t *db = dbp[i];
+
+		mutex_enter(&db->db_mtx);
+		while (db->db_state == DB_READ || db->db_state == DB_FILL)
+			cv_wait(&db->db_changed, &db->db_mtx);
+		ASSERT(dbp[i]->db_state == DB_CACHED);
+		mutex_exit(&db->db_mtx);
+	}
+
+	return (0);
+}
+
+/*
+ * Read an array of held buffers with DB_RF_MUST_SUCCEED; a failure is
+ * only caught by the ASSERT.
+ */
+void
+dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	int err;
+
+	err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake, numbufs,
+	    DB_RF_MUST_SUCCEED);
+	ASSERT(err == 0);
+}
+
+/*
+ * Read an array of held buffers with DB_RF_CANFAIL and hand the result
+ * of dmu_buf_read_array_impl() back to the caller.
+ */
+int
+dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
+{
+	int error;
+
+	error = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp_fake, numbufs,
+	    DB_RF_CANFAIL);
+	return (error);
+}
+
+/*
+ * Hold the buffer of 'object' that contains byte 'offset' and return it.
+ * The dnode is held only for the duration of the lookup.
+ * NOTE(review): the dnode_hold() result is used without a NULL check, so
+ * the object is assumed to exist -- confirm with callers.
+ */
+dmu_buf_t *
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	uint64_t blkid = dbuf_whichblock(dn, offset);
+	dmu_buf_impl_t *held;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	held = dbuf_hold(dn, blkid);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+	return (&held->db);
+}
+
+/*
+ * Hold the bonus buffer of 'object' using a NULL tag; see
+ * dmu_bonus_hold_tag() for the return value.
+ */
+dmu_buf_t *
+dmu_bonus_hold(objset_t *os, uint64_t object)
+{
+	dmu_buf_t *bonus = dmu_bonus_hold_tag(os, object, NULL);
+
+	return (bonus);
+}
+
+/*
+ * Return DN_MAX_BONUSLEN, the largest bonus buffer length.
+ */
+int
+dmu_bonus_max(void)
+{
+	return (DN_MAX_BONUSLEN);
+}
+
+/*
+ * Hold the bonus buffer of 'object' on behalf of 'tag'.  Returns the held
+ * bonus buffer, or NULL when dnode_hold() finds no such object.
+ */
+dmu_buf_t *
+dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+{
+	dmu_buf_impl_t *bonus;
+	dnode_t *dn;
+
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn == NULL)
+		return (NULL);
+
+	bonus = dbuf_hold_bonus(dn, tag);
+
+	/*
+	 * XXX - hack: for a ZAP object also hold block 0 and remember it
+	 * in dn_db0; dmu_buf_rele() and dmu_buf_rele_tag() release it.
+	 */
+	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		dn->dn_db0 = dbuf_hold(dn, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	dnode_rele(dn, FTAG);
+	return (&bonus->db);
+}
+
+/*
+ * Hold every data buffer of dn that overlaps [offset, offset + length).
+ *
+ * Returns a kmem_alloc()ed array of the held buffers and stores its
+ * length through numbufsp (when non-NULL).  A zero length yields NULL
+ * and a count of 0.  The array is released and freed by
+ * dmu_buf_rele_array().
+ */
+static dmu_buf_t **
+dbuf_hold_array(dnode_t *dn,
+    uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dmu_buf_t **dbp;
+	uint64_t first, nblks, n;
+
+	if (length == 0) {
+		if (numbufsp)
+			*numbufsp = 0;
+		return (NULL);
+	}
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		/* count the blocks between the enclosing block boundaries */
+		int blkshift = dn->dn_datablkshift;
+		uint64_t blksz = 1ULL << blkshift;
+		uint64_t start = P2ALIGN(offset, blksz);
+		uint64_t end = P2ROUNDUP(offset + length, blksz);
+
+		nblks = (end - start) >> blkshift;
+	} else {
+		/* no block shift: the whole range must fit in one block */
+		ASSERT3U(offset + length, <=, dn->dn_datablksz);
+		nblks = 1;
+	}
+	dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+	first = dbuf_whichblock(dn, offset);
+	for (n = 0; n < nblks; n++)
+		dbp[n] = &dbuf_hold(dn, first + n)->db;
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (numbufsp)
+		*numbufsp = nblks;
+	return (dbp);
+}
+
+/*
+ * Hold the buffers of 'object' that cover [offset, offset + length) and
+ * return them as an array whose length is stored through numbufsp (when
+ * non-NULL).  A zero length yields NULL and a count of 0.  Release the
+ * result with dmu_buf_rele_array().
+ */
+dmu_buf_t **
+dmu_buf_hold_array(objset_t *os, uint64_t object,
+	uint64_t offset, uint64_t length, int *numbufsp)
+{
+	dmu_buf_t **dbp = NULL;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	if (length != 0) {
+		dnode_t *dn = dnode_hold(os->os, object, FTAG);
+
+		dbp = dbuf_hold_array(dn, offset, length, numbufsp);
+		dnode_rele(dn, FTAG);
+	} else if (numbufsp) {
+		*numbufsp = 0;
+	}
+
+	return (dbp);
+}
+
+/*
+ * Public wrapper: add a reference tagged 'tag' to the dbuf behind dbuf.
+ */
+void
+dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_add_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Public wrapper: drop the reference tagged 'tag' from the dbuf behind
+ * dbuf.
+ */
+void
+dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
+{
+	dbuf_remove_ref((dmu_buf_impl_t *)dbuf, tag);
+}
+
+/*
+ * Release a held buffer.  For the bonus buffer of a ZAP object this also
+ * releases the block-0 hold recorded in dn_db0 by dmu_bonus_hold_tag().
+ */
+void
+dmu_buf_rele(dmu_buf_t *dbuf_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		dnode_t *dn = db->db_dnode;
+
+		if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap)
+			dbuf_rele(dn->dn_db0);
+	}
+	dbuf_rele(db);
+}
+
+/*
+ * Release a buffer that was held with 'tag'.  For the bonus buffer of a
+ * ZAP object this also releases the block-0 hold recorded in dn_db0 by
+ * dmu_bonus_hold_tag().
+ */
+void
+dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+
+	/* XXX - hack: hold the first block if this is a ZAP object */
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		dnode_t *dn = db->db_dnode;
+
+		if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap)
+			dbuf_rele(dn->dn_db0);
+	}
+	dbuf_remove_ref(db, tag);
+}
+
+/*
+ * Release every buffer in an array obtained from dmu_buf_hold_array()
+ * and free the array itself.  Nothing is done when numbufs is 0.
+ */
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
+{
+	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+	int n;
+
+	if (numbufs == 0)
+		return;
+
+	ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
+
+	for (n = 0; n < numbufs; n++) {
+		dmu_buf_impl_t *held = dbp[n];
+
+		dbuf_rele(held);
+	}
+
+	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Start prefetching part of an object.
+ *
+ * A len of 0 asks for the object's bonus buffer: the block of the meta
+ * dnode that holds the object's dnode is prefetched (nothing is done for
+ * object 0 or an object number >= DN_MAX_OBJECT).  Otherwise the data
+ * blocks covering [offset, offset + len) are prefetched; nothing is done
+ * if the object cannot be held.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	int nblks, i;
+
+	if (len == 0) {
+		/* they're interested in the bonus buffer */
+		if (object == 0 || object >= DN_MAX_OBJECT)
+			return;
+
+		dn = os->os->os_meta_dnode;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, blkid);
+		rw_exit(&dn->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * XXX - Note, if the dnode for the requested object is not
+	 * already cached, we will do a *synchronous* read in the
+	 * dnode_hold() call.  The same is true for any indirects.
+	 */
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn == NULL)
+		return;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift == 0) {
+		/* no block shift: one block, if offset falls inside it */
+		nblks = (offset < dn->dn_datablksz);
+	} else {
+		int blkshift = dn->dn_datablkshift;
+
+		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
+		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+	}
+
+	if (nblks != 0) {
+		blkid = dbuf_whichblock(dn, offset);
+		i = 0;
+		while (i < nblks) {
+			dbuf_prefetch(dn, blkid + i);
+			i++;
+		}
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Free the byte range [offset, offset + size) of 'object' in tx by
+ * passing it to dnode_free_range().  The ASSERT admits a size of -1ULL
+ * as a special value in addition to ranges that do not wrap.
+ */
+void
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	dn = dnode_hold(os->os, object, FTAG);
+
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Copy size bytes starting at offset of 'object' into buf.
+ *
+ * When the dnode has no data block shift, the request is clipped to
+ * dn_datablksz and the rest of buf is zero-filled.  The remainder is read
+ * in pieces of at most DMU_MAX_ACCESS / 2 bytes, each piece being held,
+ * read with 'flags', copied out and released.
+ *
+ * Returns 0 on success or the error from dmu_buf_read_array_impl().
+ */
+static int
+dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
+{
+	char *dst = buf;
+	dnode_t *dn;
+
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn->dn_datablkshift == 0) {
+		int newsz;
+
+		if (offset > dn->dn_datablksz)
+			newsz = 0;
+		else
+			newsz = MIN(size, dn->dn_datablksz - offset);
+
+		/* whatever lies past the block reads back as zeros */
+		bzero(dst + newsz, size - newsz);
+		size = newsz;
+	}
+	dnode_rele(dn, FTAG);
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		dmu_buf_t **dbp;
+		int numbufs, i, err;
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+		 */
+		dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
+		err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
+		    flags);
+		if (err) {
+			dmu_buf_rele_array(dbp, numbufs);
+			return (err);
+		}
+
+		for (i = 0; i < numbufs; i++) {
+			dmu_buf_t *db = dbp[i];
+			int bufoff, tocpy;
+
+			ASSERT(size > 0);
+
+			/* copy the part of this buffer that is wanted */
+			bufoff = offset - db->db_offset;
+			tocpy = (int)MIN(db->db_size - bufoff, size);
+			bcopy((char *)db->db_data + bufoff, dst, tocpy);
+
+			offset += tocpy;
+			size -= tocpy;
+			dst += tocpy;
+		}
+		dmu_buf_rele_array(dbp, numbufs);
+	}
+	return (0);
+}
+
+/*
+ * Read size bytes at offset of 'object' into buf with DB_RF_MUST_SUCCEED;
+ * a failure is only caught by the ASSERT.
+ */
+void
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf)
+{
+	int err = dmu_read_impl(os, object, offset, size, buf,
+	    DB_RF_MUST_SUCCEED);
+
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * Read size bytes at offset of 'object' into buf with DB_RF_CANFAIL and
+ * hand the result of dmu_read_impl() back to the caller.
+ */
+int
+dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf)
+{
+	int error;
+
+	error = dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL);
+	return (error);
+}
+
+/*
+ * Copy size bytes from buf into 'object' at offset as part of tx.
+ *
+ * A buffer that is overwritten in its entirety is bracketed by
+ * dmu_buf_will_fill()/dmu_buf_fill_done(); a partially written buffer is
+ * dirtied with dmu_buf_will_dirty() instead.
+ */
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    const void *buf, dmu_tx_t *tx)
+{
+	const char *src = buf;
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_t *db = dbp[i];
+		int bufoff, tocpy, whole;
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+		whole = (tocpy == db->db_size);
+
+		/* only the first and last buffers may be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (whole)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		bcopy(src, (char *)db->db_data + bufoff, tocpy);
+
+		if (whole)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+		src += tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+}
+
+#ifdef _KERNEL
+/*
+ * Kernel-only variant of dmu_write() that takes its data from a uio.
+ *
+ * Copying stops at the first uiomove() error; a buffer that was being
+ * filled is still passed to dmu_buf_fill_done() before the loop ends.
+ * Returns 0 or the uiomove() error.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    uio_t *uio, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+	int error = 0;
+
+	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_t *db = dbp[i];
+		int bufoff, tocpy, whole;
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+		whole = (tocpy == db->db_size);
+
+		/* only the first and last buffers may be partial */
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (whole)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * XXX uiomove could block forever (eg. nfs-backed
+		 * pages).  There needs to be a uiolockdown() function
+		 * to lock the pages in memory, so that uiomove won't
+		 * block.
+		 */
+		error = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_WRITE, uio);
+
+		if (whole)
+			dmu_buf_fill_done(db, tx);
+
+		if (error != 0)
+			break;
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs);
+	return (error);
+}
+#endif
+
+/*
+ * State shared by dmu_sendbackup() and the dump_*() helpers while a
+ * backup stream is being written.
+ */
+struct backuparg {
+	dmu_replay_record_t *drr;	/* scratch record, refilled per dump */
+	vnode_t *vp;			/* vnode the stream is written to */
+	objset_t *os;			/* objset being sent (tosnap) */
+	int err;			/* result of the last vn_rdwr() */
+};
+
+/*
+ * Append len bytes from buf to the backup vnode.  len must be a multiple
+ * of 8.  The vn_rdwr() result is recorded in ba->err and also returned.
+ */
+static int
+dump_bytes(struct backuparg *ba, void *buf, int len)
+{
+	ssize_t resid; /* have to get resid to get detailed errno */
+	int error;
+
+	/* Need to compute checksum here */
+	ASSERT3U(len % 8, ==, 0);
+
+	error = vn_rdwr(UIO_WRITE, ba->vp, (caddr_t)buf, len, 0,
+	    UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
+	ba->err = error;
+	return (error);
+}
+
+/*
+ * Emit a DRR_FREE record covering 'length' bytes of 'object' starting at
+ * 'offset'.  Returns 0, or EINTR if the record could not be written (the
+ * underlying error is left in ba->err).
+ */
+static int
+dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+    uint64_t length)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	/* write a FREE record */
+	bzero(drr, sizeof (*drr));
+	drr->drr_type = DRR_FREE;
+	drr->drr_u.drr_free.drr_object = object;
+	drr->drr_u.drr_free.drr_offset = offset;
+	drr->drr_u.drr_free.drr_length = length;
+
+	return (dump_bytes(ba, drr, sizeof (*drr)) ? EINTR : 0);
+}
+
+/*
+ * Emit a DRR_WRITE record for blksz bytes of 'object' at 'offset',
+ * followed by the data itself.  Returns 0, or EINTR if either write
+ * failed (the underlying error is left in ba->err).
+ */
+static int
+dump_data(struct backuparg *ba, dmu_object_type_t type,
+    uint64_t object, uint64_t offset, int blksz, void *data)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	/* write a DATA record */
+	bzero(drr, sizeof (*drr));
+	drr->drr_type = DRR_WRITE;
+	drr->drr_u.drr_write.drr_object = object;
+	drr->drr_u.drr_write.drr_type = type;
+	drr->drr_u.drr_write.drr_offset = offset;
+	drr->drr_u.drr_write.drr_length = blksz;
+
+	/* header first; the payload is only attempted if that succeeded */
+	if (dump_bytes(ba, drr, sizeof (*drr)) ||
+	    dump_bytes(ba, data, blksz))
+		return (EINTR);
+	return (0);
+}
+
+/*
+ * Emit a DRR_FREEOBJECTS record for numobjs objects starting at firstobj.
+ * Returns 0, or EINTR if the record could not be written (the underlying
+ * error is left in ba->err).
+ */
+static int
+dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+{
+	dmu_replay_record_t *drr = ba->drr;
+
+	/* write a FREEOBJECTS record */
+	bzero(drr, sizeof (*drr));
+	drr->drr_type = DRR_FREEOBJECTS;
+	drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
+	drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
+
+	return (dump_bytes(ba, drr, sizeof (*drr)) ? EINTR : 0);
+}
+
+/*
+ * Emit the stream records that describe one dnode.
+ *
+ * A NULL or unallocated (DMU_OT_NONE) dnode becomes a FREEOBJECTS record
+ * for that single object.  Otherwise an OBJECT record is written,
+ * followed by the bonus data padded to a multiple of 8 bytes and a FREE
+ * record for everything beyond the last block (dn_maxblkid).
+ *
+ * Returns 0, or EINTR if any write failed.
+ */
+static int
+dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+{
+	dmu_replay_record_t *drr = ba->drr;
+	int blksz;
+
+	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+		return (dump_freeobjects(ba, object, 1));
+
+	blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+
+	/* write an OBJECT record */
+	bzero(drr, sizeof (*drr));
+	drr->drr_type = DRR_OBJECT;
+	drr->drr_u.drr_object.drr_object = object;
+	drr->drr_u.drr_object.drr_type = dnp->dn_type;
+	drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
+	drr->drr_u.drr_object.drr_blksz = blksz;
+	drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
+	drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
+	drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
+
+	if (dump_bytes(ba, drr, sizeof (*drr)))
+		return (EINTR);
+
+	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
+		return (EINTR);
+
+	/* free anything past the end of the file */
+	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * blksz, -1ULL))
+		return (EINTR);
+
+	return (ba->err ? EINTR : 0);
+}
+
+/*
+ * Number of bytes of an object covered by a single block pointer at the
+ * given level of the dnode *dnp: the data block size in bytes, scaled by
+ * the number of block pointers per indirect block once for each level.
+ * Both arguments are parenthesized so that any pointer or integer
+ * expression may be passed.
+ */
+#define	BP_SPAN(dnp, level) \
+	(((uint64_t)(dnp)->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * ((dnp)->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/*
+ * Traversal callback passed to traverse_dsl_dataset() by dmu_sendbackup().
+ * Turns one visited block into backup stream records:
+ *
+ *	hole in object 0		FREEOBJECTS for the dnodes it spans
+ *	hole in any other object	FREE for the byte range it spans
+ *	level-0 block of dnodes		one dump_dnode() per dnode in it
+ *	other level-0 block		WRITE record plus the block's data
+ *					(objset blocks are skipped)
+ *
+ * Anything else produces no output.  arg is the struct backuparg.
+ * Returns 0, or EINTR if a signal is pending or a write failed.
+ */
+static int
+backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	struct backuparg *ba = arg;
+	uint64_t object = bc->bc_bookmark.zb_object;
+	int level = bc->bc_bookmark.zb_level;
+	uint64_t blkid = bc->bc_bookmark.zb_blkid;
+	/* a block pointer with a zero birth txg is treated as a hole */
+	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
+	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	void *data = bc->bc_data;
+	int err = 0;
+
+	/* let a pending signal abort the backup */
+	if (issig(JUSTLOOKING))
+		return (EINTR);
+
+	ASSERT(data || bp == NULL);
+
+	if (bp == NULL && object == 0) {
+		/* hole in object 0: the dnodes it would hold are free */
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
+		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
+	} else if (bp == NULL) {
+		/* hole in a regular object: free the range it spans */
+		uint64_t span = BP_SPAN(bc->bc_dnode, level);
+		err = dump_free(ba, object, blkid * span, span);
+	} else if (data && level == 0 && type == DMU_OT_DNODE) {
+		/* a block of dnodes: describe each one, stop on error */
+		dnode_phys_t *blk = data;
+		int i;
+		int blksz = BP_GET_LSIZE(bp);
+
+		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			uint64_t dnobj =
+			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+			err = dump_dnode(ba, dnobj, blk+i);
+			if (err)
+				break;
+		}
+	} else if (level == 0 &&
+	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
+		int blksz = BP_GET_LSIZE(bp);
+		if (data == NULL) {
+			/* no data came with the callback: read the block */
+			arc_buf_t *abuf;
+
+			(void) arc_read(NULL, spa, bp,
+			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
+			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
+			    ARC_WAIT);
+
+			if (abuf) {
+				err = dump_data(ba, type, object, blkid * blksz,
+				    blksz, abuf->b_data);
+				arc_buf_free(abuf, &abuf);
+			}
+		} else {
+			err = dump_data(ba, type, object, blkid * blksz,
+			    blksz, data);
+		}
+	}
+
+	ASSERT(err == 0 || err == EINTR);
+	return (err);
+}
+
+/*
+ * Write a backup stream of the snapshot 'tosnap' to vp.
+ *
+ * If 'fromsnap' is non-NULL the stream is incremental: only blocks born
+ * after fromsnap's creation txg are traversed.  The stream consists of a
+ * BEGIN record, the records produced by backup_cb() during the dataset
+ * traversal, and an END record.
+ *
+ * Returns 0 on success; EINVAL if tosnap is not a snapshot; EXDEV if
+ * fromsnap is not an earlier snapshot of the same filesystem; otherwise
+ * the vn_rdwr() or traversal error.  The scratch record is freed on
+ * every exit path.
+ */
+int
+dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
+{
+	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
+	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
+	dmu_replay_record_t *drr;
+	struct backuparg ba;
+	int err;
+
+	/* tosnap must be a snapshot */
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return (EINVAL);
+
+	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
+	if (fromds && (ds->ds_dir != fromds->ds_dir ||
+	    fromds->ds_phys->ds_creation_txg >=
+	    ds->ds_phys->ds_creation_txg))
+		return (EXDEV);
+
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
+	drr->drr_u.drr_begin.drr_creation_time =
+	    ds->ds_phys->ds_creation_time;
+	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
+	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
+	if (fromds)
+		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
+	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+
+	ba.drr = drr;
+	ba.vp = vp;
+	ba.os = tosnap;
+	ba.err = 0;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
+		err = ba.err;
+		goto out;
+	}
+
+	err = traverse_dsl_dataset(ds,
+	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
+	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
+	    backup_cb, &ba);
+
+	if (err) {
+		/* prefer the detailed write error over the generic EINTR */
+		if (err == EINTR && ba.err)
+			err = ba.err;
+		goto out;
+	}
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_END;
+
+	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
+		err = ba.err;
+
+out:
+	/* single exit so the scratch record is never leaked */
+	kmem_free(drr, sizeof (dmu_replay_record_t));
+	return (err);
+}
+
+/*
+ * State used while reading a backup stream, including the buffer that
+ * restore_read() refills from vp.
+ */
+struct restorearg {
+	int err;	/* result of the last vn_rdwr() */
+	int byteswap;	/* NOTE(review): presumably set when the stream */
+			/* needs byte swapping -- confirm against users */
+	vnode_t *vp;	/* vnode the stream is read from */
+	char *buf;	/* read buffer */
+	uint64_t voff;	/* file offset passed to vn_rdwr() */
+	int buflen; /* number of valid bytes in buf */
+	int bufoff; /* next offset to read */
+	int bufsize; /* amount of memory allocated for buf */
+};
+
+/*
+ * Check that the incremental stream described by arg (a struct drr_begin)
+ * can be applied to dd and, if so, mark dd's head dataset as restoring.
+ *
+ * Returns 0 on success, or:
+ *	EINVAL	dd has no head dataset
+ *	EBUSY	the head dataset or its latest snapshot could not be opened
+ *	ENODEV	there is no previous snapshot, or its guid is not the
+ *		stream's drr_fromguid
+ *	ETXTBSY	the head dataset changed since its most recent snapshot
+ *	EEXIST	drr_toname has no '@', or the snapshot name already exists
+ *	other	error from zap_lookup()
+ *
+ * Every failure leaves through 'die', which closes whatever was opened.
+ */
+static int
+replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	dsl_dataset_t *ds = NULL;
+	dsl_dataset_t *ds_prev = NULL;
+	const char *snapname;
+	int err = EINVAL;
+	uint64_t val;
+
+	/* this must be a filesystem */
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		goto die;
+
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_EXCLUSIVE, FTAG);
+
+	if (ds == NULL) {
+		err = EBUSY;
+		goto die;
+	}
+
+	/* must already be a snapshot of this fs */
+	if (ds->ds_phys->ds_prev_snap_obj == 0) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* most recent snapshot must match fromguid */
+	ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+	    ds->ds_phys->ds_prev_snap_obj, NULL,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+	if (ds_prev == NULL) {
+		/* same handling as a failed open of the head dataset */
+		err = EBUSY;
+		goto die;
+	}
+	if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
+		err = ENODEV;
+		goto die;
+	}
+
+	/* must not have any changes since most recent snapshot */
+	if (ds->ds_phys->ds_bp.blk_birth >
+	    ds_prev->ds_phys->ds_creation_txg) {
+		err = ETXTBSY;
+		goto die;
+	}
+
+	/* new snapshot name must not exist */
+	snapname = strrchr(drrb->drr_toname, '@');
+	if (snapname == NULL) {
+		err = EEXIST;
+		goto die;
+	}
+	snapname++;
+	err = zap_lookup(dd->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
+	if (err != ENOENT) {
+		/* a successful lookup means the name is already taken */
+		if (err == 0)
+			err = EEXIST;
+		goto die;
+	}
+
+	dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+
+	/* The point of no (unsuccessful) return. */
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+
+die:
+	if (ds_prev)
+		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
+	if (ds)
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (err);
+}
+
+/*
+ * Create the filesystem named by a full stream's BEGIN record (arg is a
+ * struct drr_begin) under dd, create its objset, and mark the new dataset
+ * as restoring.
+ *
+ * drr_toname must have the form "<path>/<fs>@<snap>"; EINVAL is returned
+ * if either the '@' or a '/' before it is missing.  Otherwise the result
+ * is 0 or the error from dsl_dataset_create_sync().
+ */
+static int
+replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *fsfullname, *fslastname, *cp;
+	dsl_dataset_t *ds;
+
+	fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	(void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN);
+	/*
+	 * strncpy() leaves the copy unterminated when the source holds no
+	 * NUL within MAXNAMELEN bytes; terminate it explicitly so that the
+	 * string searches below cannot run off the end of the buffer.
+	 */
+	fsfullname[MAXNAMELEN - 1] = '\0';
+
+	/* cut the snapshot name off, leaving the filesystem name */
+	cp = strchr(fsfullname, '@');
+	if (cp == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	*cp = '\0';
+	fslastname = strrchr(fsfullname, '/');
+	if (fslastname == NULL) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (EINVAL);
+	}
+	fslastname++;
+
+	err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx);
+	if (err) {
+		kmem_free(fsfullname, MAXNAMELEN);
+		return (err);
+	}
+
+	/* the point of no (unsuccessful) return */
+
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
+	    DS_MODE_EXCLUSIVE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+	kmem_free(fsfullname, MAXNAMELEN);
+
+	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
+	    ds, drrb->drr_type, tx);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = TRUE;
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (0);
+}
+
+/*
+ * Sync task run when the end of a backup stream is reached.
+ *
+ * 'arg' is the stream's struct drr_begin.  Creates the snapshot named
+ * after the '@' in drr_toname, stamps it with the stream's creation time
+ * and guid, and clears ds_restoring on both the new snapshot and the head
+ * dataset of 'dd'.
+ *
+ * Returns EINVAL if drr_toname has no '@', the error from
+ * dsl_dataset_snapshot_sync() if the snapshot cannot be created, and 0
+ * otherwise.
+ */
+static int
+replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct drr_begin *drrb = arg;
+	int err;
+	char *snapname;
+	dsl_dataset_t *ds;
+
+	/* XXX verify that drr_toname is in dd */
+
+	snapname = strchr(drrb->drr_toname, '@');
+	if (snapname == NULL)
+		return (EINVAL);
+	snapname++;
+
+	/* create snapshot */
+	err = dsl_dataset_snapshot_sync(dd, snapname, tx);
+	if (err)
+		return (err);
+
+	/* set snapshot's creation time and guid */
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
+	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
+	ASSERT3U(err, ==, 0);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
+	ds->ds_phys->ds_guid = drrb->drr_toguid;
+	ds->ds_phys->ds_restoring = FALSE;
+
+	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
+
+	/* the head dataset was also marked restoring; clear that flag too */
+	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
+	    NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_restoring = FALSE;
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+
+	return (0);
+}
+
+/*
+ * Return a pointer to the next 'len' bytes of the backup stream.
+ *
+ * Data is served out of ra->buf.  When fewer than 'len' unread bytes
+ * remain, the unread tail is moved to the front of the buffer and the
+ * rest of the buffer is refilled from ra->vp at ra->voff.  The returned
+ * pointer aims into ra->buf, so it is invalidated by the next call.
+ *
+ * 'len' must be a multiple of 8.  On failure NULL is returned and ra->err
+ * is set: to the vn_rdwr() error if the read failed, or to EINVAL if the
+ * request cannot fit in the buffer or the stream ended early.
+ */
+void *
+restore_read(struct restorearg *ra, int len)
+{
+	void *rv;
+
+	/* some things will require 8-byte alignment, so everything must */
+	ASSERT3U(len % 8, ==, 0);
+
+	/*
+	 * A negative request would walk bufoff backwards, and one larger
+	 * than the buffer can never be satisfied.  Lengths come from the
+	 * stream, so reject both instead of trusting them.
+	 */
+	if (len < 0 || len > ra->bufsize) {
+		ra->err = EINVAL;
+		return (NULL);
+	}
+
+	while (ra->buflen - ra->bufoff < len) {
+		ssize_t resid;
+		int leftover = ra->buflen - ra->bufoff;
+
+		/* keep the unread tail, then refill the rest of the buffer */
+		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
+		ra->err = vn_rdwr(UIO_READ, ra->vp,
+		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
+		    ra->voff, UIO_SYSSPACE, FAPPEND,
+		    RLIM_INFINITY, CRED(), &resid);
+
+		/* Need to compute checksum */
+
+		ra->voff += ra->bufsize - leftover - resid;
+		ra->buflen = ra->bufsize - resid;
+		ra->bufoff = 0;
+		/*
+		 * Nothing was read: the stream is truncated.  Only report
+		 * that when the read itself succeeded, so a real I/O error
+		 * is not replaced by EINVAL.
+		 */
+		if (ra->err == 0 && resid == ra->bufsize - leftover)
+			ra->err = EINVAL;
+		if (ra->err)
+			return (NULL);
+	}
+
+	ASSERT3U(ra->bufoff % 8, ==, 0);
+	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
+	rv = ra->buf + ra->bufoff;
+	ra->bufoff += len;
+	return (rv);
+}
+
+/*
+ * Byte-swap a replay record in place.  drr_type is swapped first and the
+ * swapped value then selects which member of drr_u gets its 64-bit and
+ * 32-bit fields swapped.  For any other record type only drr_type is
+ * touched.
+ */
+static void
+backup_byteswap(dmu_replay_record_t *drr)
+{
+#define	SWAP64(lv)	((lv) = BSWAP_64(lv))
+#define	SWAP32(lv)	((lv) = BSWAP_32(lv))
+
+	SWAP32(drr->drr_type);
+
+	switch (drr->drr_type) {
+	case DRR_BEGIN:
+		SWAP64(drr->drr_u.drr_begin.drr_magic);
+		SWAP64(drr->drr_u.drr_begin.drr_version);
+		SWAP64(drr->drr_u.drr_begin.drr_creation_time);
+		SWAP64(drr->drr_u.drr_begin.drr_toguid);
+		SWAP64(drr->drr_u.drr_begin.drr_fromguid);
+		SWAP32(drr->drr_u.drr_begin.drr_type);
+		break;
+
+	case DRR_OBJECT:
+		SWAP64(drr->drr_u.drr_object.drr_object);
+		/* DO64(drr_object.drr_allocation_txg); */
+		SWAP32(drr->drr_u.drr_object.drr_type);
+		SWAP32(drr->drr_u.drr_object.drr_bonustype);
+		SWAP32(drr->drr_u.drr_object.drr_blksz);
+		SWAP32(drr->drr_u.drr_object.drr_bonuslen);
+		break;
+
+	case DRR_FREEOBJECTS:
+		SWAP64(drr->drr_u.drr_freeobjects.drr_firstobj);
+		SWAP64(drr->drr_u.drr_freeobjects.drr_numobjs);
+		break;
+
+	case DRR_WRITE:
+		SWAP64(drr->drr_u.drr_write.drr_object);
+		SWAP64(drr->drr_u.drr_write.drr_offset);
+		SWAP64(drr->drr_u.drr_write.drr_length);
+		SWAP32(drr->drr_u.drr_write.drr_type);
+		break;
+
+	case DRR_FREE:
+		SWAP64(drr->drr_u.drr_free.drr_object);
+		SWAP64(drr->drr_u.drr_free.drr_offset);
+		SWAP64(drr->drr_u.drr_free.drr_length);
+		break;
+
+	case DRR_END:
+		SWAP64(drr->drr_u.drr_end.drr_checksum);
+		break;
+	}
+
+#undef SWAP32
+#undef SWAP64
+}
+
+/*
+ * Apply a DRR_OBJECT record: make object drr_object exist in 'os' with
+ * the type, block size, bonus type and bonus length given in the record,
+ * set its checksum and compression, and, if drr_bonuslen is non-zero,
+ * fill its bonus buffer from the stream.
+ *
+ * An object that does not exist yet is claimed; an existing one is
+ * reclaimed.  Bonus data is read with restore_read(), so any pointer
+ * previously returned by restore_read() is invalid afterwards.
+ *
+ * Returns EINVAL for a malformed record or a failed claim/reclaim, the
+ * dmu_tx_assign() error if the transaction cannot be assigned, ra->err if
+ * the bonus data cannot be read, and 0 on success.
+ */
+static int
+restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+{
+	int err;
+	dmu_tx_t *tx;
+
+	err = dmu_object_info(os, drro->drr_object, NULL);
+
+	if (err != 0 && err != ENOENT)
+		return (EINVAL);
+
+	/* the record comes from the stream, so validate every field */
+	if (drro->drr_type == DMU_OT_NONE ||
+	    drro->drr_type >= DMU_OT_NUMTYPES ||
+	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
+	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
+	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+		return (EINVAL);
+	}
+
+	tx = dmu_tx_create(os);
+
+	if (err == ENOENT) {
+		/* currently free, want to be allocated */
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		err = dmu_object_claim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	} else {
+		/* currently allocated, want to be allocated */
+		dmu_tx_hold_bonus(tx, drro->drr_object);
+		/*
+		 * We may change blocksize, so need to
+		 * hold_write
+		 */
+		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+
+		err = dmu_object_reclaim(os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	}
+	if (err) {
+		/* the tx is already assigned, so it is committed here */
+		dmu_tx_commit(tx);
+		return (EINVAL);
+	}
+
+	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
+	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+
+	if (drro->drr_bonuslen) {
+		dmu_buf_t *db;
+		void *data;
+		db = dmu_bonus_hold(os, drro->drr_object);
+		dmu_buf_will_dirty(db, tx);
+
+		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
+		/* stream payloads are padded to a multiple of 8 bytes */
+		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+		if (data == NULL) {
+			/* drop the bonus hold taken above before bailing */
+			dmu_buf_rele(db);
+			dmu_tx_commit(tx);
+			return (ra->err);
+		}
+		bcopy(data, db->db_data, db->db_size);
+		if (ra->byteswap) {
+			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+			    drro->drr_bonuslen);
+		}
+		dmu_buf_rele(db);
+	}
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREEOBJECTS record: free each object in the range
+ * [drr_firstobj, drr_firstobj + drr_numobjs) that dmu_object_info() can
+ * find, using one transaction per object.
+ *
+ * Returns EINVAL if the range wraps or dmu_object_free() fails with
+ * anything but ENOENT, the dmu_tx_assign() error if a transaction cannot
+ * be assigned, and 0 otherwise.  'ra' is not used.
+ */
+/* ARGSUSED */
+static int
+restore_freeobjects(struct restorearg *ra, objset_t *os,
+    struct drr_freeobjects *drrfo)
+{
+	uint64_t first = drrfo->drr_firstobj;
+	uint64_t limit = first + drrfo->drr_numobjs;
+	uint64_t obj;
+
+	/* a limit below the start means the range wrapped around */
+	if (limit < first)
+		return (EINVAL);
+
+	for (obj = first; obj < limit; obj++) {
+		dmu_tx_t *tx;
+		int err;
+
+		/* skip objects that cannot be looked up */
+		if (dmu_object_info(os, obj, NULL) != 0)
+			continue;
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, obj);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err != 0) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+
+		err = dmu_object_free(os, obj, tx);
+		dmu_tx_commit(tx);
+		if (err != 0 && err != ENOENT)
+			return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Apply a DRR_WRITE record: read drr_length bytes of payload from the
+ * stream and write them to object drr_object at drr_offset, byte-swapping
+ * them first according to drr_type when the stream is in the opposite
+ * byte order.
+ *
+ * Returns EINVAL if the record is malformed (offset + length wraps, the
+ * length exceeds the restore buffer, or the type is out of range) or the
+ * object does not exist, ra->err if the payload cannot be read, the
+ * dmu_tx_assign() error if the transaction cannot be assigned, and 0 on
+ * success.
+ */
+static int
+restore_write(struct restorearg *ra, objset_t *os,
+    struct drr_write *drrw)
+{
+	dmu_tx_t *tx;
+	void *data;
+	int err;
+
+	/*
+	 * drr_length is a 64-bit value from the stream but restore_read()
+	 * takes an int, so a length that does not fit in the restore
+	 * buffer must be rejected before it is narrowed.
+	 */
+	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+	    drrw->drr_length > (uint64_t)ra->bufsize ||
+	    drrw->drr_type >= DMU_OT_NUMTYPES)
+		return (EINVAL);
+
+	/* consume the payload even if the object turns out to be missing */
+	data = restore_read(ra, drrw->drr_length);
+	if (data == NULL)
+		return (ra->err);
+
+	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	/* swap in place in the restore buffer before writing it out */
+	if (ra->byteswap)
+		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
+	dmu_write(os, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length, data, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/*
+ * Apply a DRR_FREE record: free the range [drr_offset, drr_offset +
+ * drr_length) of object drr_object.
+ *
+ * Returns EINVAL if the range wraps (a length of -1 is exempt from that
+ * check) or the object does not exist, the dmu_tx_assign() error if the
+ * transaction cannot be assigned, and 0 on success.  'ra' is not used.
+ */
+/* ARGSUSED */
+static int
+restore_free(struct restorearg *ra, objset_t *os,
+    struct drr_free *drrf)
+{
+	uint64_t object = drrf->drr_object;
+	uint64_t start = drrf->drr_offset;
+	uint64_t length = drrf->drr_length;
+	dmu_tx_t *tx;
+	int err;
+
+	if (length != -1ULL && start + length < start)
+		return (EINVAL);
+
+	if (dmu_object_info(os, object, NULL) != 0)
+		return (EINVAL);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, object, start, length);
+
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	dmu_free_range(os, object, start, length, tx);
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Restore a backup stream into the dataset named by drrb->drr_toname.
+ *
+ *	drrb	the stream's begin record, supplied by the caller; it is
+ *		byte-swapped in place here if the stream's magic number is
+ *		in the opposite byte order
+ *	sizep	if non-NULL, set on return to the file offset reached
+ *	vp	vnode the remaining records are read from
+ *	voffset	offset in vp at which reading starts
+ *
+ * A non-zero drr_fromguid selects an incremental restore (handled by
+ * replay_incremental_sync); otherwise a full restore is done (handled by
+ * replay_full_sync).  Records are then read and applied until DRR_END,
+ * which runs replay_end_sync.  If an error occurs after the objset has
+ * been opened, the partial restore is rolled back (incremental) or
+ * destroyed (full).
+ *
+ * Returns 0 on success or an errno value: EINVAL for a malformed stream,
+ * ENOENT/EEXIST for a bad target name, EINTR if a signal is pending, or
+ * whatever a sync task or record handler returned.
+ */
+int
+dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+    vnode_t *vp, uint64_t voffset)
+{
+	struct restorearg ra;
+	dmu_replay_record_t *drr;
+	char *cp, *tosnap;
+	dsl_dir_t *dd = NULL;
+	objset_t *os = NULL;
+
+	bzero(&ra, sizeof (ra));
+	ra.vp = vp;
+	ra.voff = voffset;
+	ra.bufsize = 1<<20;
+	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+
+	/* the magic number tells us whether the stream needs swapping */
+	if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
+		ra.byteswap = FALSE;
+	} else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+		ra.byteswap = TRUE;
+	} else {
+		ra.err = EINVAL;
+		goto out;
+	}
+
+	if (ra.byteswap) {
+		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+		drrb->drr_version = BSWAP_64(drrb->drr_version);
+		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+		drrb->drr_type = BSWAP_32(drrb->drr_type);
+		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+	}
+
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+	tosnap = drrb->drr_toname;
+	if (drrb->drr_version != DMU_BACKUP_VERSION ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES ||
+	    strchr(drrb->drr_toname, '@') == NULL) {
+		ra.err = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Process the begin in syncing context.
+	 */
+	if (drrb->drr_fromguid) {
+		/* incremental backup */
+
+		/* temporarily cut the name at '@' to open the filesystem */
+		cp = strchr(tosnap, '@');
+		*cp = '\0';
+		dd = dsl_dir_open(tosnap, FTAG, NULL);
+		*cp = '@';
+		if (dd == NULL) {
+			ra.err = ENOENT;
+			goto out;
+		}
+
+		ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
+		    drrb, 1<<20);
+	} else {
+		/* full backup */
+		const char *tail;
+
+		cp = strchr(tosnap, '@');
+		*cp = '\0';
+		dd = dsl_dir_open(tosnap, FTAG, &tail);
+		*cp = '@';
+		if (dd == NULL) {
+			ra.err = ENOENT;
+			goto out;
+		}
+		/* a NULL tail is treated as "target already exists" */
+		if (tail == NULL) {
+			ra.err = EEXIST;
+			goto out;
+		}
+
+		ra.err = dsl_dir_sync_task(dd, replay_full_sync,
+		    drrb, 1<<20);
+	}
+	if (ra.err)
+		goto out;
+
+	/*
+	 * Open the objset we are modifying.
+	 */
+
+	cp = strchr(tosnap, '@');
+	*cp = '\0';
+	ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
+	    DS_MODE_PRIMARY | DS_MODE_RESTORE, &os);
+	*cp = '@';
+	ASSERT3U(ra.err, ==, 0);
+
+	/*
+	 * Read records and process them.
+	 */
+	while (ra.err == 0 &&
+	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+		/* let a pending signal interrupt a long restore */
+		if (issig(JUSTLOOKING)) {
+			ra.err = EINTR;
+			goto out;
+		}
+
+		if (ra.byteswap)
+			backup_byteswap(drr);
+
+		switch (drr->drr_type) {
+		case DRR_OBJECT:
+		{
+			/*
+			 * We need to make a copy of the record header,
+			 * because restore_{object,write} may need to
+			 * restore_read(), which will invalidate drr.
+			 */
+			struct drr_object drro = drr->drr_u.drr_object;
+			ra.err = restore_object(&ra, os, &drro);
+			break;
+		}
+		case DRR_FREEOBJECTS:
+		{
+			struct drr_freeobjects drrfo =
+			    drr->drr_u.drr_freeobjects;
+			ra.err = restore_freeobjects(&ra, os, &drrfo);
+			break;
+		}
+		case DRR_WRITE:
+		{
+			struct drr_write drrw = drr->drr_u.drr_write;
+			ra.err = restore_write(&ra, os, &drrw);
+			break;
+		}
+		case DRR_FREE:
+		{
+			struct drr_free drrf = drr->drr_u.drr_free;
+			ra.err = restore_free(&ra, os, &drrf);
+			break;
+		}
+		case DRR_END:
+			/* Need to verify checksum. */
+			/*
+			 * dd may be the parent of the dd we are
+			 * restoring into (eg. if it's a full backup).
+			 */
+			ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
+			    ds_dir, replay_end_sync, drrb, 1<<20);
+			goto out;
+		default:
+			ra.err = EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	if (os)
+		dmu_objset_close(os);
+
+	/*
+	 * Make sure we don't rollback/destroy unless we actually
+	 * processed the begin properly.  'os' will only be set if this
+	 * is the case.
+	 */
+	/* 'os' is only tested here, not dereferenced; it is closed above */
+	if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) {
+		/*
+		 * rollback or destroy what we created, so we don't
+		 * leave it in the restoring state.
+		 */
+		txg_wait_synced(dd->dd_pool, 0);
+		if (drrb->drr_fromguid) {
+			/* incremental: rollback to most recent snapshot */
+			(void) dsl_dir_sync_task(dd,
+			    dsl_dataset_rollback_sync, NULL, 0);
+		} else {
+			/* full: destroy whole fs */
+			cp = strchr(tosnap, '@');
+			*cp = '\0';
+			/*
+			 * NOTE(review): this passes everything after the
+			 * first '/', whereas replay_full_sync() takes the
+			 * last component with strrchr() -- confirm which
+			 * form dsl_dir_destroy_sync() expects.
+			 */
+			cp = strchr(tosnap, '/');
+			if (cp) {
+				(void) dsl_dir_sync_task(dd,
+				    dsl_dir_destroy_sync, cp+1, 0);
+			}
+			/* put back the '@' that was overwritten above */
+			cp = strchr(tosnap, '\0');
+			*cp = '@';
+		}
+
+	}
+
+	if (dd)
+		dsl_dir_close(dd, FTAG);
+	kmem_free(ra.buf, ra.bufsize);
+	if (sizep)
+		*sizep = ra.voff;
+	return (ra.err);
+}
+
+/*
+ * Intent log support: sync the block at <os, object, offset> to disk.
+ * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
+ * of the same block, and for making sure that the data isn't changing
+ * while dmu_sync() is writing it.
+ *
+ * The caller must hold tx_suspend, pass a non-zero txg, and pass in a
+ * 'bp' that is a hole (all three are asserted below).
+ *
+ * Return values:
+ *
+ *	EALREADY: this txg has already been synced, so there's nothing to do.
+ *		The caller should not log the write.
+ *
+ *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ *		The caller should not log the write.
+ *
+ *	EINPROGRESS: the block is in the process of being synced by the
+ *		usual mechanism (spa_sync()), so we can't sync it here.
+ *		The caller should txg_wait_synced() and not log the write.
+ *
+ *	EBUSY: another thread is trying to dmu_sync() the same dbuf.
+ *		(This case cannot arise under the current locking rules.)
+ *		The caller should txg_wait_synced() and not log the write.
+ *
+ *	ESTALE: the block was dirtied or freed while we were writing it,
+ *		so the data is no longer valid.
+ *		The caller should txg_wait_synced() and not log the write.
+ *
+ *	0: success.  Sets *bp to the blkptr just written, and sets
+ *		*blkoff to the data's offset within that block.
+ *		The caller should log this blkptr/blkoff in its lr_write_t.
+ */
+int
+dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+    blkptr_t *bp, uint64_t txg)
+{
+	dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+	tx_state_t *tx = &dp->dp_tx;
+	dmu_buf_impl_t *db;
+	blkptr_t *blk;
+	int err;
+
+	ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
+	ASSERT(BP_IS_HOLE(bp));
+	ASSERT(txg != 0);
+
+	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
+	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+
+	/*
+	 * If this txg already synced, there's nothing to do.
+	 */
+	if (txg <= tx->tx_synced_txg) {
+		/*
+		 * If we're running ziltest, we need the blkptr regardless.
+		 */
+		if (txg > spa_freeze_txg(dp->dp_spa)) {
+			db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+			/* if db_blkptr == NULL, this was an empty write */
+			if (db->db_blkptr)
+				*bp = *db->db_blkptr; /* structure assignment */
+			else
+				bzero(bp, sizeof (blkptr_t));
+			*blkoff = offset - db->db.db_offset;
+			ASSERT3U(*blkoff, <, db->db.db_size);
+			dmu_buf_rele((dmu_buf_t *)db);
+			return (0);
+		}
+		return (EALREADY);
+	}
+
+	/*
+	 * If this txg is in the middle of syncing, just wait for it.
+	 */
+	if (txg == tx->tx_syncing_txg) {
+		ASSERT(txg != tx->tx_open_txg);
+		return (EINPROGRESS);
+	}
+
+	db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+
+	mutex_enter(&db->db_mtx);
+
+	/*
+	 * If this dbuf isn't dirty, must have been free_range'd.
+	 * There's no need to log writes to freed blocks, so we're done.
+	 */
+	if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
+		mutex_exit(&db->db_mtx);
+		dmu_buf_rele((dmu_buf_t *)db);
+		return (ENOENT);
+	}
+
+	blk = db->db_d.db_overridden_by[txg&TXG_MASK];
+
+	/*
+	 * If we already did a dmu_sync() of this dbuf in this txg,
+	 * free the old block before writing the new one.
+	 */
+	if (blk != NULL) {
+		ASSERT(blk != IN_DMU_SYNC);
+		/* still handled explicitly where ASSERT compiles away */
+		if (blk == IN_DMU_SYNC) {
+			mutex_exit(&db->db_mtx);
+			dmu_buf_rele((dmu_buf_t *)db);
+			return (EBUSY);
+		}
+		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+		if (!BP_IS_HOLE(blk)) {
+			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			    NULL, NULL, ARC_WAIT);
+		}
+		kmem_free(blk, sizeof (blkptr_t));
+	}
+
+	/*
+	 * Mark the dbuf as being synced by us; the marker is re-checked
+	 * after the write, which is done without db_mtx held.
+	 */
+	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
+	mutex_exit(&db->db_mtx);
+
+	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+	blk->blk_birth = 0; /* mark as invalid */
+
+	err = arc_write(NULL, os->os->os_spa,
+	    zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
+	    zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+	ASSERT(err == 0);
+
+	if (!BP_IS_HOLE(blk)) {
+		blk->blk_fill = 1;
+		BP_SET_TYPE(blk, db->db_dnode->dn_type);
+		BP_SET_LEVEL(blk, 0);
+	}
+
+	/* copy the block pointer back to caller */
+	*bp = *blk; /* structure assignment */
+	*blkoff = offset - db->db.db_offset;
+	ASSERT3U(*blkoff, <, db->db.db_size);
+
+	mutex_enter(&db->db_mtx);
+	if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
+		/* we were dirtied/freed during the sync */
+		ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
+		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+		mutex_exit(&db->db_mtx);
+		dmu_buf_rele((dmu_buf_t *)db);
+		/* Note that this block does not free on disk until txg syncs */
+
+		/*
+		 * XXX can we use ARC_NOWAIT here?
+		 * XXX should we be ignoring the return code?
+		 */
+		if (!BP_IS_HOLE(blk)) {
+			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			    NULL, NULL, ARC_WAIT);
+		}
+		kmem_free(blk, sizeof (blkptr_t));
+		return (ESTALE);
+	}
+
+	/* record the block we wrote in place of the IN_DMU_SYNC marker */
+	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+	mutex_exit(&db->db_mtx);
+	dmu_buf_rele((dmu_buf_t *)db);
+	ASSERT3U(txg, >, tx->tx_syncing_txg);
+	return (0);
+}
+
+/*
+ * Hold the dnode for 'object' in 'os' and return what
+ * dnode_max_nonzero_offset() reports for it.
+ */
+uint64_t
+dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
+{
+	dnode_t *dnp;
+	uint64_t maxoff;
+
+	dnp = dnode_hold(os->os, object, FTAG);
+	maxoff = dnode_max_nonzero_offset(dnp);
+	dnode_rele(dnp, FTAG);
+
+	return (maxoff);
+}
+
+/*
+ * Set the data block size ('size') and indirect block shift ('ibs') of
+ * 'object' in transaction 'tx'.  Returns whatever dnode_set_blksz()
+ * returns.
+ */
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+	dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+	int rc;
+
+	dnp = dnode_hold(os->os, object, FTAG);
+	rc = dnode_set_blksz(dnp, size, ibs, tx);
+	dnode_rele(dnp, FTAG);
+
+	return (rc);
+}
+
+/*
+ * Record 'checksum' as the checksum function of 'object' and mark its
+ * dnode dirty in 'tx'.  'checksum' must be below ZIO_CHECKSUM_FUNCTIONS.
+ */
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+	dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+
+	dnp = dnode_hold(os->os, object, FTAG);
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+	dnp->dn_checksum = checksum;
+	dnode_setdirty(dnp, tx);
+
+	dnode_rele(dnp, FTAG);
+}
+
+/*
+ * Record 'compress' as the compression function of 'object' and mark its
+ * dnode dirty in 'tx'.  'compress' must be below ZIO_COMPRESS_FUNCTIONS.
+ */
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+	dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+
+	dnp = dnode_hold(os->os, object, FTAG);
+	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+
+	dnp->dn_compress = compress;
+	dnode_setdirty(dnp, tx);
+
+	dnode_rele(dnp, FTAG);
+}
+
+/*
+ * Find the next hole (if 'hole' is set) or data in 'object', starting
+ * from *off, using dnode_next_offset(), and return that function's
+ * result.  If the dnode has any dirty block size recorded, wait for the
+ * pool to sync first so the search sees the on-disk block pointers.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	int is_dirty = 0;
+	int slot, err;
+
+	/*
+	 * Sync any current changes before
+	 * we go trundling through the block pointers.
+	 */
+	for (slot = 0; slot < TXG_SIZE && !is_dirty; slot++) {
+		if (dn->dn_dirtyblksz[slot])
+			is_dirty = 1;
+	}
+
+	if (is_dirty) {
+		/* drop the hold across the wait, then take it again */
+		dnode_rele(dn, FTAG);
+		txg_wait_synced(dmu_objset_pool(os), 0);
+		dn = dnode_hold(os->os, object, FTAG);
+	}
+
+	err = dnode_next_offset(dn, hole, off, 1, 1);
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+/*
+ * Fill in *doi from dnode 'dn'.  The fields are copied while holding
+ * dn_struct_rwlock as reader and dn_mtx.
+ */
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	mutex_enter(&dn->dn_mtx);
+
+	/* object and bonus typing */
+	doi->doi_type = dn->dn_type;
+	doi->doi_bonus_type = dn->dn_bonustype;
+	doi->doi_bonus_size = dn->dn_bonuslen;
+
+	/* block geometry; a zero indirect shift reports a size of zero */
+	doi->doi_data_block_size = dn->dn_datablksz;
+	if (dn->dn_indblkshift)
+		doi->doi_metadata_block_size = 1ULL << dn->dn_indblkshift;
+	else
+		doi->doi_metadata_block_size = 0;
+	doi->doi_indirection = dn->dn_nlevels;
+
+	/* per-object I/O properties */
+	doi->doi_checksum = dn->dn_checksum;
+	doi->doi_compress = dn->dn_compress;
+
+	/* values taken from the on-disk dnode */
+	doi->doi_physical_blks = dn->dn_phys->dn_secphys;
+	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
+
+	mutex_exit(&dn->dn_mtx);
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Look up 'object' in 'os'.  Returns ENOENT if its dnode cannot be held
+ * and 0 otherwise.  When 'doi' is non-NULL it is filled in with the
+ * object's details; pass NULL to simply test for existence.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+	dnode_t *dn;
+
+	dn = dnode_hold(os->os, object, FTAG);
+	if (dn != NULL) {
+		if (doi != NULL)
+			dmu_object_info_from_dnode(dn, doi);
+		dnode_rele(dn, FTAG);
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Variant of dmu_object_info() for callers that already hold a dbuf of
+ * the object: the dnode is taken straight from the dbuf, so no lookup is
+ * needed.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+
+	dmu_object_info_from_dnode(dbi->db_dnode, doi);
+}
+
+/*
+ * Size-only variant for callers that hold a dbuf of the object; written
+ * with zfs_getattr() in mind.  Stores the data block size in *blksize and
+ * the dnode's dn_secphys count plus one in *nblk512.
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	dnode_t *dn = dbi->db_dnode;
+
+	*blksize = dn->dn_datablksz;
+	/* add 1 for dnode space */
+	*nblk512 = dn->dn_phys->dn_secphys + 1;
+}
+
+/*
+ * Byte-swap, in place, the array of 64-bit words occupying the 'size'
+ * bytes at 'vbuf'.  'size' must be a multiple of 8.
+ */
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+	uint64_t *buf = vbuf;
+	size_t count = size >> 3;
+	size_t i;	/* same type as count: no signed/unsigned compare */
+
+	ASSERT((size & 7) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_64(buf[i]);
+}
+
+/*
+ * Byte-swap, in place, the array of 32-bit words occupying the 'size'
+ * bytes at 'vbuf'.  'size' must be a multiple of 4.
+ */
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+	uint32_t *buf = vbuf;
+	size_t count = size >> 2;
+	size_t i;	/* same type as count: no signed/unsigned compare */
+
+	ASSERT((size & 3) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_32(buf[i]);
+}
+
+/*
+ * Byte-swap, in place, the array of 16-bit words occupying the 'size'
+ * bytes at 'vbuf'.  'size' must be a multiple of 2.
+ */
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+	uint16_t *buf = vbuf;
+	size_t count = size >> 1;
+	size_t i;	/* same type as count: no signed/unsigned compare */
+
+	ASSERT((size & 1) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_16(buf[i]);
+}
+
+/*
+ * Byte-swap routine for arrays of single bytes.  There is nothing to
+ * swap, so this does nothing; it has the same signature as the wider
+ * byteswap_uint*_array() routines.
+ */
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+/*
+ * Initialize the subsystems the DMU is built on: the dbuf layer, the
+ * dnode layer, and the ARC, in that order.  dmu_fini() undoes this.
+ */
+void
+dmu_init(void)
+{
+	dbuf_init();
+	dnode_init();
+	arc_init();
+}
+
+/*
+ * Tear down what dmu_init() set up, in the reverse order: the ARC, the
+ * dnode layer, then the dbuf layer.
+ */
+void
+dmu_fini(void)
+{
+	arc_fini();
+	dnode_fini();
+	dbuf_fini();
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
new file mode 100644
index 000000000000..d150d6c40033
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+
+/*
+ * Allocate a new object in 'os' and return its object number.
+ *
+ * The search for a free dnode starts at osi->os_obj_next and runs under
+ * os_obj_lock.  The dnode found is set up with dnode_allocate() using
+ * the given type, block size, bonus type and bonus length, and the new
+ * object is registered with 'tx' via dmu_tx_add_new_object().
+ *
+ * The loop only exits once a free dnode has been found, so this function
+ * has no failure return.
+ */
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	objset_impl_t *osi = os->os;
+	uint64_t object;
+	/* number of dnodes reachable through one L2 block pointer */
+	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+	    (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+	dnode_t *dn;
+	int restarted = B_FALSE;
+
+	mutex_enter(&osi->os_obj_lock);
+	for (;;) {
+		object = osi->os_obj_next;
+		/*
+		 * Each time we polish off an L2 bp worth of dnodes
+		 * (2^13 objects), move to another L2 bp that's still
+		 * reasonably sparse (at most 1/4 full).  Look from the
+		 * beginning once, but after that keep looking from here.
+		 * If we can't find one, just keep going from here.
+		 */
+		if (P2PHASE(object, L2_dnode_count) == 0) {
+			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
+			int error = dnode_next_offset(osi->os_meta_dnode,
+			    B_TRUE, &offset, 2, DNODES_PER_BLOCK >> 2);
+			restarted = B_TRUE;
+			if (error == 0)
+				object = offset >> DNODE_SHIFT;
+		}
+		/* the candidate is the object after the current position */
+		osi->os_obj_next = ++object;
+
+		dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+		if (dn)
+			break;
+
+		/*
+		 * Candidate is in use: jump ahead to the next hole.  The
+		 * "- 1" offsets the pre-increment done at the top of the
+		 * next iteration.
+		 */
+		if (dmu_object_next(os, &object, B_TRUE) == 0)
+			osi->os_obj_next = object - 1;
+	}
+
+	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+	dnode_rele(dn, FTAG);
+
+	mutex_exit(&osi->os_obj_lock);
+
+	dmu_tx_add_new_object(tx, os, object);
+	return (object);
+}
+
+/*
+ * Allocate the specific, currently free object number 'object' in 'os'
+ * with the given type, block size, bonus type and bonus length.
+ *
+ * Returns EBADF if 'object' is a private object and 'tx' is not allowed
+ * to touch private objects, EEXIST if the dnode cannot be held as free,
+ * and 0 on success.
+ */
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+
+	if (object & DMU_PRIVATE_OBJECT) {
+		if (!dmu_tx_private_ok(tx))
+			return (EBADF);
+	}
+
+	dnp = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+	if (dnp == NULL)
+		return (EEXIST);
+
+	dnode_allocate(dnp, ot, blocksize, 0, bonustype, bonuslen, tx);
+	dnode_rele(dnp, FTAG);
+
+	dmu_tx_add_new_object(tx, os, object);
+
+	return (0);
+}
+
+/*
+ * Re-initialize the already allocated object 'object' in 'os' with a new
+ * type, block size, bonus type and bonus length.
+ *
+ * Returns EBADF if 'object' is a private object and 'tx' is not allowed
+ * to touch private objects, or if the dnode cannot be held as allocated;
+ * returns 0 on success.
+ */
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+
+	if (object & DMU_PRIVATE_OBJECT) {
+		if (!dmu_tx_private_ok(tx))
+			return (EBADF);
+	}
+
+	dnp = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+	if (dnp == NULL)
+		return (EBADF);
+
+	dnode_reallocate(dnp, ot, blocksize, bonustype, bonuslen, tx);
+	dnode_rele(dnp, FTAG);
+
+	return (0);
+}
+
+/*
+ * Free the allocated object 'object' in 'os' as part of 'tx'.
+ *
+ * Returns ENOENT if the dnode cannot be held as allocated and 0 on
+ * success.  A private object may only be freed by a tx that is allowed
+ * to touch private objects (asserted).
+ */
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dnp;
+
+	ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+
+	dnp = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
+	if (dnp != NULL) {
+		ASSERT(dnp->dn_type != DMU_OT_NONE);
+		dnode_free(dnp, tx);
+		dnode_rele(dnp, FTAG);
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Advance *objectp to the next object number after its current value
+ * that dnode_next_offset() finds in the meta-dnode: a free one when
+ * 'hole' is set, otherwise an allocated one.
+ *
+ * *objectp is rewritten from the search offset whether or not the search
+ * succeeded.  Returns the dnode_next_offset() result.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole)
+{
+	uint64_t off;
+	int rc;
+
+	/* start looking at the dnode after the current object */
+	off = (*objectp + 1) << DNODE_SHIFT;
+
+	rc = dnode_next_offset(os->os->os_meta_dnode,
+	    hole, &off, 0, DNODES_PER_BLOCK);
+
+	*objectp = off >> DNODE_SHIFT;
+
+	return (rc);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
new file mode 100644
index 000000000000..9bb621b9a1a8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -0,0 +1,727 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio_checksum.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+
+
+/*
+ * Return the spa_t recorded in the objset's implementation structure.
+ */
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_spa);
+}
+
+/*
+ * Return the intent log handle (os_zil) attached to the objset.
+ */
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_zil);
+}
+
+/*
+ * Return the dsl_pool_t for an objset.  When the objset has a dataset
+ * with a ds_dir, the pool comes from that directory; otherwise it is
+ * looked up from the spa with spa_get_dsl().
+ */
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (ds == NULL || ds->ds_dir == NULL)
+		return (spa_get_dsl(os->os->os_spa));
+
+	return (ds->ds_dir->dd_pool);
+}
+
+/*
+ * Return the dataset the objset belongs to; this is the os_dsl_dataset
+ * pointer as stored, so it may be NULL.
+ */
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_dsl_dataset);
+}
+
+/*
+ * Return the objset type stored in the objset's physical header.
+ */
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+	objset_impl_t *osi = os->os;
+
+	return (osi->os_phys->os_type);
+}
+
+/*
+ * Write the objset's dataset name into 'buf' via dsl_dataset_name().
+ * The required size of 'buf' is not visible here; see dsl_dataset_name().
+ */
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	dsl_dataset_name(ds, buf);
+}
+
+/*
+ * Return the object number of the objset's dataset (ds_object), or 0
+ * when the objset has no dataset.
+ */
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (ds == NULL)
+		return (0);
+
+	return (ds->ds_object);
+}
+
+/*
+ * Property callback for "checksum": 'arg' is the objset_impl_t that was
+ * registered, 'newval' the new property value.  Stores the checksum
+ * chosen by zio_checksum_select() in os_checksum.
+ */
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+	objset_impl_t *osi;
+
+	/* An "inherit" setting must already have been resolved. */
+	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+	osi = arg;
+	osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+/*
+ * Property callback for "compression": 'arg' is the objset_impl_t that
+ * was registered, 'newval' the new property value.  Stores the algorithm
+ * chosen by zio_compress_select() in os_compress.
+ */
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+	objset_impl_t *osi;
+
+	/* Inheritance and range checks are expected to be done already. */
+	ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+	osi = arg;
+	osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+}
+
+/*
+ * Byteswap an objset_phys_t in place.  'size' must be exactly
+ * sizeof (objset_phys_t); this is asserted.
+ */
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+	objset_phys_t *phys = buf;
+
+	ASSERT(size == sizeof (objset_phys_t));
+
+	/* Swap each member in turn: meta-dnode, ZIL header, type. */
+	dnode_byteswap(&phys->os_meta_dnode);
+	byteswap_uint64_array(&phys->os_zil_header, sizeof (zil_header_t));
+	phys->os_type = BSWAP_64(phys->os_type);
+}
+
+/*
+ * Build the in-core objset_impl_t for the objset whose root block
+ * pointer is *bp.  When 'bp' is NULL, os_rootbp is left zeroed from
+ * kmem_zalloc().
+ *
+ * 'ds' is the owning dataset, or NULL for the meta-objset.  When 'ds' is
+ * non-NULL the new objset_impl_t is offered to the dataset through
+ * dsl_dataset_set_user_ptr(); if that call hands back a different
+ * objset_impl_t, ours is torn down with dmu_objset_evict() and the
+ * returned one is used instead.
+ *
+ * Returns the objset_impl_t to use.
+ */
+objset_impl_t *
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
+{
+	objset_impl_t *winner, *osi;
+	int i, err, checksum;
+
+	osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP);
+	/* The embedded objset_t handle points back at its implementation. */
+	osi->os.os = osi;
+	osi->os_dsl_dataset = ds;
+	osi->os_spa = spa;
+	if (bp)
+		osi->os_rootbp = *bp;
+	osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
+	if (!BP_IS_HOLE(&osi->os_rootbp)) {
+		/* Existing objset: read its header, waiting for completion. */
+		dprintf_bp(&osi->os_rootbp, "reading %s", "");
+		(void) arc_read(NULL, spa, &osi->os_rootbp,
+		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+		    arc_bcopy_func, osi->os_phys,
+		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+	} else {
+		/* No root block: start from an all-zero header. */
+		bzero(osi->os_phys, sizeof (objset_phys_t));
+	}
+	osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
+	/*
+	 * Note: the changed_cb will be called once before the register
+	 * func returns, thus changing the checksum/compression from the
+	 * default (fletcher2/off).
+	 */
+	if (ds) {
+		err = dsl_prop_register(ds, "checksum",
+		    checksum_changed_cb, osi);
+		ASSERT(err == 0);
+
+		err = dsl_prop_register(ds, "compression",
+		    compression_changed_cb, osi);
+		ASSERT(err == 0);
+	} else {
+		/* It's the meta-objset. */
+		osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+		osi->os_compress = ZIO_COMPRESS_LZJB;
+	}
+
+	/*
+	 * Metadata always gets compressed and checksummed.
+	 * If the data checksum is multi-bit correctable, and it's not
+	 * a ZBT-style checksum, then it's suitable for metadata as well.
+	 * Otherwise, the metadata checksum defaults to fletcher4.
+	 */
+	checksum = osi->os_checksum;
+
+	if (zio_checksum_table[checksum].ci_correctable &&
+	    !zio_checksum_table[checksum].ci_zbt)
+		osi->os_md_checksum = checksum;
+	else
+		osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+	osi->os_md_compress = ZIO_COMPRESS_LZJB;
+
+	/*
+	 * The dirty and free lists for txg slot i both link dnodes through
+	 * dn_dirty_link[i].
+	 */
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
+		    offsetof(dnode_t, dn_dirty_link[i]));
+		list_create(&osi->os_free_dnodes[i], sizeof (dnode_t),
+		    offsetof(dnode_t, dn_dirty_link[i]));
+	}
+	list_create(&osi->os_dnodes, sizeof (dnode_t),
+	    offsetof(dnode_t, dn_link));
+	list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+
+	osi->os_meta_dnode = dnode_special_open(osi,
+	    &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+
+	if (ds != NULL) {
+		/*
+		 * A non-NULL return means the dataset already has a user
+		 * pointer (presumably installed by a racing open -- confirm
+		 * against dsl_dataset_set_user_ptr()); discard ours.
+		 */
+		winner = dsl_dataset_set_user_ptr(ds, osi, dmu_objset_evict);
+		if (winner) {
+			dmu_objset_evict(ds, osi);
+			osi = winner;
+		}
+	}
+
+	return (osi);
+}
+
+/*
+ * Open the objset called 'name' with dataset mode 'mode' and return a
+ * freshly allocated handle through *osp.  Called from the ZPL.
+ *
+ * 'type' must be DMU_OST_ANY or match the on-disk objset type; a mismatch
+ * closes the handle again and yields EINVAL.  An error from
+ * dsl_dataset_open() is returned unchanged.  Returns 0 on success.
+ */
+int
+dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+    objset_t **osp)
+{
+	objset_t *handle;
+	objset_impl_t *impl;
+	dsl_dataset_t *ds;
+	int error;
+
+	/* The handle itself serves as the tag for the dataset open. */
+	handle = kmem_alloc(sizeof (objset_t), KM_SLEEP);
+	error = dsl_dataset_open(name, mode, handle, &ds);
+	if (error != 0) {
+		kmem_free(handle, sizeof (objset_t));
+		return (error);
+	}
+
+	/* Reuse the dataset's objset_impl_t if present, else build one. */
+	impl = dsl_dataset_get_user_ptr(ds);
+	if (impl == NULL) {
+		blkptr_t rootbp;
+
+		dsl_dataset_get_blkptr(ds, &rootbp);
+		impl = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds,
+		    &rootbp);
+	}
+
+	handle->os = impl;
+	handle->os_mode = mode;
+
+	if (type == DMU_OST_ANY || type == handle->os->os_phys->os_type) {
+		*osp = handle;
+		return (0);
+	}
+
+	dmu_objset_close(handle);
+	return (EINVAL);
+}
+
+/*
+ * Close a handle obtained from dmu_objset_open(): close the dataset with
+ * the mode recorded in the handle (the handle is the tag), then free the
+ * handle itself.
+ */
+void
+dmu_objset_close(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	dsl_dataset_close(ds, os->os_mode, os);
+	kmem_free(os, sizeof (objset_t));
+}
+
+/*
+ * Tear down the objset_impl_t passed as 'arg'.  'ds' is the dataset it
+ * was opened for; when non-NULL, the property callbacks registered on it
+ * are unregistered first.
+ *
+ * The objset is expected to be idle: no dirty or free dnodes in any txg
+ * slot, the meta-dnode as the only entry on os_dnodes, and no dbufs on
+ * the meta-dnode.  These conditions are asserted, not enforced.
+ */
+void
+dmu_objset_evict(dsl_dataset_t *ds, void *arg)
+{
+	objset_impl_t *osi = arg;
+	int txgidx, rc;
+
+	for (txgidx = 0; txgidx < TXG_SIZE; txgidx++) {
+		ASSERT(list_head(&osi->os_dirty_dnodes[txgidx]) == NULL);
+		ASSERT(list_head(&osi->os_free_dnodes[txgidx]) == NULL);
+	}
+
+	if (ds != NULL) {
+		rc = dsl_prop_unregister(ds, "checksum",
+		    checksum_changed_cb, osi);
+		ASSERT(rc == 0);
+
+		rc = dsl_prop_unregister(ds, "compression",
+		    compression_changed_cb, osi);
+		ASSERT(rc == 0);
+	}
+
+	ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
+	ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
+	ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
+
+	/* Release sub-objects before the structures that own them. */
+	dnode_special_close(osi->os_meta_dnode);
+	zil_free(osi->os_zil);
+
+	zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+	kmem_free(osi, sizeof (objset_impl_t));
+}
+
+/*
+ * Create a new, empty objset of type 'type' in syncing context and
+ * return its objset_impl_t.  'ds' is the owning dataset, or NULL for the
+ * meta-objset (MOS); the DSL calls this for the meta-objset.
+ */
+objset_impl_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
+    dmu_tx_t *tx)
+{
+	objset_impl_t *osi;
+	dnode_t *meta;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	osi = dmu_objset_open_impl(spa, ds, NULL);
+	meta = osi->os_meta_dnode;
+
+	dnode_allocate(meta, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
+	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+
+	/*
+	 * Pre-size the meta-dnode's indirection depth so that nlevels
+	 * never has to grow later: growing it could happen in quiescing
+	 * context while the dnode is also being accessed in open context.
+	 *
+	 * The MOS (ds == NULL) is exempt because it is only updated in
+	 * syncing context.  That is fortunate, since the MOS is the one
+	 * objset synced repeatedly as spa_sync() iterates to convergence,
+	 * so keeping its dn_nlevels small matters.
+	 */
+	if (ds != NULL) {
+		meta->dn_nlevels = DN_META_DNODE_LEVELS;
+		meta->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+		    meta->dn_nlevels;
+	}
+
+	ASSERT(type != DMU_OST_NONE);
+	ASSERT(type != DMU_OST_ANY);
+	ASSERT(type < DMU_OST_NUMTYPES);
+	osi->os_phys->os_type = type;
+
+	dsl_dataset_dirty(ds, tx);
+
+	return (osi);
+}
+
+/*
+ * Argument bundle handed from dmu_objset_create() to
+ * dmu_objset_create_sync() through dsl_dir_sync_task().
+ */
+struct oscarg {
+	/* optional hook run on a newly created (non-clone) objset */
+	void (*userfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+	void *userarg;			/* passed to userfunc as 'arg' */
+	dsl_dataset_t *clone_parent;	/* dataset to clone, or NULL */
+	const char *fullname;		/* name given to dmu_objset_create() */
+	const char *lastname;		/* 'tail' left by dsl_dir_open() */
+	dmu_objset_type_t type;		/* type for the new objset */
+};
+
+/*
+ * Sync task behind dmu_objset_create(): create the dataset described by
+ * the struct oscarg in 'arg' under directory 'dd'.
+ *
+ * If the new dataset's block pointer is a hole (an empty objset rather
+ * than a clone), the objset itself is created and the caller's
+ * 'userfunc', if any, is invoked on it.
+ *
+ * Returns the dsl_dataset_create_sync() error, or 0.
+ */
+static int
+dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct oscarg *oa = arg;
+	dsl_dataset_t *newds;
+	blkptr_t rootbp;
+	int error;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	error = dsl_dataset_create_sync(dd, oa->fullname, oa->lastname,
+	    oa->clone_parent, tx);
+	dprintf_dd(dd, "fn=%s ln=%s err=%d\n",
+	    oa->fullname, oa->lastname, error);
+	if (error != 0)
+		return (error);
+
+	/* The dataset was just created, so opening it must succeed. */
+	error = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &newds);
+	ASSERT3U(error, ==, 0);
+
+	dsl_dataset_get_blkptr(newds, &rootbp);
+	if (BP_IS_HOLE(&rootbp)) {
+		/* This is an empty dmu_objset; not a clone. */
+		objset_impl_t *osi = dmu_objset_create_impl(
+		    dsl_dataset_get_spa(newds), newds, oa->type, tx);
+
+		if (oa->userfunc != NULL)
+			oa->userfunc(&osi->os, oa->userarg, tx);
+	}
+
+	dsl_dataset_close(newds, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+	return (0);
+}
+
+/*
+ * Create the objset (or, for a name whose final component starts with
+ * '@', the snapshot) called 'name'.
+ *
+ * For an ordinary objset, 'type' is the new objset's type,
+ * 'clone_parent' is an optional objset to clone (it must have the same
+ * type), and 'func'/'arg' is an optional hook run on the new objset in
+ * the sync task.  For a snapshot, the snapshotted objset's intent log is
+ * suspended around the snapshot sync task.
+ *
+ * Returns ENOENT if the parent directory cannot be opened, EEXIST if
+ * dsl_dir_open() leaves no unresolved tail, EINVAL for a clone type
+ * mismatch, otherwise the error from the steps performed (0 on success).
+ */
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type,
+    objset_t *clone_parent,
+    void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg)
+{
+	dsl_dir_t *parent;
+	const char *tail;
+	int error = 0;
+
+	parent = dsl_dir_open(name, FTAG, &tail);
+	if (parent == NULL)
+		return (ENOENT);
+	if (tail == NULL) {
+		dsl_dir_close(parent, FTAG);
+		return (EEXIST);
+	}
+
+	dprintf("name=%s\n", name);
+
+	if (tail[0] != '@') {
+		struct oscarg oa = { 0 };
+
+		if (clone_parent != NULL) {
+			/* A clone must keep its origin's type. */
+			if (clone_parent->os->os_phys->os_type != type) {
+				dsl_dir_close(parent, FTAG);
+				return (EINVAL);
+			}
+			oa.clone_parent = clone_parent->os->os_dsl_dataset;
+		}
+		oa.userfunc = func;
+		oa.userarg = arg;
+		oa.fullname = name;
+		oa.lastname = tail;
+		oa.type = type;
+
+		error = dsl_dir_sync_task(parent, dmu_objset_create_sync, &oa,
+		    256*1024);
+	} else {
+		/*
+		 * Snapshot: make sure everything the caller might want
+		 * is on disk first.  XXX Sketchy to know about snapshots
+		 * here, better to put in DSL.
+		 */
+		objset_t *os;
+		size_t plen = strchr(name, '@') - name + 1;
+		char *pbuf = kmem_alloc(plen, KM_SLEEP);
+
+		/* pbuf = the part of 'name' before the '@'. */
+		bcopy(name, pbuf, plen - 1);
+		pbuf[plen - 1] = '\0';
+
+		error = dmu_objset_open(pbuf, DMU_OST_ANY, DS_MODE_STANDARD,
+		    &os);
+		if (error == 0) {
+			error = zil_suspend(dmu_objset_zil(os));
+			if (error == 0) {
+				error = dsl_dir_sync_task(parent,
+				    dsl_dataset_snapshot_sync,
+				    (void*)(tail+1), 16*1024);
+				zil_resume(dmu_objset_zil(os));
+			}
+			dmu_objset_close(os);
+		}
+		kmem_free(pbuf, plen);
+	}
+
+	dsl_dir_close(parent, FTAG);
+	return (error);
+}
+
+/*
+ * Destroy the objset called 'name'.  Returns the result of
+ * dsl_dataset_destroy().
+ */
+int
+dmu_objset_destroy(const char *name)
+{
+	objset_t *os;
+
+	/*
+	 * If an exclusive open succeeds, the destroy looks likely to
+	 * work, so throw away any unplayed replay log now.  Doing this
+	 * in dsl_dataset_destroy_sync() would be nicer, but the replay
+	 * log objset is modified in open context.  A failed open is
+	 * deliberately not treated as an error here.
+	 */
+	if (dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os) == 0) {
+		zil_destroy(dmu_objset_zil(os));
+		dmu_objset_close(os);
+	}
+
+	/* XXX uncache everything? */
+	return (dsl_dataset_destroy(name));
+}
+
+/*
+ * Roll back the objset called 'name'.  The objset is opened exclusively
+ * and its intent log is suspended and immediately resumed; only if that
+ * succeeds is dsl_dataset_rollback() called, after the objset has been
+ * closed again.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int
+dmu_objset_rollback(const char *name)
+{
+	objset_t *os;
+	int error;
+
+	error = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_EXCLUSIVE, &os);
+	if (error != 0)
+		return (error);
+
+	error = zil_suspend(dmu_objset_zil(os));
+	if (error == 0)
+		zil_resume(dmu_objset_zil(os));
+	dmu_objset_close(os);
+
+	if (error != 0)
+		return (error);
+
+	/* XXX uncache everything? */
+	return (dsl_dataset_rollback(name));
+}
+
+/*
+ * Sync every dnode on 'list', one level per pass, starting at level 0.
+ * Each pass issues its writes under a single root zio and waits for it
+ * before the next pass begins.  A dnode is dropped from the list once
+ * dnode_sync() returns nonzero for it; the function returns when the
+ * list is empty.
+ */
+static void
+dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int level, err;
+
+	for (level = 0; (dn = list_head(list)) != NULL; level++) {
+		zio_t *rootzio = zio_root(os->os_spa, NULL, NULL,
+		    ZIO_FLAG_MUSTSUCCEED);
+
+		ASSERT3U(level, <=, DN_MAX_LEVELS);
+
+		do {
+			/* Remember the successor before unlinking dn. */
+			dnode_t *next = list_next(list, dn);
+
+			list_remove(list, dn);
+			if (dnode_sync(dn, level, rootzio, tx) == 0) {
+				/*
+				 * Higher levels of this dnode still need
+				 * syncing; reinsert it where it was.
+				 */
+				if (next != NULL)
+					list_insert_before(list, next, dn);
+				else
+					list_insert_tail(list, dn);
+			}
+			dn = next;
+		} while (dn != NULL);
+
+		err = zio_wait(rootzio);
+		ASSERT(err == 0);
+	}
+}
+
+/*
+ * Completion callback passed to arc_write() by dmu_objset_sync() for the
+ * objset's root block; 'arg' is the objset_impl_t being synced.
+ *
+ * Recomputes os_rootbp.blk_fill, stamps the written block pointer with
+ * type DMU_OT_OBJSET and level 0, and, when the block moved, accounts for
+ * the old block dying and the new one being born in the dataset.
+ */
+/* ARGSUSED */
+static void
+killer(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+	objset_impl_t *os = arg;
+	objset_phys_t *written = zio->io_data;
+	dnode_phys_t *mdnp = &written->os_meta_dnode;
+	int bpidx;
+
+	ASSERT3U(zio->io_error, ==, 0);
+
+	/*
+	 * Fill count: one for the meta-dnode itself plus whatever each
+	 * of its block pointers covers.
+	 */
+	os->os_rootbp.blk_fill = 1;
+	for (bpidx = 0; bpidx < mdnp->dn_nblkptr; bpidx++)
+		os->os_rootbp.blk_fill += mdnp->dn_blkptr[bpidx].blk_fill;
+
+	BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
+	BP_SET_LEVEL(zio->io_bp, 0);
+
+	/* Same identity as before: nothing to account for. */
+	if (DVA_EQUAL(BP_IDENTITY(zio->io_bp),
+	    BP_IDENTITY(&zio->io_bp_orig)))
+		return;
+
+	dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
+	    os->os_synctx);
+	dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
+	    os->os_synctx);
+}
+
+
+/*
+ * Write out everything dirty in objset 'os' for the txg of 'tx'.
+ * Called from the DSL in syncing context.
+ *
+ * The order is: freed dnodes, dirty dnodes, intent log cleanup, the
+ * meta-dnode, and finally the objset's root block.  The resulting root
+ * block pointer is recorded in the dataset.  os_synctx is set to 'tx'
+ * for the duration so that the write-done callback (killer) can find it.
+ */
+/* called from dsl */
+void
+dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
+{
+	extern taskq_t *dbuf_tq;
+	int txgoff;
+	list_t *dirty_list;
+	int err;
+	/* Scratch buffer that carries the objset header to arc_write(). */
+	arc_buf_t *abuf =
+	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(os->os_synctx == NULL);
+	/* XXX the write_done callback should really give us the tx... */
+	os->os_synctx = tx;
+
+	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+	/* Index of this txg's slot in the per-txg dnode lists. */
+	txgoff = tx->tx_txg & TXG_MASK;
+
+	dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
+	dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
+
+	/*
+	 * Free intent log blocks up to this tx.
+	 */
+	zil_sync(os->os_zil, tx);
+
+	/*
+	 * Sync meta-dnode
+	 */
+	/* The dirty list is empty by now, so it can carry the meta-dnode. */
+	dirty_list = &os->os_dirty_dnodes[txgoff];
+	ASSERT(list_head(dirty_list) == NULL);
+	list_insert_tail(dirty_list, os->os_meta_dnode);
+	dmu_objset_sync_dnodes(os, dirty_list, tx);
+
+	/*
+	 * Sync the root block.
+	 */
+	bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
+	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
+	    os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+	ASSERT(err == 0);
+	arc_buf_free(abuf, FTAG);
+
+	dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
+
+	ASSERT3P(os->os_synctx, ==, tx);
+	/* Drain dbuf_tq before clearing os_synctx. */
+	taskq_wait(dbuf_tq);
+	os->os_synctx = NULL;
+}
+
+/*
+ * Fill in *dds for objset 'os'.  An objset with a dataset gets its stats
+ * from dsl_dataset_stats(); one without (asserted to be the meta-objset)
+ * gets a zeroed structure.  dds_type is always set from the on-disk
+ * objset type.
+ */
+void
+dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds)
+{
+	objset_impl_t *osi = os->os;
+
+	if (osi->os_dsl_dataset == NULL) {
+		ASSERT(osi->os_phys->os_type == DMU_OST_META);
+		bzero(dds, sizeof (*dds));
+	} else {
+		dsl_dataset_stats(osi->os_dsl_dataset, dds);
+	}
+
+	dds->dds_type = osi->os_phys->os_type;
+}
+
+/*
+ * Report whether 'os' is a snapshot, as decided by
+ * dsl_dataset_is_snapshot().  An objset without a dataset is never a
+ * snapshot (B_FALSE).
+ */
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+
+	if (ds == NULL)
+		return (B_FALSE);
+
+	return (dsl_dataset_is_snapshot(ds));
+}
+
+/*
+ * Return the next entry of the objset's snapshot-name ZAP.
+ *
+ * *offp is a serialized ZAP cursor position: it says where to read and
+ * is advanced past the returned entry.  The entry's name is copied into
+ * 'name' (capacity 'namelen' bytes, including the terminator) and its
+ * first integer into *id.
+ *
+ * Returns 0 on success, ENOENT when the dataset has no snapshot-name ZAP
+ * or the cursor yields no entry, and ENAMETOOLONG when the name does not
+ * fit; on error the outputs are left untouched.
+ */
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *id, uint64_t *offp)
+{
+	dsl_dataset_t *ds = os->os->os_dsl_dataset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	if (ds->ds_phys->ds_snapnames_zapobj == 0)
+		return (ENOENT);
+
+	zap_cursor_init_serialized(&zc,
+	    ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, *offp);
+
+	if (zap_cursor_retrieve(&zc, &za) != 0)
+		return (ENOENT);
+
+	if (strlen(za.za_name) + 1 > namelen)
+		return (ENAMETOOLONG);
+
+	(void) strcpy(name, za.za_name);
+	*id = za.za_first_integer;
+
+	/* Step past this entry and hand the new position back. */
+	zap_cursor_advance(&zc);
+	*offp = zap_cursor_serialize(&zc);
+
+	return (0);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ *
+ * Child directories are visited first (recursively), then -- only when
+ * 'flags' contains DS_FIND_SNAPSHOTS -- the snapshots of 'name', and
+ * finally 'name' itself if its directory has a head dataset.  If the
+ * directory cannot be opened the function returns without calling 'func'.
+ */
+void
+dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
+{
+	dsl_dir_t *dd;
+	objset_t *os;
+	uint64_t snapobj;
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	char *child;
+	int do_self;
+
+	dd = dsl_dir_open(name, FTAG, NULL);
+	if (dd == NULL)
+		return;
+
+	/* Decide now; 'dd' is closed before 'func' is applied to 'name'. */
+	do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+
+	/*
+	 * Iterate over all children.
+	 */
+	if (dd->dd_phys->dd_child_dir_zapobj != 0) {
+		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+		    dd->dd_phys->dd_child_dir_zapobj);
+		    zap_cursor_retrieve(&zc, &attr) == 0;
+		    (void) zap_cursor_advance(&zc)) {
+			ASSERT(attr.za_integer_length == sizeof (uint64_t));
+			ASSERT(attr.za_num_integers == 1);
+
+			/*
+			 * Build "<dir name>/<child name>" in a scratch
+			 * buffer and recurse into the child.
+			 */
+			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+			/* XXX could probably just use name here */
+			dsl_dir_name(dd, child);
+			(void) strcat(child, "/");
+			(void) strcat(child, attr.za_name);
+			dmu_objset_find(child, func, arg, flags);
+			kmem_free(child, MAXPATHLEN);
+		}
+	}
+
+	/*
+	 * Iterate over all snapshots.
+	 */
+	if ((flags & DS_FIND_SNAPSHOTS) &&
+	    dmu_objset_open(name, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os) == 0) {
+
+		/* Only the ZAP object number is needed; close right away. */
+		snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
+		dmu_objset_close(os);
+
+		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
+		    zap_cursor_retrieve(&zc, &attr) == 0;
+		    (void) zap_cursor_advance(&zc)) {
+			ASSERT(attr.za_integer_length == sizeof (uint64_t));
+			ASSERT(attr.za_num_integers == 1);
+
+			/* Snapshots are named "<dir name>@<snapshot name>". */
+			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+			/* XXX could probably just use name here */
+			dsl_dir_name(dd, child);
+			(void) strcat(child, "@");
+			(void) strcat(child, attr.za_name);
+			func(child, arg);
+			kmem_free(child, MAXPATHLEN);
+		}
+	}
+
+	dsl_dir_close(dd, FTAG);
+
+	/*
+	 * Apply to self if appropriate.
+	 */
+	if (do_self)
+		func(name, arg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
new file mode 100644
index 000000000000..036e3965cf1a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -0,0 +1,792 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+
+/*
+ * Shift count that scales a block id at indirection level 'level' to a
+ * block offset, where 'width' is the per-level shift (this file passes
+ * dn_indblkshift - SPA_BLKPTRSHIFT).
+ */
+#define	BP_SPAN_SHIFT(level, width)	((level) * (width))
+
+/*
+ * True when two block pointers have equal BP_IDENTITY() DVAs and the
+ * same birth txg.
+ */
+#define	BP_EQUAL(b1, b2)				\
+	(DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) &&	\
+	(b1)->blk_birth == (b2)->blk_birth)
+
+/*
+ * Compare two bookmarks.
+ *
+ * For ADVANCE_PRE, the visitation order is:
+ *
+ *	objset 0, 1, 2, ..., ZB_MAXOBJSET.
+ *	object 0, 1, 2, ..., ZB_MAXOBJECT.
+ *	blkoff 0, 1, 2, ...
+ *	level ZB_MAXLEVEL, ..., 2, 1, 0.
+ *
+ * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid
+ * ordering vector is:
+ *
+ *	< objset, object, blkoff, -level >
+ *
+ * For ADVANCE_POST, the starting offsets aren't sequential but ending
+ * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are.
+ * The visitation order is:
+ *
+ *	objset 1, 2, ..., ZB_MAXOBJSET, 0.
+ *	object 1, 2, ..., ZB_MAXOBJECT, 0.
+ *	blkoff 1, 2, ...
+ *	level 0, 1, 2, ..., ZB_MAXLEVEL.
+ *
+ * and thus a valid ordering vector is:
+ *
+ *	< objset - 1, object - 1, blkoff, level >
+ *
+ * Both orderings can be expressed as:
+ *
+ *	< objset + bias, object + bias, blkoff, level ^ bias >
+ *
+ * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST)
+ * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift).
+ *
+ * Special case: an objset's osphys is represented as level -1 of object 0.
+ * It is always either the very first or very last block we visit in an objset.
+ * Therefore, if either bookmark's level is -1, level alone determines order.
+ */
+static int
+compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp,
+    int advance)
+{
+	/* 0 for ADVANCE_PRE, -1 for anything else (i.e. ADVANCE_POST). */
+	int bias = (advance & ADVANCE_PRE) ? 0 : -1;
+	uint64_t sblkoff, eblkoff;
+	int slevel, elevel, wshift;
+
+	/* Returns <0, 0 or >0 as szb sorts before, with or after ezb. */
+	if (szb->zb_objset + bias < ezb->zb_objset + bias)
+		return (-1);
+
+	if (szb->zb_objset + bias > ezb->zb_objset + bias)
+		return (1);
+
+	slevel = szb->zb_level;
+	elevel = ezb->zb_level;
+
+	/*
+	 * The OR is negative exactly when at least one level is negative,
+	 * i.e. an osphys bookmark (level -1); level alone then decides.
+	 */
+	if ((slevel | elevel) < 0)
+		return ((slevel ^ bias) - (elevel ^ bias));
+
+	if (szb->zb_object + bias < ezb->zb_object + bias)
+		return (-1);
+
+	if (szb->zb_object + bias > ezb->zb_object + bias)
+		return (1);
+
+	/* Without a dnode the block offsets cannot be compared. */
+	if (dnp == NULL)
+		return (0);
+
+	wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* With bias -1 this is (blkid + 1), i.e. the ending offset. */
+	sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift);
+	eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift);
+
+	if (sblkoff < eblkoff)
+		return (-1);
+
+	if (sblkoff > eblkoff)
+		return (1);
+
+	/* Same offset: for ADVANCE_PRE the higher level sorts first. */
+	return ((elevel ^ bias) - (slevel ^ bias));
+}
+
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
+{							\
+	(zb)->zb_objset = objset;			\
+	(zb)->zb_object = object;			\
+	(zb)->zb_level = level;				\
+	(zb)->zb_blkid = blkid;				\
+}
+
+#define	SET_BOOKMARK_LB(zb, level, blkid)		\
+{							\
+	(zb)->zb_level = level;				\
+	(zb)->zb_blkid = blkid;				\
+}
+
+/*
+ * Move the segment's start bookmark to the first position of objset
+ * 'objset' for the given traversal order.
+ *
+ * ADVANCE_PRE starts an objset at its osphys (object 0, level -1);
+ * otherwise the start is object 1, level 0, and an objset number at or
+ * beyond ZB_MAXOBJSET wraps to objset 0.
+ *
+ * Returns ERANGE when there is nowhere left to go (ADVANCE_PRE with
+ * objset >= ZB_MAXOBJSET, or the new start lies past seg_end), else
+ * EAGAIN.
+ */
+static int
+advance_objset(zseg_t *zseg, uint64_t objset, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+	int pre = (advance & ADVANCE_PRE) != 0;
+
+	if (pre && objset >= ZB_MAXOBJSET)
+		return (ERANGE);
+
+	if (pre) {
+		SET_BOOKMARK(start, objset, 0, -1, 0);
+	} else {
+		if (objset >= ZB_MAXOBJSET)
+			objset = 0;
+		SET_BOOKMARK(start, objset, 1, 0, 0);
+	}
+
+	if (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0)
+		return (ERANGE);
+
+	return (EAGAIN);
+}
+
+/*
+ * Move the segment's start bookmark to the first position of object
+ * 'object' within the current objset, for the given traversal order.
+ *
+ * ADVANCE_PRE: an object number at or beyond ZB_MAXOBJECT rolls over to
+ * the osphys of the next objset; otherwise the start is 'object' at
+ * ZB_MAXLEVEL.  Other orders: if the bookmark is currently on object 0
+ * the next stop is this objset's osphys (level -1); otherwise the start
+ * is 'object' at level 0, with object numbers at or beyond ZB_MAXOBJECT
+ * wrapping to 0.
+ *
+ * Returns ERANGE if the new start lies past seg_end, else EAGAIN.
+ */
+static int
+advance_object(zseg_t *zseg, uint64_t object, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+
+	if (advance & ADVANCE_PRE) {
+		if (object < ZB_MAXOBJECT) {
+			SET_BOOKMARK(start, start->zb_objset, object,
+			    ZB_MAXLEVEL, 0);
+		} else {
+			SET_BOOKMARK(start, start->zb_objset + 1, 0, -1, 0);
+		}
+	} else if (start->zb_object == 0) {
+		SET_BOOKMARK(start, start->zb_objset, 0, -1, 0);
+	} else {
+		if (object >= ZB_MAXOBJECT)
+			object = 0;
+		SET_BOOKMARK(start, start->zb_objset, object, 0, 0);
+	}
+
+	if (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0)
+		return (ERANGE);
+
+	return (EAGAIN);
+}
+
+/*
+ * Step the segment's start bookmark off an objset's osphys, which is
+ * where it must currently be (object 0, level -1, blkid 0; asserted).
+ *
+ * ADVANCE_PRE continues within the same objset at ZB_MAXLEVEL, blkid 0.
+ * Other orders move on to object 1, level 0 of the next objset, except
+ * that leaving objset 0 ends the traversal.
+ *
+ * Returns ERANGE when the traversal is finished or the new start lies
+ * past seg_end, else EAGAIN.
+ */
+static int
+advance_from_osphys(zseg_t *zseg, int advance)
+{
+	zbookmark_t *start = &zseg->seg_start;
+
+	ASSERT(start->zb_object == 0);
+	ASSERT(start->zb_level == -1);
+	ASSERT(start->zb_blkid == 0);
+
+	if ((advance & ADVANCE_PRE) == 0) {
+		if (start->zb_objset == 0)
+			return (ERANGE);
+		SET_BOOKMARK(start, start->zb_objset + 1, 1, 0, 0);
+	} else {
+		SET_BOOKMARK_LB(start, ZB_MAXLEVEL, 0);
+	}
+
+	if (compare_bookmark(start, &zseg->seg_end, NULL, advance) > 0)
+		return (ERANGE);
+
+	return (EAGAIN);
+}
+
+/*
+ * Move the segment's start bookmark to the next <level, blkid> within
+ * dnode 'dnp' for the given traversal order.  'rc' is the result of
+ * visiting the current block; for ADVANCE_PRE a zero 'rc' on a
+ * non-leaf block means "descend into it", anything else means "skip to
+ * the next block at this level".
+ *
+ * Returns ERANGE when the new position is beyond dn_maxblkid or past
+ * seg_end (in the former case the bookmark is left unchanged), else
+ * EAGAIN.
+ */
+static int
+advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance)
+{
+	zbookmark_t *zb = &zseg->seg_start;
+	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int maxlevel = dnp->dn_nlevels - 1;
+	int level = zb->zb_level;
+	uint64_t blkid = zb->zb_blkid;
+
+	if (advance & ADVANCE_PRE) {
+		if (level > 0 && rc == 0) {
+			/* Descend to the first child of this block. */
+			level--;
+			blkid <<= wshift;
+		} else {
+			/* Next sibling at the same level. */
+			blkid++;
+
+			if ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+			    dnp->dn_maxblkid)
+				return (ERANGE);
+
+			/*
+			 * While blkid is the first entry of its parent,
+			 * climb to the parent.
+			 */
+			while (level < maxlevel) {
+				if (P2PHASE(blkid, 1ULL << wshift))
+					break;
+				blkid >>= wshift;
+				level++;
+			}
+		}
+	} else {
+		if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) {
+			/*
+			 * At the top level, or not the last entry of the
+			 * parent: restart at level 0 just past this block.
+			 */
+			blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift);
+			level = 0;
+		} else {
+			/* Last entry of its parent: go up to the parent. */
+			blkid >>= wshift;
+			level++;
+		}
+
+		/* Past dn_maxblkid: climb until in range or out of levels. */
+		while ((blkid << BP_SPAN_SHIFT(level, wshift)) >
+		    dnp->dn_maxblkid) {
+			if (level == maxlevel)
+				return (ERANGE);
+			blkid >>= wshift;
+			level++;
+		}
+	}
+	SET_BOOKMARK_LB(zb, level, blkid);
+
+	if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0)
+		return (ERANGE);
+
+	return (EAGAIN);
+}
+
+/*
+ * Invoke the traversal's user callback for cached block 'bc' and return
+ * its result, counting the call in th_callbacks.
+ *
+ * A successfully read block (bc_errno == 0) born at or after seg_maxtxg
+ * is skipped: 0 is returned without calling the callback.  'zseg' is
+ * only examined when bc_errno is 0.
+ */
+static int
+traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc)
+{
+	if (bc->bc_errno == 0) {
+		zbookmark_t *cur = &bc->bc_bookmark;
+		zbookmark_t *segstart = &zseg->seg_start;
+		zbookmark_t *segend = &zseg->seg_end;
+		zbookmark_t *last = &th->th_lastcb;
+		dnode_phys_t *dnp = bc->bc_dnode;
+
+		/*
+		 * Prune against maxtxg only now, at the last possible
+		 * moment.
+		 *
+		 * Pruning against mintxg happens before we get here
+		 * because it is a big win: a block born in txg 37 means
+		 * its whole subtree was born in txg 37 or earlier, so
+		 * huge branches can be lopped off along the way.
+		 *
+		 * Nothing similar holds for maxtxg: knowing that
+		 * bp->blk_birth >= maxtxg says nothing about the bp's
+		 * children, and copy-on-write means top-level blocks are
+		 * nearly always new.  So, for simplicity, the maxtxg
+		 * check is made per block, right here.
+		 */
+		if (bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg)
+			return (0);
+
+		/*
+		 * Debugging: the visitation order must agree with the
+		 * order defined by compare_bookmark().
+		 */
+		ASSERT(compare_bookmark(cur, segend, dnp,
+		    th->th_advance) <= 0);
+		ASSERT(compare_bookmark(cur, segstart, dnp,
+		    th->th_advance) == 0);
+		ASSERT(compare_bookmark(last, cur, dnp, th->th_advance) < 0 ||
+		    last->zb_level == ZB_NO_LEVEL);
+		*last = *cur;
+	}
+
+	th->th_callbacks++;
+	return (th->th_func(bc, th->th_spa, th->th_arg));
+}
+
+/*
+ * Make cache slot 'bc' describe block pointer 'bp' of dnode 'dnp',
+ * reading the block into bc->bc_data when needed.
+ *
+ * Nothing is read when the slot already holds a block pointer that is
+ * BP_EQUAL() to 'bp', when the slot has no data buffer, or when 'bp' is
+ * a hole.  Otherwise the block is taken from the ARC with arc_tryread()
+ * or, failing that, read with zio_read() and byteswapped if the block
+ * pointer calls for it.  A bookmark equal to th->th_noread is never read
+ * and is treated as EIO.
+ *
+ * Returns 0 on success.  On a read error bc->bc_errno is set, the
+ * traversal callback is invoked, and the callback's result (asserted to
+ * be EAGAIN, EINTR or ERESTART) is returned.
+ */
+static int
+traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
+	dnode_phys_t *dnp)
+{
+	zbookmark_t *zb = &bc->bc_bookmark;
+	int error;
+
+	th->th_hits++;
+
+	bc->bc_dnode = dnp;
+	bc->bc_errno = 0;
+
+	/* Cache hit: the slot already describes this block. */
+	if (BP_EQUAL(&bc->bc_blkptr, bp))
+		return (0);
+
+	bc->bc_blkptr = *bp;
+
+	/* No buffer in this slot: only the block pointer is tracked. */
+	if (bc->bc_data == NULL)
+		return (0);
+
+	if (BP_IS_HOLE(bp)) {
+		ASSERT(th->th_advance & ADVANCE_HOLES);
+		return (0);
+	}
+
+	if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) {
+		error = EIO;
+	} else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) {
+		error = 0;
+		th->th_arc_hits++;
+	} else {
+		error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
+		    BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+		    th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
+
+		/*
+		 * Indirect blocks (level > 0) are swapped as arrays of
+		 * 64-bit words; level-0 blocks use the swap routine
+		 * registered for their object type.
+		 */
+		if (BP_SHOULD_BYTESWAP(bp) && error == 0)
+			(zb->zb_level > 0 ? byteswap_uint64_array :
+			    dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data,
+			    BP_GET_LSIZE(bp));
+		th->th_reads++;
+	}
+
+	if (error) {
+		bc->bc_errno = error;
+		error = traverse_callback(th, NULL, bc);
+		ASSERT(error == EAGAIN || error == EINTR || error == ERESTART);
+		/*
+		 * Poison the cached birth txg (presumably so a later
+		 * BP_EQUAL() cannot match and the read is retried).
+		 */
+		bc->bc_blkptr.blk_birth = -1ULL;
+	}
+
+	/*
+	 * The cache slot index is a pointer difference (ptrdiff_t); cast
+	 * it to int so the argument matches the %02x conversion.
+	 */
+	dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n",
+	    (int)(bc - &th->th_cache[0][0]), error,
+	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+	return (error);
+}
+
+/*
+ * Walk dnode 'dnp' from its top level down to the level named by the
+ * segment's start bookmark, reading each block on the path into cache
+ * row 'depth' of th_cache.
+ *
+ * Returns:
+ *	0	the block at the bookmark was reached, or a hole was found
+ *		on the way (the bookmark is then moved to the hole);
+ *	ERANGE	the bookmark's level or block id is outside the dnode;
+ *	ENOTBLK	block pointers on the path were skipped (not newer than
+ *		seg_mintxg, and not reportable holes); the bookmark is
+ *		moved to the last skipped entry;
+ *	other	an error from traverse_read(); unless it is EAGAIN the
+ *		bookmark is moved to the block that failed.
+ */
+static int
+find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth)
+{
+	zbookmark_t *zb = &zseg->seg_start;
+	traverse_blk_cache_t *bc;
+	blkptr_t *bp = dnp->dn_blkptr;
+	int i, first, level;
+	int nbp = dnp->dn_nblkptr;
+	int minlevel = zb->zb_level;
+	int maxlevel = dnp->dn_nlevels - 1;
+	int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT;
+	/* Shift that maps the target blkid to its ancestor at 'level'. */
+	int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift);
+	uint64_t blkid = zb->zb_blkid >> bp_shift;
+	/* Holes are only reported for the ZB_DN_CACHE row. */
+	int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE;
+	int rc;
+
+	if (minlevel > maxlevel || blkid >= nbp)
+		return (ERANGE);
+
+	for (level = maxlevel; level >= minlevel; level--) {
+		/* Index of blkid within the current array of block pointers. */
+		first = P2PHASE(blkid, 1ULL << wshift);
+
+		/* Find the first entry from 'first' on that must be visited. */
+		for (i = first; i < nbp; i++)
+			if (bp[i].blk_birth > zseg->seg_mintxg ||
+			    BP_IS_HOLE(&bp[i]) && do_holes)
+				break;
+
+		if (i != first) {
+			/* Entries were skipped; report the last skipped one. */
+			i--;
+			SET_BOOKMARK_LB(zb, level, blkid + (i - first));
+			return (ENOTBLK);
+		}
+
+		bc = &th->th_cache[depth][level];
+
+		SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object,
+		    level, blkid);
+
+		if (rc = traverse_read(th, bc, bp + i, dnp)) {
+			if (rc != EAGAIN) {
+				SET_BOOKMARK_LB(zb, level, blkid);
+			}
+			return (rc);
+		}
+
+		if (BP_IS_HOLE(&bp[i])) {
+			SET_BOOKMARK_LB(zb, level, blkid);
+			th->th_lastcb.zb_level = ZB_NO_LEVEL;
+			return (0);
+		}
+
+		/* Descend: the block just read is the next pointer array. */
+		nbp = 1 << wshift;
+		bp = bc->bc_data;
+		bp_shift -= wshift;
+		blkid = zb->zb_blkid >> bp_shift;
+	}
+
+	return (0);
+}
+
+/*
+ * Find the first allocated dnode at or after object number *objectp in
+ * the objset whose meta-dnode is 'mdn', using cache row 'depth'.
+ *
+ * Only meta-dnode blocks born after 'txg' are searched.  'type' limits
+ * the match to dnodes of that dn_type; -1 accepts any type other than
+ * DMU_OT_NONE.
+ *
+ * On success returns 0 with *objectp set to the object found and *dnpp
+ * pointing at its dnode_phys_t inside the cached block.  When the search
+ * runs off the end, ERANGE is returned and *objectp is set to
+ * ZB_MAXOBJECT.  EAGAIN and EINTR from find_block() are passed back.
+ */
+static int
+get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
+    uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth)
+{
+	zseg_t zseg;
+	zbookmark_t *zb = &zseg.seg_start;
+	uint64_t object = *objectp;
+	int i, rc;
+
+	/*
+	 * Private segment over the meta-dnode (object 0) at level 0,
+	 * starting at the block that holds 'object'.
+	 */
+	SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK);
+	SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID);
+
+	zseg.seg_mintxg = txg;
+	zseg.seg_maxtxg = -1ULL;
+
+	for (;;) {
+		rc = find_block(th, &zseg, mdn, depth);
+
+		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+			break;
+
+		if (rc == 0 && zb->zb_level == 0) {
+			/* Scan the dnodes held in this level-0 block. */
+			dnode_phys_t *dnp = th->th_cache[depth][0].bc_data;
+			for (i = 0; i < DNODES_PER_BLOCK; i++) {
+				object = (zb->zb_blkid * DNODES_PER_BLOCK) + i;
+				if (object >= *objectp &&
+				    dnp[i].dn_type != DMU_OT_NONE &&
+				    (type == -1 || dnp[i].dn_type == type)) {
+					*objectp = object;
+					*dnpp = &dnp[i];
+					return (0);
+				}
+			}
+		}
+
+		/* Nothing here; step to the next meta-dnode block. */
+		rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE);
+
+		if (rc == ERANGE)
+			break;
+	}
+
+	if (rc == ERANGE)
+		*objectp = ZB_MAXOBJECT;
+
+	return (rc);
+}
+
+static int
+traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
+{
+	zbookmark_t *zb = &zseg->seg_start;
+	traverse_blk_cache_t *bc;
+	dnode_phys_t *dn, *dn_tmp;
+	int worklimit = 1000;
+	int rc;
+
+	dprintf("<%llu, %llu, %d, %llx>\n",
+	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+	bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1];
+	dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+	SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0);
+
+	rc = traverse_read(th, bc, mosbp, dn);
+
+	if (rc)		/* If we get ERESTART, we've got nowhere left to go */
+		return (rc == ERESTART ? EINTR : rc);
+
+	ASSERT(dn->dn_nlevels < ZB_MAXLEVEL);
+
+	if (zb->zb_objset != 0) {
+		uint64_t objset = zb->zb_objset;
+		dsl_dataset_phys_t *dsp;
+
+		rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0,
+		    DMU_OT_DSL_OBJSET, ZB_MOS_CACHE);
+
+		if (objset != zb->zb_objset)
+			rc = advance_objset(zseg, objset, th->th_advance);
+
+		if (rc != 0)
+			return (rc);
+
+		dsp = DN_BONUS(dn_tmp);
+
+		bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1];
+		dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode;
+
+		SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0);
+
+		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+
+		if (rc != 0) {
+			if (rc == ERESTART)
+				rc = advance_objset(zseg, zb->zb_objset + 1,
+				    th->th_advance);
+			return (rc);
+		}
+
+		if (th->th_advance & ADVANCE_PRUNE)
+			zseg->seg_mintxg =
+			    MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg);
+	}
+
+	if (zb->zb_level == -1) {
+		ASSERT(zb->zb_object == 0);
+
+		if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
+			rc = traverse_callback(th, zseg, bc);
+			if (rc) {
+				ASSERT(rc == EINTR);
+				return (rc);
+			}
+		}
+
+		return (advance_from_osphys(zseg, th->th_advance));
+	}
+
+	if (zb->zb_object != 0) {
+		uint64_t object = zb->zb_object;
+
+		rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp,
+		    zseg->seg_mintxg, -1, ZB_MDN_CACHE);
+
+		if (object != zb->zb_object)
+			rc = advance_object(zseg, object, th->th_advance);
+
+		if (rc != 0)
+			return (rc);
+
+		dn = dn_tmp;
+	}
+
+	if (zb->zb_level == ZB_MAXLEVEL)
+		zb->zb_level = dn->dn_nlevels - 1;
+
+	for (;;) {
+		rc = find_block(th, zseg, dn, ZB_DN_CACHE);
+
+		if (rc == EAGAIN || rc == EINTR || rc == ERANGE)
+			break;
+
+		if (rc == 0) {
+			bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level];
+			ASSERT(bc->bc_dnode == dn);
+			ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth);
+			rc = traverse_callback(th, zseg, bc);
+			if (rc) {
+				ASSERT(rc == EINTR);
+				return (rc);
+			}
+			if (BP_IS_HOLE(&bc->bc_blkptr)) {
+				ASSERT(th->th_advance & ADVANCE_HOLES);
+				rc = ENOTBLK;
+			}
+		}
+
+		rc = advance_block(zseg, dn, rc, th->th_advance);
+
+		if (rc == ERANGE)
+			break;
+
+		/*
+		 * Give spa_sync() a chance to run.
+		 */
+		if (spa_traverse_wanted(th->th_spa)) {
+			th->th_syncs++;
+			return (EAGAIN);
+		}
+
+		if (--worklimit == 0)
+			return (EAGAIN);
+	}
+
+	if (rc == ERANGE)
+		rc = advance_object(zseg, zb->zb_object + 1, th->th_advance);
+
+	return (rc);
+}
+
+/*
+ * It is the caller's responsibility to ensure that the dsl_dataset_t
+ * doesn't go away during traversal.
+ */
+int
+traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance,
+    blkptr_cb_t func, void *arg)
+{
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+	traverse_handle_t *th;
+	int err;
+
+	th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED);
+
+	traverse_add_objset(th, txg_start, -1ULL, ds->ds_object);
+
+	while ((err = traverse_more(th)) == EAGAIN)
+		continue;
+
+	traverse_fini(th);
+	return (err);
+}
+
+int
+traverse_more(traverse_handle_t *th)
+{
+	zseg_t *zseg = list_head(&th->th_seglist);
+	uint64_t save_txg;	/* XXX won't be necessary with real itinerary */
+	krwlock_t *rw = spa_traverse_rwlock(th->th_spa);
+	blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa);
+	int rc;
+
+	if (zseg == NULL)
+		return (0);
+
+	th->th_restarts++;
+
+	save_txg = zseg->seg_mintxg;
+
+	if (!(th->th_advance & ADVANCE_NOLOCK))
+		rw_enter(rw, RW_READER);
+
+	rc = traverse_segment(th, zseg, mosbp);
+	ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR);
+
+	if (!(th->th_advance & ADVANCE_NOLOCK))
+		rw_exit(rw);
+
+	zseg->seg_mintxg = save_txg;
+
+	if (rc == ERANGE) {
+		list_remove(&th->th_seglist, zseg);
+		kmem_free(zseg, sizeof (*zseg));
+		return (EAGAIN);
+	}
+
+	return (rc);
+}
+
+/*
+ * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves
+ * are not included.  The blocks covered by this segment will all have
+ * mintxg < birth < maxtxg.
+ */
+static void
+traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid,
+    uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid)
+{
+	zseg_t *zseg;
+
+	zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP);
+
+	zseg->seg_mintxg = mintxg;
+	zseg->seg_maxtxg = maxtxg;
+
+	zseg->seg_start.zb_objset = sobjset;
+	zseg->seg_start.zb_object = sobject;
+	zseg->seg_start.zb_level = slevel;
+	zseg->seg_start.zb_blkid = sblkid;
+
+	zseg->seg_end.zb_objset = eobjset;
+	zseg->seg_end.zb_object = eobject;
+	zseg->seg_end.zb_level = elevel;
+	zseg->seg_end.zb_blkid = eblkid;
+
+	list_insert_tail(&th->th_seglist, zseg);
+}
+
+void
+traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t objset, uint64_t object)
+{
+	if (th->th_advance & ADVANCE_PRE)
+		traverse_add_segment(th, mintxg, maxtxg,
+		    objset, object, ZB_MAXLEVEL, 0,
+		    objset, object, 0, ZB_MAXBLKID);
+	else
+		traverse_add_segment(th, mintxg, maxtxg,
+		    objset, object, 0, 0,
+		    objset, object, 0, ZB_MAXBLKID);
+}
+
+void
+traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t objset)
+{
+	if (th->th_advance & ADVANCE_PRE)
+		traverse_add_segment(th, mintxg, maxtxg,
+		    objset, 0, -1, 0,
+		    objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+	else
+		traverse_add_segment(th, mintxg, maxtxg,
+		    objset, 1, 0, 0,
+		    objset, 0, -1, 0);
+}
+
+void
+traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg)
+{
+	if (th->th_advance & ADVANCE_PRE)
+		traverse_add_segment(th, mintxg, maxtxg,
+		    0, 0, -1, 0,
+		    ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID);
+	else
+		traverse_add_segment(th, mintxg, maxtxg,
+		    1, 1, 0, 0,
+		    0, 0, -1, 0);
+}
+
+traverse_handle_t *
+traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
+    int zio_flags)
+{
+	traverse_handle_t *th;
+	int d, l;
+
+	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
+
+	th->th_spa = spa;
+	th->th_func = func;
+	th->th_arg = arg;
+	th->th_advance = advance;
+	th->th_lastcb.zb_level = ZB_NO_LEVEL;
+	th->th_noread.zb_level = ZB_NO_LEVEL;
+	th->th_zio_flags = zio_flags;
+
+	list_create(&th->th_seglist, sizeof (zseg_t),
+	    offsetof(zseg_t, seg_node));
+
+	for (d = 0; d < ZB_DEPTH; d++) {
+		for (l = 0; l < ZB_MAXLEVEL; l++) {
+			if ((advance & ADVANCE_DATA) ||
+			    l != 0 || d != ZB_DN_CACHE)
+				th->th_cache[d][l].bc_data =
+				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
+		}
+	}
+
+	return (th);
+}
+
+void
+traverse_fini(traverse_handle_t *th)
+{
+	int d, l;
+	zseg_t *zseg;
+
+	for (d = 0; d < ZB_DEPTH; d++)
+		for (l = 0; l < ZB_MAXLEVEL; l++)
+			if (th->th_cache[d][l].bc_data != NULL)
+				zio_buf_free(th->th_cache[d][l].bc_data,
+				    SPA_MAXBLOCKSIZE);
+
+	while ((zseg = list_head(&th->th_seglist)) != NULL) {
+		list_remove(&th->th_seglist, zseg);
+		kmem_free(zseg, sizeof (*zseg));
+	}
+
+	list_destroy(&th->th_seglist);
+
+	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
+	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
+	    th->th_syncs, th->th_restarts);
+
+	kmem_free(th, sizeof (*th));
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
new file mode 100644
index 000000000000..5dd827e946a1
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -0,0 +1,801 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h>	/* for ZAP_BLOCK_SHIFT */
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef ZFS_DEBUG
+int dmu_use_tx_debug_bufs = 1;
+#endif
+
+dmu_tx_t *
+dmu_tx_create_ds(dsl_dir_t *dd)
+{
+	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+	tx->tx_dir = dd;
+	if (dd)
+		tx->tx_pool = dd->dd_pool;
+	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+	    offsetof(dmu_tx_hold_t, dth_node));
+	refcount_create(&tx->tx_space_written);
+	refcount_create(&tx->tx_space_freed);
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+	dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
+	tx->tx_objset = os;
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+	dmu_tx_t *tx = dmu_tx_create_ds(NULL);
+
+	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+	tx->tx_pool = dp;
+	tx->tx_txg = txg;
+	tx->tx_anyobj = TRUE;
+
+	return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj || tx->tx_privateobj);
+}
+
+static void
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+    enum dmu_tx_hold_type type, dmu_tx_hold_func_t func,
+    uint64_t arg1, uint64_t arg2)
+{
+	dmu_tx_hold_t *dth;
+	dnode_t *dn = NULL;
+
+	if (object != DMU_NEW_OBJECT) {
+		dn = dnode_hold(os->os, object, tx);
+
+		if (tx->tx_txg != 0) {
+			mutex_enter(&dn->dn_mtx);
+			/*
+			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+			 * problem, but there's no way for it to happen (for
+			 * now, at least).
+			 */
+			ASSERT(dn->dn_assigned_txg == 0);
+			ASSERT(dn->dn_assigned_tx == NULL);
+			dn->dn_assigned_txg = tx->tx_txg;
+			dn->dn_assigned_tx = tx;
+			(void) refcount_add(&dn->dn_tx_holds, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
+	}
+
+	dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+	dth->dth_dnode = dn;
+	dth->dth_type = type;
+	dth->dth_func = func;
+	dth->dth_arg1 = arg1;
+	dth->dth_arg2 = arg2;
+	/*
+	 * XXX Investigate using a different data structure to keep
+	 * track of dnodes in a tx.  Maybe array, since there will
+	 * generally not be many entries?
+	 */
+	list_insert_tail(&tx->tx_holds, dth);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+	/*
+	 * If we're syncing, they can manipulate any object anyhow, and
+	 * the hold on the dnode_t can cause problems.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT,
+		    NULL, 0, 0);
+	}
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+	uint64_t start, end, space;
+	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+
+	if (len == 0)
+		return;
+
+	min_bs = SPA_MINBLOCKSHIFT;
+	max_bs = SPA_MAXBLOCKSHIFT;
+	min_ibs = DN_MIN_INDBLKSHIFT;
+	max_ibs = DN_MAX_INDBLKSHIFT;
+
+	/*
+	 * If there's more than one block, the blocksize can't change,
+	 * so we can make a more precise estimate.  Alternatively,
+	 * if the dnode's ibs is larger than max_ibs, always use that.
+	 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+	 * the code will still work correctly on existing pools.
+	 */
+	if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
+		min_ibs = max_ibs = dn->dn_indblkshift;
+		if (dn->dn_datablkshift != 0)
+			min_bs = max_bs = dn->dn_datablkshift;
+	}
+
+	/*
+	 * 'end' is the last thing we will access, not one past.
+	 * This way we won't overflow when accessing the last byte.
+	 */
+	start = P2ALIGN(off, 1ULL << max_bs);
+	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+	space = end - start + 1;
+
+	start >>= min_bs;
+	end >>= min_bs;
+
+	epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+	/*
+	 * The object contains at most 2^(64 - min_bs) blocks,
+	 * and each indirect level maps 2^epbs.
+	 */
+	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+		start >>= epbs;
+		end >>= epbs;
+		/*
+		 * If we increase the number of levels of indirection,
+		 * we'll need new blkid=0 indirect blocks.  If start == 0,
+		 * we're already accounting for those blocks; and if end == 0,
+		 * we can't increase the number of levels beyond that.
+		 */
+		if (start != 0 && end != 0)
+			space += 1ULL << max_ibs;
+		space += (end - start + 1) << max_ibs;
+	}
+
+	ASSERT(space < 2 * DMU_MAX_ACCESS);
+
+	tx->tx_space_towrite += space;
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+	dnode_t *mdn = tx->tx_objset->os->os_meta_dnode;
+	uint64_t object = dn ? dn->dn_object : DN_MAX_OBJECT - 1;
+	uint64_t pre_write_space;
+
+	ASSERT(object < DN_MAX_OBJECT);
+	pre_write_space = tx->tx_space_towrite;
+	dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
+	if (dn && dn->dn_dbuf->db_blkptr &&
+	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+	    dn->dn_dbuf->db_blkptr->blk_birth, tx)) {
+		tx->tx_space_tooverwrite +=
+			tx->tx_space_towrite - pre_write_space;
+		tx->tx_space_towrite = pre_write_space;
+	}
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_write_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+	dmu_tx_count_write(tx, dn, off, len);
+	dmu_tx_count_dnode(tx, dn);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(len > 0 && len < DMU_MAX_ACCESS);
+	ASSERT(UINT64_MAX - off >= len - 1);
+
+	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
+	    dmu_tx_hold_write_impl, off, len);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+	uint64_t blkid, nblks;
+	uint64_t space = 0;
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+	ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL);
+
+	if (dn->dn_datablkshift == 0)
+		return;
+	/*
+	 * The dnode can't change (it isn't dirty), but dbuf_hold_impl()
+	 * wants the struct_rwlock held; it also protects dn_maxblkid.
+	 * nblks is a count of blocks from blkid, not an end block id.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	blkid = off >> dn->dn_datablkshift;
+	nblks = ((off + len) >> dn->dn_datablkshift) - blkid;
+
+	if (blkid >= dn->dn_maxblkid)
+		goto out;
+	if (blkid + nblks > dn->dn_maxblkid)
+		nblks = dn->dn_maxblkid - blkid;
+
+	/* don't bother after 128K blocks */
+	nblks = MIN(nblks, 128*1024);
+
+	if (dn->dn_phys->dn_nlevels == 1) {
+		int i;
+		for (i = 0; i < nblks; i++) {
+			blkptr_t *bp = dn->dn_phys->dn_blkptr;
+			ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
+			bp += blkid + i;
+			if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) {
+				dprintf_bp(bp, "can free old%s", "");
+				space += BP_GET_ASIZE(bp);
+			}
+		}
+		goto out;
+	}
+
+	while (nblks) {
+		dmu_buf_impl_t *dbuf;
+		int err, epbs, blkoff, tochk;
+
+		epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+		blkoff = P2PHASE(blkid, 1<<epbs);
+		tochk = MIN((1<<epbs) - blkoff, nblks);
+
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs, TRUE, FTAG, &dbuf);
+		if (err == 0) {
+			int i;
+			blkptr_t *bp;
+
+			dbuf_read_havestruct(dbuf);
+
+			bp = dbuf->db.db_data;
+			bp += blkoff;
+
+			for (i = 0; i < tochk; i++) {
+				if (dsl_dataset_block_freeable(ds,
+				    bp[i].blk_birth, tx)) {
+					dprintf_bp(&bp[i],
+					    "can free old%s", "");
+					space += BP_GET_ASIZE(&bp[i]);
+				}
+			}
+			dbuf_remove_ref(dbuf, FTAG);
+		} else {
+			/* the indirect block is sparse */
+			ASSERT(err == ENOENT);
+		}
+
+		blkid += tochk;
+		nblks -= tochk;
+	}
+out:
+	rw_exit(&dn->dn_struct_rwlock);
+
+	tx->tx_space_tofree += space;
+}
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+	int dirty;
+
+	/* first block */
+	if (off != 0 /* || dn->dn_maxblkid == 0 */)
+		dmu_tx_count_write(tx, dn, off, 1);
+	/* last block */
+	if (len != DMU_OBJECT_END)
+		dmu_tx_count_write(tx, dn, off+len, 1);
+
+	dmu_tx_count_dnode(tx, dn);
+
+	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+		return;
+	if (len == DMU_OBJECT_END)
+		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+	/* XXX locking */
+	dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
+	    dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
+	if (dn->dn_assigned_tx != NULL && !dirty)
+		dmu_tx_count_free(tx, dn, off, len);
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE,
+	    dmu_tx_hold_free_impl, off, len);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
+{
+	uint64_t nblocks;
+	int epbs;
+
+	dmu_tx_count_dnode(tx, dn);
+
+	if (dn == NULL) {
+		/*
+		 * Assuming that nops+cops is not super huge, we will be
+		 * able to fit a new object's entries into one leaf
+		 * block.  So there will be at most 2 blocks total,
+		 * including the header block.
+		 */
+		dmu_tx_count_write(tx, dn, 0, 2 << ZAP_BLOCK_SHIFT);
+		return;
+	}
+
+	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+
+	if (dn->dn_maxblkid == 0 && nops == 0) {
+		/*
+		 * If there is only one block  (i.e. this is a micro-zap)
+		 * and we are only doing updates, the accounting is simple.
+		 */
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    dn->dn_phys->dn_blkptr[0].blk_birth, tx))
+			tx->tx_space_tooverwrite += dn->dn_datablksz;
+		else
+			tx->tx_space_towrite += dn->dn_datablksz;
+		return;
+	}
+
+	/*
+	 * 3 blocks overwritten per op: target leaf, ptrtbl block, header block
+	 * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks
+	 */
+	dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
+	    (nops * 6ULL + cops * 3ULL) << ZAP_BLOCK_SHIFT);
+
+	/*
+	 * If the modified blocks are scattered to the four winds,
+	 * we'll have to modify an indirect twig for each.
+	 */
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+		tx->tx_space_towrite +=
+		    ((nops + cops) * 3ULL) << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops)
+{
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
+	    dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0));
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS,
+	    dmu_tx_hold_write_impl, 0, 0);
+}
+
+
+/* ARGSUSED */
+static void
+dmu_tx_hold_space_impl(dmu_tx_t *tx, dnode_t *dn,
+    uint64_t space, uint64_t unused)
+{
+	tx->tx_space_towrite += space;
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE,
+	    dmu_tx_hold_space_impl, space, 0);
+}
+
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *dth;
+	int holds = 0;
+
+	/*
+	 * By asserting that the tx is assigned, we're counting the
+	 * number of dn_tx_holds, which is the same as the number of
+	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
+	 * dn_tx_holds could be 0.
+	 */
+	ASSERT(tx->tx_txg != 0);
+
+	/* if (tx->tx_anyobj == TRUE) */
+		/* return (0); */
+
+	for (dth = list_head(&tx->tx_holds); dth;
+	    dth = list_next(&tx->tx_holds, dth)) {
+		if (dth->dth_dnode && dth->dth_dnode->dn_object == object)
+			holds++;
+	}
+
+	return (holds);
+}
+
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+#ifdef ZFS_DEBUG
+	dmu_tx_hold_t *dth;
+	int match_object = FALSE, match_offset = FALSE;
+	dnode_t *dn = db->db_dnode;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+	ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+	if (tx->tx_anyobj)
+		return;
+
+	/* XXX No checking on the meta dnode for now */
+	if (db->db.db_object & DMU_PRIVATE_OBJECT)
+		return;
+
+	for (dth = list_head(&tx->tx_holds); dth;
+	    dth = list_next(&tx->tx_holds, dth)) {
+		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
+		if (dth->dth_dnode == dn && dth->dth_type != THT_NEWOBJECT)
+			match_object = TRUE;
+		if (dth->dth_dnode == NULL || dth->dth_dnode == dn) {
+			int datablkshift = dn->dn_datablkshift ?
+			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+			int shift = datablkshift + epbs * db->db_level;
+			uint64_t beginblk = shift >= 64 ? 0 :
+			    (dth->dth_arg1 >> shift);
+			uint64_t endblk = shift >= 64 ? 0 :
+			    ((dth->dth_arg1 + dth->dth_arg2 - 1) >> shift);
+			uint64_t blkid = db->db_blkid;
+
+			/* XXX dth_arg2 better not be zero... */
+
+			dprintf("found dth type %x beginblk=%llx endblk=%llx\n",
+			    dth->dth_type, beginblk, endblk);
+
+			switch (dth->dth_type) {
+			case THT_WRITE:
+				if (blkid >= beginblk && blkid <= endblk)
+					match_offset = TRUE;
+				/*
+				 * We will let this hold work for the bonus
+				 * buffer so that we don't need to hold it
+				 * when creating a new object.
+				 */
+				if (blkid == DB_BONUS_BLKID)
+					match_offset = TRUE;
+				/*
+				 * They might have to increase nlevels,
+				 * thus dirtying the new TLIBs.  Or they
+				 * might have to change the block size,
+				 * thus dirtying the new lvl=0 blk=0.
+				 */
+				if (blkid == 0)
+					match_offset = TRUE;
+				break;
+			case THT_FREE:
+				if (blkid == beginblk &&
+				    (dth->dth_arg1 != 0 ||
+				    dn->dn_maxblkid == 0))
+					match_offset = TRUE;
+				if (blkid == endblk &&
+				    dth->dth_arg2 != DMU_OBJECT_END)
+					match_offset = TRUE;
+				break;
+			case THT_BONUS:
+				if (blkid == DB_BONUS_BLKID)
+					match_offset = TRUE;
+				break;
+			case THT_ZAP:
+				match_offset = TRUE;
+				break;
+			case THT_NEWOBJECT:
+				match_object = TRUE;
+				break;
+			default:
+				ASSERT(!"bad dth_type");
+			}
+		}
+		if (match_object && match_offset)
+			return;
+	}
+	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+	    (u_longlong_t)db->db.db_object, db->db_level,
+	    (u_longlong_t)db->db_blkid);
+#endif
+}
+
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
+{
+	dmu_tx_hold_t *dth;
+	uint64_t lsize, asize, fsize;
+
+	*last_dth = NULL;
+
+	tx->tx_space_towrite = 0;
+	tx->tx_space_tofree = 0;
+	tx->tx_space_tooverwrite = 0;
+	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+
+	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
+		return (ERESTART);
+
+	for (dth = list_head(&tx->tx_holds); dth;
+	    *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) {
+		dnode_t *dn = dth->dth_dnode;
+		if (dn != NULL) {
+			mutex_enter(&dn->dn_mtx);
+			while (dn->dn_assigned_txg == tx->tx_txg - 1) {
+				if (txg_how != TXG_WAIT) {
+					mutex_exit(&dn->dn_mtx);
+					return (ERESTART);
+				}
+				cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+			}
+			if (dn->dn_assigned_txg == 0) {
+				ASSERT(dn->dn_assigned_tx == NULL);
+				dn->dn_assigned_txg = tx->tx_txg;
+				dn->dn_assigned_tx = tx;
+			} else {
+				ASSERT(dn->dn_assigned_txg == tx->tx_txg);
+				if (dn->dn_assigned_tx != tx)
+					dn->dn_assigned_tx = NULL;
+			}
+			(void) refcount_add(&dn->dn_tx_holds, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
+		if (dth->dth_func)
+			dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2);
+	}
+
+	/*
+	 * Convert logical size to worst-case allocated size.
+	 */
+	fsize = spa_get_asize(tx->tx_pool->dp_spa, tx->tx_space_tooverwrite) +
+	    tx->tx_space_tofree;
+	lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
+	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+	tx->tx_space_towrite = asize;
+
+	if (tx->tx_dir && asize != 0) {
+		int err = dsl_dir_tempreserve_space(tx->tx_dir,
+		    lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+		if (err)
+			return (err);
+	}
+
+	return (0);
+}
+
+static uint64_t
+dmu_tx_unassign(dmu_tx_t *tx, dmu_tx_hold_t *last_dth)
+{
+	uint64_t txg = tx->tx_txg;
+	dmu_tx_hold_t *dth;
+
+	ASSERT(txg != 0);
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	for (dth = last_dth; dth; dth = list_prev(&tx->tx_holds, dth)) {
+		dnode_t *dn = dth->dth_dnode;
+
+		if (dn == NULL)
+			continue;
+		mutex_enter(&dn->dn_mtx);
+		ASSERT3U(dn->dn_assigned_txg, ==, txg);
+
+		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+			dn->dn_assigned_txg = 0;
+			dn->dn_assigned_tx = NULL;
+			cv_broadcast(&dn->dn_notxholds);
+		}
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	txg_rele_to_sync(&tx->tx_txgh);
+
+	tx->tx_txg = 0;
+	return (txg);
+}
+
+/*
+ * Assign tx to a transaction group.  txg_how can be one of:
+ *
+ * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
+ *	a new one.  This should be used when you're not holding locks.
+ *	It will only fail if we're truly out of space (or over quota).
+ *
+ * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
+ *	blocking, returns immediately with ERESTART.  This should be used
+ *	whenever you're holding locks.  On an ERESTART error, the caller
+ *	should drop locks, do a txg_wait_open(dp, 0), and try again.
+ *
+ * (3)	A specific txg.  Use this if you need to ensure that multiple
+ *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
+ *	returns ERESTART if it can't assign you into the requested txg.
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+	dmu_tx_hold_t *last_dth;
+	int err;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(txg_how != 0);
+	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+	ASSERT3U(tx->tx_space_towrite, ==, 0);
+	ASSERT3U(tx->tx_space_tofree, ==, 0);
+
+	while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
+		uint64_t txg = dmu_tx_unassign(tx, last_dth);
+
+		if (err != ERESTART || txg_how != TXG_WAIT)
+			return (err);
+
+		txg_wait_open(tx->tx_pool, txg + 1);
+	}
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	return (0);
+}
+
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+	if (tx->tx_dir == NULL || delta == 0)
+		return;
+
+	if (delta > 0) {
+		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+		    tx->tx_space_towrite);
+		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+	} else {
+		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+	}
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *dth;
+
+	ASSERT(tx->tx_txg != 0);
+
+	while ((dth = list_head(&tx->tx_holds)) != NULL) {
+		dnode_t *dn = dth->dth_dnode;
+
+		list_remove(&tx->tx_holds, dth);
+		kmem_free(dth, sizeof (dmu_tx_hold_t));
+		if (dn == NULL)
+			continue;
+		mutex_enter(&dn->dn_mtx);
+		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+			dn->dn_assigned_txg = 0;
+			dn->dn_assigned_tx = NULL;
+			cv_broadcast(&dn->dn_notxholds);
+		}
+		mutex_exit(&dn->dn_mtx);
+		dnode_rele(dn, tx);
+	}
+
+	if (tx->tx_dir && tx->tx_space_towrite > 0) {
+		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+	}
+
+	if (tx->tx_anyobj == FALSE)
+		txg_rele_to_sync(&tx->tx_txgh);
+	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+	if (tx->tx_debug_buf)
+		kmem_free(tx->tx_debug_buf, 4096);
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *dth;
+
+	ASSERT(tx->tx_txg == 0);
+
+	while ((dth = list_head(&tx->tx_holds)) != NULL) {
+		dnode_t *dn = dth->dth_dnode;
+
+		list_remove(&tx->tx_holds, dth);
+		kmem_free(dth, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#ifdef ZFS_DEBUG
+	if (tx->tx_debug_buf)
+		kmem_free(tx->tx_debug_buf, 4096);
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+	return (tx->tx_txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..cfaeaf067492
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -0,0 +1,603 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+/* max # of streams per zfetch */
+uint32_t	zfetch_max_streams = 8;
+/* min idle time, in seconds, before a stream may be reclaimed */
+uint32_t	zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+uint32_t	zfetch_block_cap = 32;
+/* number of bytes in an array_read at which we stop prefetching (1MB) */
+uint64_t	zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward declarations for the file-local (static) routines */
+static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static int		dmu_zfetch_find(zfetch_t *, zstream_t *);
+static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
+static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static void		dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
+static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are co-linear with a pair of existing prefetch streams.
+ * If such a pair is found, coalesce the streams, removing one, configure the
+ * survivor to look for a strided access pattern, prefetch, and return 1.
+ *
+ * If no co-linear streams are found (or zh is NULL), return 0.
+ */
+static int
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+	zstream_t	*z_walk;
+	zstream_t	*z_comp;
+
+	rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+	if (zh == NULL) {
+		rw_exit(&zf->zf_rwlock);
+		return (0);
+	}
+
+	for (z_walk = list_head(&zf->zf_stream); z_walk;
+	    z_walk = list_next(&zf->zf_stream, z_walk)) {
+		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+		    z_comp = list_next(&zf->zf_stream, z_comp)) {
+			int64_t		diff;
+			/* only streams with stride == len are candidates */
+			if (z_walk->zst_len != z_walk->zst_stride ||
+			    z_comp->zst_len != z_comp->zst_stride) {
+				continue;
+			}
+			/* is zh the next point after z_walk, z_comp? */
+			diff = z_comp->zst_offset - z_walk->zst_offset;
+			if (z_comp->zst_offset + diff == zh->zst_offset) {
+				z_walk->zst_offset = zh->zst_offset;
+				z_walk->zst_direction = diff < 0 ? -1 : 1;
+				z_walk->zst_stride =
+				    diff * z_walk->zst_direction;
+				z_walk->zst_ph_offset =
+				    zh->zst_offset + z_walk->zst_stride;
+				dmu_zfetch_stream_remove(zf, z_comp);
+				mutex_destroy(&z_comp->zst_lock);
+				kmem_free(z_comp, sizeof (zstream_t));
+
+				dmu_zfetch_dofetch(zf, z_walk);
+
+				rw_exit(&zf->zf_rwlock);
+				return (1);
+			}
+			/* or the next point after z_comp, z_walk? */
+			diff = z_walk->zst_offset - z_comp->zst_offset;
+			if (z_walk->zst_offset + diff == zh->zst_offset) {
+				z_walk->zst_offset = zh->zst_offset;
+				z_walk->zst_direction = diff < 0 ? -1 : 1;
+				z_walk->zst_stride =
+				    diff * z_walk->zst_direction;
+				z_walk->zst_ph_offset =
+				    zh->zst_offset + z_walk->zst_stride;
+				dmu_zfetch_stream_remove(zf, z_comp);
+				mutex_destroy(&z_comp->zst_lock);
+				kmem_free(z_comp, sizeof (zstream_t));
+
+				dmu_zfetch_dofetch(zf, z_walk);
+
+				rw_exit(&zf->zf_rwlock);
+				return (1);
+			}
+		}
+	}
+
+	rw_exit(&zf->zf_rwlock);
+	return (0);
+}
+
+/*
+ * Work out the bounds of the prefetch for stream zs, issue the prefetches,
+ * and record the new stride, cap, prefetch offset and timestamp in zs.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+	uint64_t	prefetch_tail;
+	uint64_t	prefetch_limit;
+	uint64_t	prefetch_ofst;
+	uint64_t	prefetch_len;
+	uint64_t	blocks_fetched;
+	/* stride >= len; cap doubles per call, up to zfetch_block_cap */
+	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+	/* resume from wherever the previous prefetch of this stream stopped */
+	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+	    (int64_t)(zs->zst_offset + zs->zst_stride));
+	/*
+	 * XXX: use a faster division method?
+	 */
+	prefetch_limit = zs->zst_offset + zs->zst_len +
+	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+	while (prefetch_tail < prefetch_limit) {
+		prefetch_ofst = zs->zst_offset + zs->zst_direction *
+		    (prefetch_tail - zs->zst_offset);
+
+		prefetch_len = zs->zst_len;
+
+		/*
+		 * Don't prefetch beyond the end of the file, if working
+		 * backwards.
+		 */
+		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+		    (prefetch_ofst > prefetch_tail)) {
+			prefetch_len += prefetch_ofst;
+			prefetch_ofst = 0;
+		}
+
+		/* don't prefetch more than we're supposed to */
+		if (prefetch_len > zs->zst_len)
+			break;
+
+		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+		    prefetch_ofst, zs->zst_len);
+
+		prefetch_tail += zs->zst_stride;
+		/* stop if we've run out of stuff to prefetch */
+		if (blocks_fetched < zs->zst_len)
+			break;
+	}
+	zs->zst_ph_offset = prefetch_tail;
+	zs->zst_last = lbolt;
+}
+
+/*
+ * Prepare a zfetch structure for use with the given dnode: record the dnode,
+ * zero the stream and allocation-failure counters, and create the (empty)
+ * stream list together with its rwlock.  A NULL zf is silently ignored.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+	if (zf == NULL)
+		return;
+
+	/* lock and container first ... */
+	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+	list_create(&zf->zf_stream, sizeof (zstream_t),
+	    offsetof(zstream_t, zst_node));
+
+	/* ... then the plain bookkeeping fields */
+	zf->zf_alloc_fail = 0;
+	zf->zf_stream_cnt = 0;
+	zf->zf_dnode = dno;
+}
+
+/*
+ * Issue a dbuf_prefetch() for each of up to nblks consecutive blocks of dn
+ * starting at blkid; dmu_zfetch_fetchsz() decides how many are eligible.
+ * Returns the number of blocks for which a prefetch was issued.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t	count = dmu_zfetch_fetchsz(dn, blkid, nblks);
+	uint64_t	cur = blkid;
+	uint64_t	end = blkid + count;
+
+	/* walk the eligible block ids in ascending order */
+	while (cur != end)
+		dbuf_prefetch(dn, cur++);
+
+	return (count);
+}
+
+/*
+ * Return the number of blocks that would be prefetched for the supplied dnode,
+ * blockid, and nblks: nblks clamped so the range ends no later than
+ * dn_maxblkid, or 0 if blkid is already past it.  This is used so that we can
+ * update streams in place and prefetch with their old value after the fact:
+ * the prefetch is delayed, but later accesses don't prefetch the same data.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t	fetchsz;
+
+	if (blkid > dn->dn_maxblkid)
+		return (0);
+
+	/*
+	 * dn_maxblkid is the id of the last block, so blkid..dn_maxblkid holds
+	 * (dn_maxblkid - blkid) + 1 blocks.  Comparing by subtraction (never
+	 * blkid + nblks) also keeps huge requests from wrapping around.
+	 */
+	if (nblks != 0 && nblks - 1 > dn->dn_maxblkid - blkid) {
+		fetchsz = (dn->dn_maxblkid - blkid) + 1;
+	} else {
+		fetchsz = nblks;
+	}
+	return (fetchsz);
+}
+
+/*
+ * Given a zfetch and a search stream zh for this block read, look for a stream
+ * that already covers the read (return 1) or that the read extends, in which
+ * case start a prefetch on it and return 1.  Return 0 if none (or zh is NULL).
+ */
+static int
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh)
+{
+	zstream_t	*zs;
+	int64_t		diff;
+	int		rc = 0;
+
+	if (zh == NULL)
+		return (0);
+
+	/*
+	 * XXX: This locking strategy is a bit coarse; however, its impact has
+	 * yet to be tested.  If this turns out to be an issue, it can be
+	 * modified in a number of different ways.
+	 */
+
+	rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+	/* the scan restarts here when a stream changed before we locked it */
+	for (zs = list_head(&zf->zf_stream); zs;
+	    zs = list_next(&zf->zf_stream, zs)) {
+
+
+		if (zs->zst_len == 0) {
+			/* bogus stream */
+			continue;
+		}
+		/* NOTE(review): relies on unsigned wrap if zh is below zs */
+		if (zh->zst_offset - zs->zst_offset < zs->zst_len) {
+			/* already fetched */
+			rw_exit(&zf->zf_rwlock);
+			return (1);
+		}
+
+		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+			/* forward sequential access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_len += zh->zst_len;
+			diff = zs->zst_len - zfetch_block_cap;
+			if (diff > 0) {
+				zs->zst_offset += diff;
+				zs->zst_len = zs->zst_len > diff ?
+				    zs->zst_len - diff : 0;
+			}
+			zs->zst_direction = ZFETCH_FORWARD;
+
+			break;
+
+		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+			/* backwards sequential access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset = zs->zst_offset > zh->zst_len ?
+			    zs->zst_offset - zh->zst_len : 0;
+			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+			    zs->zst_ph_offset - zh->zst_len : 0;
+			zs->zst_len += zh->zst_len;
+
+			diff = zs->zst_len - zfetch_block_cap;
+			if (diff > 0) {
+				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+				    zs->zst_ph_offset - diff : 0;
+				zs->zst_len = zs->zst_len > diff ?
+				    zs->zst_len - diff : zs->zst_len;
+			}
+			zs->zst_direction = ZFETCH_BACKWARD;
+
+			break;
+
+		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+			/* strided forward access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset += zs->zst_stride;
+			zs->zst_direction = ZFETCH_FORWARD;
+
+			break;
+
+		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+			/* strided reverse access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+			    zs->zst_offset - zs->zst_stride : 0;
+			zs->zst_ph_offset = (zs->zst_ph_offset >
+			    (2 * zs->zst_stride)) ?
+			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+			zs->zst_direction = ZFETCH_BACKWARD;
+
+			break;
+		}
+	}
+	/* a matching stream was found, and its zst_lock is still held */
+	if (zs) {
+		rc = 1;
+		dmu_zfetch_dofetch(zf, zs);
+		mutex_exit(&zs->zst_lock);
+	}
+
+	rw_exit(&zf->zf_rwlock);
+	return (rc);
+}
+
+/*
+ * Clean-up state associated with a zfetch structure.  This frees every
+ * zstream on the zf_stream list, destroys the list and the rwlock, and clears
+ * zf_dnode.  The zfetch_t itself is not freed; that's left to the caller.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+	zstream_t	*stream;
+
+	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+	/* drain the list from the head until nothing is left */
+	while ((stream = list_head(&zf->zf_stream)) != NULL) {
+		list_remove(&zf->zf_stream, stream);
+		mutex_destroy(&stream->zst_lock);
+		kmem_free(stream, sizeof (zstream_t));
+	}
+
+	/* the containers themselves go last */
+	list_destroy(&zf->zf_stream);
+	rw_destroy(&zf->zf_rwlock);
+
+	zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure at the
+ * head of the stream list contained within the zfetch structure and perform
+ * the appropriate book-keeping.  It is possible that another thread has
+ * inserted a stream which matches the one we are about to insert, so we must
+ * check for this case.  If a match is found, return 0 and let the caller
+ * clean up the duplicate; else return 1.  Caller holds zf_rwlock (writer).
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+	zstream_t	*cur;
+
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+	/* refuse to add a stream identical to one that is already listed */
+	cur = list_head(&zf->zf_stream);
+	while (cur != NULL) {
+		if (dmu_zfetch_streams_equal(cur, zs))
+			return (0);
+		cur = list_next(&zf->zf_stream, cur);
+	}
+
+	/* no duplicate: account for the stream and link it in */
+	zf->zf_stream_cnt++;
+	list_insert_head(&zf->zf_stream, zs);
+
+	return (1);
+}
+
+
+/*
+ * Find a zstream in the given zfetch that has been idle for more than
+ * zfetch_min_sec_reap; unlink and zero it for reuse, or return NULL if none.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+	zstream_t	*victim;
+
+	rw_enter(&zf->zf_rwlock, RW_WRITER);
+
+	/* the first stream found to be old enough is the one we take */
+	victim = list_head(&zf->zf_stream);
+	while (victim != NULL &&
+	    ((lbolt - victim->zst_last) / hz) <= zfetch_min_sec_reap) {
+		victim = list_next(&zf->zf_stream, victim);
+	}
+
+	if (victim == NULL) {
+		zf->zf_alloc_fail++;
+	} else {
+		dmu_zfetch_stream_remove(zf, victim);
+		mutex_destroy(&victim->zst_lock);
+		bzero(victim, sizeof (zstream_t));
+	}
+	rw_exit(&zf->zf_rwlock);
+
+	return (victim);
+}
+
+/*
+ * Unlink zs from zf's stream list and decrement zf_stream_cnt.  The caller
+ * must hold zf_rwlock as writer; zs itself is neither destroyed nor freed.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+	list_remove(&zf->zf_stream, zs);
+	zf->zf_stream_cnt--;
+}
+
+/*
+ * Return 1 if the two streams carry identical offset, length, stride,
+ * prefetch offset, cap and direction, and 0 otherwise.
+ * Other members (zst_last, zst_lock, the list linkage) do not take part
+ * in the comparison.
+ */
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+	/* where the stream currently is and how it advances */
+	int	same_window =
+	    zs1->zst_offset == zs2->zst_offset &&
+	    zs1->zst_len == zs2->zst_len &&
+	    zs1->zst_stride == zs2->zst_stride &&
+	    zs1->zst_direction == zs2->zst_direction;
+
+	/* how far it has prefetched and how much it may fetch */
+	int	same_prefetch =
+	    zs1->zst_ph_offset == zs2->zst_ph_offset &&
+	    zs1->zst_cap == zs2->zst_cap;
+
+	return (same_window && same_prefetch);
+}
+
+/*
+ * This is the prefetch entry point, called for a read of size bytes at offset.
+ * It calls the other dmu_zfetch routines to find, reuse, or create streams.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size)
+{
+	zstream_t	zst;
+	zstream_t	*newstream;
+	int		fetched;
+	int		inserted;
+	unsigned int	blkshft;
+	uint64_t	blksz;
+
+	/* files that aren't ln2 blocksz are only one block -- nothing to do */
+	if (!zf->zf_dnode->dn_datablkshift) {
+		return;
+	}
+
+	/* convert offset and size, into blockid and nblocks */
+	blkshft = zf->zf_dnode->dn_datablkshift;
+	blksz = (1 << blkshft);
+
+	bzero(&zst, sizeof (zstream_t));
+	zst.zst_offset = offset >> blkshft;
+	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+	    P2ALIGN(offset, blksz)) >> blkshft;
+	/* try an existing stream first, then a co-linear pair of streams */
+	fetched = dmu_zfetch_find(zf, &zst);
+	if (!fetched) {
+		fetched = dmu_zfetch_colinear(zf, &zst);
+	}
+
+	if (!fetched) {
+		newstream = dmu_zfetch_stream_reclaim(zf);
+
+		/*
+		 * we still couldn't find a stream: allocate a new one if the
+		 * stream limit allows it.  Otherwise, give up and go home.
+		 */
+		if (newstream == NULL) {
+			uint64_t	maxblocks;
+			uint32_t	max_streams;
+			uint32_t	cur_streams;
+
+			cur_streams = zf->zf_stream_cnt;
+			maxblocks = zf->zf_dnode->dn_maxblkid;
+
+			max_streams = MIN(zfetch_max_streams,
+			    (maxblocks / zfetch_block_cap));
+			if (max_streams == 0) {
+				max_streams++;
+			}
+
+			if (cur_streams >= max_streams) {
+				return;
+			}
+
+			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+		}
+
+		newstream->zst_offset = zst.zst_offset;
+		newstream->zst_len = zst.zst_len;
+		newstream->zst_stride = zst.zst_len;
+		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+		newstream->zst_cap = zst.zst_len;
+		newstream->zst_direction = ZFETCH_FORWARD;
+		newstream->zst_last = lbolt;
+
+		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+		/* insert fails if an identical stream is already listed */
+		rw_enter(&zf->zf_rwlock, RW_WRITER);
+		inserted = dmu_zfetch_stream_insert(zf, newstream);
+		rw_exit(&zf->zf_rwlock);
+
+		if (!inserted) {
+			mutex_destroy(&newstream->zst_lock);
+			kmem_free(newstream, sizeof (zstream_t));
+		}
+	}
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
new file mode 100644
index 000000000000..6b25b35ab13a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -0,0 +1,1304 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+
+static int free_range_compar(const void *node1, const void *node2);
+/* kmem cache for dnode_t; created in dnode_init() */
+static kmem_cache_t *dnode_cache;
+/* all-zero dnode_phys_t that dnode_allocate() compares against */
+static dnode_phys_t dnode_phys_zero;
+/* default data / indirect block shifts, used when a caller passes 0 */
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+	dnode_t *dn = arg;
+	int t;
+
+	/* start from all-zero, then build the embedded locks and containers */
+	bzero(dn, sizeof (dnode_t));
+	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+	refcount_create(&dn->dn_holds);
+	refcount_create(&dn->dn_tx_holds);
+	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+
+	/* one free-range tree and one dirty-dbuf list per txg slot */
+	for (t = 0; t < TXG_SIZE; t++) {
+		list_create(&dn->dn_dirty_dbufs[t], sizeof (dmu_buf_impl_t),
+		    offsetof(dmu_buf_impl_t, db_dirty_node[t]));
+		avl_create(&dn->dn_ranges[t], free_range_compar,
+		    sizeof (free_range_t),
+		    offsetof(struct free_range, fr_node));
+	}
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+	dnode_t *dn = arg;
+	int t = TXG_SIZE;
+
+	/* undo dnode_cons(): per-txg containers first, then the rest */
+	while (t-- > 0) {
+		list_destroy(&dn->dn_dirty_dbufs[t]);
+		avl_destroy(&dn->dn_ranges[t]);
+	}
+	list_destroy(&dn->dn_dbufs);
+
+	refcount_destroy(&dn->dn_tx_holds);
+	refcount_destroy(&dn->dn_holds);
+	mutex_destroy(&dn->dn_dbufs_mtx);
+	mutex_destroy(&dn->dn_mtx);
+	rw_destroy(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_init(void)
+{	/* set up the kmem cache that dnode_create() allocates from */
+	dnode_cache = kmem_cache_create("dnode_t",
+	    sizeof (dnode_t),
+	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+}
+
+void
+dnode_fini(void)
+{	/* tear down the cache created by dnode_init() */
+	kmem_cache_destroy(dnode_cache);
+}
+
+
+void
+dnode_verify(dnode_t *dn)
+{
+#ifdef ZFS_DEBUG
+	int drop_struct_lock = FALSE;
+
+	ASSERT(dn->dn_phys);
+	ASSERT(dn->dn_objset);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	/* the remaining checks are gated on ZFS_DEBUG_DNODE_VERIFY */
+	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+		return;
+	/* hold dn_struct_rwlock (reader) unless already write-held */
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+		int i;
+		ASSERT3U(dn->dn_indblkshift, >=, 0);
+		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+		if (dn->dn_datablkshift) {
+			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+		}
+		ASSERT3U(dn->dn_nlevels, <=, 30);
+		ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+		ASSERT3U(dn->dn_nblkptr, >=, 1);
+		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		ASSERT3U(dn->dn_datablksz, ==,
+		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+		    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		for (i = 0; i < TXG_SIZE; i++) {
+			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+		}
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE)
+		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf);
+	if (dn->dn_dbuf != NULL) {
+		ASSERT3P(dn->dn_phys, ==,
+		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+	}
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+#endif
+}
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+	int i;
+	/* a free (DMU_OT_NONE) dnode has nothing to swap: just zero it */
+	if (dnp->dn_type == DMU_OT_NONE) {
+		bzero(dnp, sizeof (dnode_phys_t));
+		return;
+	}
+
+	dnp->dn_type = BSWAP_8(dnp->dn_type);
+	dnp->dn_indblkshift = BSWAP_8(dnp->dn_indblkshift);
+	dnp->dn_nlevels = BSWAP_8(dnp->dn_nlevels);
+	dnp->dn_nblkptr = BSWAP_8(dnp->dn_nblkptr);
+	dnp->dn_bonustype = BSWAP_8(dnp->dn_bonustype);
+	dnp->dn_checksum = BSWAP_8(dnp->dn_checksum);
+	dnp->dn_compress = BSWAP_8(dnp->dn_compress);
+	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+	dnp->dn_secphys = BSWAP_64(dnp->dn_secphys);
+
+	/*
+	 * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order.  We can't read dn_bonuslen.
+	 */
+	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+		buf64[i] = BSWAP_64(buf64[i]);
+
+	/*
+	 * OK to check dn_bonuslen for zero, because it won't matter if
+	 * we have the wrong byte order.  This is necessary because the
+	 * dnode dnode is smaller than a regular dnode.
+	 */
+	if (dnp->dn_bonuslen != 0) {
+		/*
+		 * Note that the bonus length calculated here may be
+		 * longer than the actual bonus buffer.  This is because
+		 * we always put the bonus buffer after the last block
+		 * pointer (instead of packing it against the end of the
+		 * dnode buffer).
+		 */
+		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+		size_t len = DN_MAX_BONUSLEN - off;
+		dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+	}
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+	dnode_phys_t *dnp = vbuf;
+	dnode_phys_t *end;
+
+	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+	/* size is a byte count covering a whole number of dnode_phys_t */
+	/* slots; byteswap each slot in turn */
+	end = dnp + (size >> DNODE_SHIFT);
+	while (dnp != end)
+		dnode_byteswap(dnp++);
+}
+
+static int
+free_range_compar(const void *node1, const void *node2)
+{
+	const free_range_t *a = node1;
+	const free_range_t *b = node2;
+
+	/* AVL ordering callback: free ranges are ordered by fr_blkid */
+	if (a->fr_blkid == b->fr_blkid)
+		return (0);
+
+	return (a->fr_blkid < b->fr_blkid ? -1 : 1);
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{	/* dn_datablkshift stays 0 unless size is a power of two */
+	ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+	dn->dn_datablksz = size;
+	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+	dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object)
+{
+	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	(void) dnode_cons(dn, NULL, 0); /* XXX */
+	/* identity of the new in-core dnode and where its phys copy lives */
+	dn->dn_objset = os;
+	dn->dn_object = object;
+	dn->dn_dbuf = db;
+	dn->dn_phys = dnp;
+	/* seed the in-core fields from the dnode_phys_t */
+	if (dnp->dn_datablkszsec)
+		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	dn->dn_indblkshift = dnp->dn_indblkshift;
+	dn->dn_nlevels = dnp->dn_nlevels;
+	dn->dn_type = dnp->dn_type;
+	dn->dn_nblkptr = dnp->dn_nblkptr;
+	dn->dn_checksum = dnp->dn_checksum;
+	dn->dn_compress = dnp->dn_compress;
+	dn->dn_bonustype = dnp->dn_bonustype;
+	dn->dn_bonuslen = dnp->dn_bonuslen;
+	dn->dn_maxblkid = dnp->dn_maxblkid;
+
+	dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	mutex_enter(&os->os_lock);
+	list_insert_head(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	return (dn);
+}
+
+static void
+dnode_destroy(dnode_t *dn)
+{
+	objset_impl_t *os = dn->dn_objset;
+	/* unhook from the objset's dnode list before tearing down */
+	mutex_enter(&os->os_lock);
+	list_remove(&os->os_dnodes, dn);
+	mutex_exit(&os->os_lock);
+
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+	dmu_zfetch_rele(&dn->dn_zfetch);
+	kmem_cache_free(dnode_cache, dn);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+	dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int i;
+	/* fall back to the defaults, then clamp both sizes to legal range */
+	if (blocksize == 0)
+		blocksize = 1 << zfs_default_bs;
+
+	blocksize = MIN(MAX(blocksize, SPA_MINBLOCKSIZE), SPA_MAXBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = zfs_default_ibs;
+
+	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+	    dn->dn_object, tx->tx_txg, blocksize, ibs);
+	/* the dnode must currently be free, zeroed on disk, and unused */
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+	ASSERT(ot != DMU_OT_NONE);
+	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT3U(dn->dn_maxblkid, ==, 0);
+	ASSERT3U(dn->dn_allocated_txg, ==, 0);
+	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
+		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+		ASSERT3U(dn->dn_dirtyblksz[i], ==, 0);
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
+		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+	}
+	/* set up the in-core state of the new object */
+	dn->dn_type = ot;
+	dnode_setdblksz(dn, blocksize);
+	dn->dn_indblkshift = ibs;
+	dn->dn_nlevels = 1;
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	dn->dn_dirtyctx = 0;
+
+	dn->dn_free_txg = 0;
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+	/* record the allocating txg and mark the dnode dirty in it */
+	dn->dn_allocated_txg = tx->tx_txg;
+	dnode_setdirty(dn, tx);
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = NULL;
+	/* unlike dnode_allocate(), blocksize is asserted, not defaulted */
+	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+	ASSERT(tx->tx_txg != 0);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	ASSERT(dn->dn_dirtyblksz[0] == 0);
+	ASSERT(dn->dn_dirtyblksz[1] == 0);
+	ASSERT(dn->dn_dirtyblksz[2] == 0);
+	ASSERT(dn->dn_dirtyblksz[3] == 0);
+
+	/*
+	 * XXX I should really have a generation number to tell if we
+	 * need to do this...
+	 */
+	if (blocksize != dn->dn_datablksz ||
+	    dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
+		/* free all old data */
+		dnode_free_range(dn, 0, -1ULL, tx);
+	}
+
+	/* change blocksize */
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_setdblksz(dn, blocksize);
+	dnode_setdirty(dn, tx);
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0);
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize;
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* change type */
+	dn->dn_type = ot;
+
+	if (dn->dn_bonuslen != bonuslen) {
+		/* change bonus size */
+		if (bonuslen == 0)
+			bonuslen = 1; /* XXX */
+		db = dbuf_hold_bonus(dn, FTAG);
+		dbuf_read(db);
+		mutex_enter(&db->db_mtx);
+		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT(db->db.db_data != NULL);
+		db->db.db_size = bonuslen;
+		mutex_exit(&db->db_mtx);
+		dbuf_dirty(db, tx);
+	}
+
+	/* change bonus size and type */
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+	/* drop the bonus dbuf hold taken above, if any */
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+}
+
+void
+dnode_special_close(dnode_t *dn)
+{	/* counterpart of dnode_special_open(): just destroys the dnode */
+	dnode_destroy(dn);
+}
+
+dnode_t *
+dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+{	/* creates an in-core dnode that has no containing dbuf (db is NULL) */
+	dnode_t *dn = dnode_create(os, dnp, NULL, object);
+	dnode_verify(dn);
+	return (dn);
+}
+
+static void
+dnode_buf_pageout(dmu_buf_t *db, void *arg)
+{
+	dnode_t **children_dnodes = arg;
+	int i;
+	int epb = db->db_size >> DNODE_SHIFT;
+
+	for (i = 0; i < epb; i++) {
+		dnode_t *dn = children_dnodes[i];
+		int n;
+		/* slots that never got an in-core dnode are skipped */
+		if (dn == NULL)
+			continue;
+#ifdef ZFS_DEBUG
+		/*
+		 * If there are holds on this dnode, then there should
+		 * be holds on the dnode's containing dbuf as well; thus
+		 * it wouldn't be eligible for eviction and this function
+		 * would not have been called.
+		 */
+		ASSERT(refcount_is_zero(&dn->dn_holds));
+		ASSERT(list_head(&dn->dn_dbufs) == NULL);
+		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+		for (n = 0; n < TXG_SIZE; n++)
+			ASSERT(dn->dn_dirtyblksz[n] == 0);
+#endif
+		children_dnodes[i] = NULL;
+		dnode_destroy(dn);
+	}
+	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+}
+
+/*
+ * Returns held dnode, or NULL if the object number is invalid, dn_free_txg is
+ * set, or a DNODE_MUST_BE_* bit in flag is violated (flag 0 admits free ones).
+ */
+dnode_t *
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
+{
+	int epb, idx;
+	int drop_struct_lock = FALSE;
+	uint64_t blk;
+	dnode_t *mdn, *dn;
+	dmu_buf_impl_t *db;
+	dnode_t **children_dnodes;
+
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return (NULL);
+
+	mdn = os->os_meta_dnode;
+
+	dnode_verify(mdn);
+
+	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+	/* find and read the meta-dnode block that holds this object */
+	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+	db = dbuf_hold(mdn, blk);
+	if (drop_struct_lock)
+		rw_exit(&mdn->dn_struct_rwlock);
+	dbuf_read(db);
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb-1);
+	/* per-dbuf array of in-core dnodes, created on first use */
+	children_dnodes = dmu_buf_get_user(&db->db);
+	if (children_dnodes == NULL) {
+		dnode_t **winner;
+		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
+		    KM_SLEEP);
+		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
+		    dnode_buf_pageout)) {
+			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+			children_dnodes = winner;
+		}
+	}
+	/* create the in-core dnode on first use; a racing creator may win */
+	if ((dn = children_dnodes[idx]) == NULL) {
+		dnode_t *winner;
+		dn = dnode_create(os, (dnode_phys_t *)db->db.db_data+idx,
+			db, object);
+		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
+		if (winner != NULL) {
+			dnode_destroy(dn);
+			dn = winner;
+		}
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_free_txg ||
+	    ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) ||
+	    ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) {
+		mutex_exit(&dn->dn_mtx);
+		dbuf_rele(db);
+		return (NULL);
+	}
+	mutex_exit(&dn->dn_mtx);
+	/* the first hold on the dnode also adds a reference on its dbuf */
+	if (refcount_add(&dn->dn_holds, ref) == 1)
+		dbuf_add_ref(db, dn);
+
+	dnode_verify(dn);
+	ASSERT3P(dn->dn_dbuf, ==, db);
+	ASSERT3U(dn->dn_object, ==, object);
+	dbuf_rele(db);
+
+	return (dn);
+}
+
+/*
+ * Hold 'object' for 'ref'; NULL unless dnode_hold_impl() finds it allocated.
+ */
+dnode_t *
+dnode_hold(objset_impl_t *os, uint64_t object, void *ref)
+{
+	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref));
+}
+
+void
+dnode_add_ref(dnode_t *dn, void *ref)
+{
+	ASSERT(refcount_count(&dn->dn_holds) > 0);	/* dn must already be held */
+	(void) refcount_add(&dn->dn_holds, ref);	/* extra hold tagged 'ref' */
+}
+
+void
+dnode_rele(dnode_t *dn, void *ref)
+{
+	/*
+	 * Dropping the last hold also drops the dnode's reference on its
+	 * containing dbuf.  NOTE: the DNODE_DNODE does not have a dn_dbuf.
+	 */
+	if (refcount_remove(&dn->dn_holds, ref) == 0 && dn->dn_dbuf)
+		dbuf_remove_ref(dn->dn_dbuf, dn);
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t txg = tx->tx_txg;
+
+	if (IS_DNODE_DNODE(dn->dn_object))	/* nothing is done for this one */
+		return;
+
+	dnode_verify(dn);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&dn->dn_mtx);
+	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	mutex_exit(&dn->dn_mtx);
+#endif
+
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * Already dirty in this txg (its dn_dirtyblksz slot is nonzero): done.
+	 */
+	if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) {
+		mutex_exit(&os->os_lock);
+		return;
+	}
+
+	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(dn->dn_datablksz != 0);
+	dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz; /* marks dn dirty */
+
+	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+	    dn->dn_object, txg);
+
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {	/* being freed */
+		list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+	} else {
+		list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+	}
+
+	mutex_exit(&os->os_lock);
+
+	/*
+	 * The dnode maintains a hold on its containing dbuf as long as
+	 * there are holds on it.  Each instantiated child dbuf maintains
+	 * a hold on the dnode.  When the last child drops its hold, the
+	 * dnode will drop its hold on the containing dbuf.  We add a
+	 * "dirty hold", tagged with the txg, here so that the dnode
+	 * will hang around after we finish processing its children;
+	 * dnode_sync_free() releases it with the same tag.
+	 */
+	(void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg);
+
+	dbuf_dirty(dn->dn_dbuf, tx);
+
+	dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+	/* we should be the only holder... hopefully */
+	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { /* nothing to do */
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+	dn->dn_free_txg = tx->tx_txg;	/* records the txg of the free */
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If the dnode is already dirty in this txg, it needs to be moved
+	 * from the dirty list to the free list; otherwise dirty it now.
+	 */
+	mutex_enter(&dn->dn_objset->os_lock);
+	if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) {
+		list_remove(
+		    &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn);
+		list_insert_tail(
+		    &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn);
+		mutex_exit(&dn->dn_objset->os_lock);
+	} else {
+		mutex_exit(&dn->dn_objset->os_lock);
+		dnode_setdirty(dn, tx);	/* goes on os_free_dnodes: free_txg set */
+	}
+}
+
+/*
+ * Try to change a dnode's data block size and, if ibs != 0, indirect shift.
+ * Returns 0, or ENOTSUP if blocks beyond the first exist or data would shrink.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db, *db_next;
+	int have_db0 = FALSE;	/* a dbuf for block 0 is instantiated */
+	int err = ENOTSUP;
+
+	if (size == 0)
+		size = SPA_MINBLOCKSIZE;
+	if (size > SPA_MAXBLOCKSIZE)
+		size = SPA_MAXBLOCKSIZE;
+	else
+		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+	if (ibs == 0)	/* 0 means "keep the current indirect block shift" */
+		ibs = dn->dn_indblkshift;
+
+	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec &&
+	    ibs == dn->dn_indblkshift)
+		return (0);	/* nothing would change */
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* Check for any allocated blocks beyond the first */
+	if (dn->dn_phys->dn_maxblkid != 0)
+		goto end;
+
+	/*
+	 * Any buffers allocated for blocks beyond the first
+	 * must be evictable/evicted, because they're the wrong size.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	/*
+	 * Since we have the dn_dbufs_mtx, nothing can be
+	 * removed from dn_dbufs.  Since we have dn_struct_rwlock/w,
+	 * nothing can be added to dn_dbufs.
+	 */
+	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+		db_next = list_next(&dn->dn_dbufs, db);
+
+		if (db->db_blkid == 0) {
+			have_db0 = TRUE;
+		} else if (db->db_blkid != DB_BONUS_BLKID) {
+			mutex_exit(&dn->dn_dbufs_mtx);
+			goto end;	/* a dbuf past block 0 exists: ENOTSUP */
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	/* Fast-track if there is no data in the file */
+	if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) {
+		dnode_setdblksz(dn, size);
+		dn->dn_indblkshift = ibs;
+		dnode_setdirty(dn, tx);
+		/* don't need dd_dirty_mtx, dnode is already dirty */
+		dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+		rw_exit(&dn->dn_struct_rwlock);
+		return (0);
+	}
+
+	/* obtain the old block */
+	db = dbuf_hold(dn, 0);
+
+	/* Not allowed to decrease the size if there is data present */
+	if (size < db->db.db_size) {
+		dbuf_rele(db);
+		goto end;
+	}
+
+	dbuf_new_size(db, size, tx);
+
+	dnode_setdblksz(dn, size);
+	dn->dn_indblkshift = ibs;
+	/* don't need dd_dirty_mtx, dnode is already dirty */
+	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+	dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+	dbuf_rele(db);
+
+	err = 0;
+end:
+	rw_exit(&dn->dn_struct_rwlock);
+	return (err);
+}
+
+uint64_t
+dnode_max_nonzero_offset(dnode_t *dn)
+{
+	/* on-disk max block id is 0 and block 0 is a hole: no data at all */
+	int empty = (dn->dn_phys->dn_maxblkid == 0 &&
+	    BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]));
+
+	return (empty ? 0 : (dn->dn_phys->dn_maxblkid+1) * dn->dn_datablksz);
+}
+
+/*
+ * Note that block 'blkid' of 'dn' is being instantiated in 'tx': raise
+ * dn_maxblkid if necessary and deepen the block tree (dn_nlevels, plus
+ * dn_next_nlevels for this txg) until it can address the new block.
+ * When the tree grows, block 0 at level 'old_nlevels' is dirtied.
+ * DB_BONUS_BLKID is ignored.
+ *
+ * Takes dn_struct_rwlock as writer unless the caller already write-holds it.
+ */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+	uint64_t txgoff = tx->tx_txg & TXG_MASK;
+	int drop_struct_lock = FALSE;
+	int epbs, old_nlevels, new_nlevels;
+	uint64_t sz;
+
+	if (blkid == DB_BONUS_BLKID)
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (blkid > dn->dn_maxblkid)
+		dn->dn_maxblkid = blkid;
+
+	/*
+	 * Compute the number of levels necessary to support the
+	 * new blkid.  'sz' is the number of blocks addressable with the
+	 * levels counted so far; each added level multiplies it by
+	 * 2^epbs.  The 'sz >= dn_nblkptr' test ends the loop if the
+	 * shift wraps around.
+	 */
+	new_nlevels = 1;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr;
+	    sz <<= epbs)
+		new_nlevels++;
+	old_nlevels = dn->dn_nlevels;
+
+	/* Record the largest depth requested during this txg. */
+	if (new_nlevels > dn->dn_next_nlevels[txgoff])
+		dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+	if (new_nlevels > old_nlevels) {
+		dprintf("dn %p increasing nlevels from %u to %u\n",
+		    dn, dn->dn_nlevels, new_nlevels);
+		dn->dn_nlevels = new_nlevels;
+	}
+
+	/*
+	 * Dirty the left indirects.
+	 * Note: the caller should have just dnode_use_space()'d one
+	 * data block's worth, so we could subtract that out of
+	 * dn_inflight_data to determine if there is any dirty data
+	 * besides this block.
+	 * We don't strictly need to dirty them unless there's
+	 * *something* in the object (eg. on disk or dirty)...
+	 */
+	if (new_nlevels > old_nlevels) {
+		dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		dprintf("dn %p dirtying left indirects\n", dn);
+		dbuf_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	dprintf("dn %p done\n", dn);
+
+	/* Only release the struct lock if it was taken in this function. */
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+	avl_index_t where;
+	free_range_t *rp;
+	free_range_t rp_tofind;
+	uint64_t endblk = blkid + nblks;	/* first block past the range */
+
+	ASSERT(MUTEX_HELD(&dn->dn_mtx));	/* caller must hold dn_mtx */
+	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
+
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
+	rp_tofind.fr_blkid = blkid;
+	rp = avl_find(tree, &rp_tofind, &where);
+	if (rp == NULL)		/* no exact match: start at the range before */
+		rp = avl_nearest(tree, where, AVL_BEFORE);
+	if (rp == NULL)		/* nothing before either: start at the next */
+		rp = avl_nearest(tree, where, AVL_AFTER);
+
+	while (rp && (rp->fr_blkid <= blkid + nblks)) {
+		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
+		free_range_t *nrp = AVL_NEXT(tree, rp);	/* rp may be freed below */
+
+		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
+			/* clear this entire range */
+			avl_remove(tree, rp);
+			kmem_free(rp, sizeof (free_range_t));
+		} else if (blkid <= rp->fr_blkid &&
+		    endblk > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear the beginning of this range */
+			rp->fr_blkid = endblk;
+			rp->fr_nblks = fr_endblk - endblk;
+		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
+		    endblk >= fr_endblk) {
+			/* clear the end of this range */
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
+			/* clear a chunk out of this range */
+			free_range_t *new_rp =
+			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+
+			new_rp->fr_blkid = endblk;	/* the piece after the hole */
+			new_rp->fr_nblks = fr_endblk - endblk;
+			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
+			rp->fr_nblks = blkid - rp->fr_blkid;
+		}
+		/* there may be no overlap */
+		rp = nrp;
+	}
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	uint64_t start, objsize, blkid, nblks;
+	int blkshift, blksz, tail, head, epbs;
+	int trunc = FALSE;	/* set when the caller passed len == -1ULL */
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	blksz = dn->dn_datablksz;
+	blkshift = dn->dn_datablkshift;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* If the range is past the end of the file, this is a no-op */
+	objsize = blksz * (dn->dn_maxblkid+1);
+	if (off >= objsize)
+		goto out;
+	if (len == -1ULL) {
+		len = UINT64_MAX - off;
+		trunc = TRUE;
+	}
+
+	/*
+	 * First, block align the region to free:
+	 */
+	if (dn->dn_maxblkid == 0) {	/* only block 0 exists */
+		if (off == 0) {
+			head = 0;
+		} else {
+			head = blksz - off;
+			ASSERT3U(head, >, 0);
+		}
+		start = off;
+	} else {
+		ASSERT(ISP2(blksz));
+		head = P2NPHASE(off, blksz);
+		start = P2PHASE(off, blksz);
+	}
+	/* zero out any partial block data at the start of the range */
+	if (head) {
+		ASSERT3U(start + head, ==, blksz);
+		if (len < head)
+			head = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+		    FTAG, &db) == 0) {
+			caddr_t data;
+
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				data = db->db.db_data;
+				bzero(data + start, head);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		off += head;
+		len -= head;
+	}
+	/* If the range was less than one block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* If the remaining range is past the end of the file, we are done */
+	if (off > dn->dn_maxblkid << blkshift)
+		goto out;
+
+	if (off + len == UINT64_MAX)	/* true for the len == -1ULL case */
+		tail = 0;
+	else
+		tail = P2PHASE(len, blksz);
+
+	ASSERT3U(P2PHASE(off, blksz), ==, 0);
+	/* zero out any partial block data at the end of the range */
+	if (tail) {
+		if (len < tail)	/* NOTE(review): tail <= len here; looks dead */
+			tail = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+		    TRUE, FTAG, &db) == 0) {
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_dirtied ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dbuf_will_dirty(db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				bzero(db->db.db_data, tail);
+			}
+			dbuf_remove_ref(db, FTAG);
+		}
+		len -= tail;
+	}
+	/* If the range did not include a full block, we are done */
+	if (len == 0)
+		goto out;
+
+	/* dirty the left indirects */
+	if (dn->dn_nlevels > 1 && off != 0) {
+		db = dbuf_hold_level(dn, 1,
+		    (off - head) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/* dirty the right indirects */
+	if (dn->dn_nlevels > 1 && !trunc) {
+		db = dbuf_hold_level(dn, 1,
+		    (off + len + tail - 1) >> (blkshift + epbs), FTAG);
+		dbuf_will_dirty(db, tx);
+		dbuf_remove_ref(db, FTAG);
+	}
+
+	/*
+	 * Finally, add this range to the dnode range list, we
+	 * will finish up this free operation in the syncing phase.
+	 */
+	ASSERT(IS_P2ALIGNED(off, 1<<blkshift));
+	ASSERT(off + len == UINT64_MAX || IS_P2ALIGNED(len, 1<<blkshift));
+	blkid = off >> blkshift;
+	nblks = len >> blkshift;
+
+	if (trunc)	/* in-core max block id shrinks immediately */
+		dn->dn_maxblkid = (blkid ? blkid - 1 : 0);
+
+	mutex_enter(&dn->dn_mtx);
+	dnode_clear_range(dn, blkid, nblks, tx);	/* drop overlapping ranges */
+	{
+		free_range_t *rp, *found;
+		avl_index_t where;
+		avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
+
+		/* Add new range to dn_ranges */
+		rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
+		rp->fr_blkid = blkid;
+		rp->fr_nblks = nblks;
+		found = avl_find(tree, rp, &where);
+		ASSERT(found == NULL);
+		avl_insert(tree, rp, where);
+		dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+		    blkid, nblks, tx->tx_txg);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	dbuf_free_range(dn, blkid, nblks, tx);
+	dnode_setdirty(dn, tx);
+out:
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+	free_range_t range_tofind;
+	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+	int i;
+
+	if (blkid == DB_BONUS_BLKID)
+		return (FALSE);
+
+	/*
+	 * If we're in the process of opening the pool, dp will not be
+	 * set yet, but there shouldn't be anything dirty.
+	 */
+	if (dp == NULL)
+		return (FALSE);
+
+	if (dn->dn_free_txg)	/* whole dnode is being freed */
+		return (TRUE);
+
+	/*
+	 * If dn_datablkshift is not set, then there's only a single
+	 * block, in which case there will never be a free range so it
+	 * won't matter.
+	 */
+	range_tofind.fr_blkid = blkid;
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {	/* check every txg's range tree */
+		free_range_t *range_found;
+		avl_index_t idx;
+
+		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
+		if (range_found) {	/* a free range starts exactly at blkid */
+			ASSERT(range_found->fr_nblks > 0);
+			break;
+		}
+		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
+		if (range_found &&
+		    range_found->fr_blkid + range_found->fr_nblks > blkid)
+			break;	/* the preceding range extends over blkid */
+	}
+	mutex_exit(&dn->dn_mtx);
+	return (i < TXG_SIZE);	/* loop broke early only if a range matched */
+}
+
+/* Syncing context: apply 'space' bytes written (> 0) or freed to dn_secphys */
+void
+dnode_diduse_space(dnode_t *dn, int64_t space)
+{
+	uint64_t nsect;		/* magnitude of 'space' in DEV_BSHIFT units */
+
+	dprintf_dnode(dn, "dn=%p dnp=%p secphys=%llu space=%lld\n", dn,
+	    dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_secphys,
+	    (longlong_t)space);
+
+	/* 'space' must be a whole number of sectors */
+	ASSERT(P2PHASE(space, 1<<DEV_BSHIFT) == 0);
+
+	mutex_enter(&dn->dn_mtx);
+	if (space <= 0) {
+		nsect = -space >> DEV_BSHIFT;
+		ASSERT3U(dn->dn_phys->dn_secphys, >=, nsect);
+		dn->dn_phys->dn_secphys -= nsect;
+	} else {
+		nsect = space >> DEV_BSHIFT;
+		ASSERT3U(dn->dn_phys->dn_secphys + nsect, >=,
+		    dn->dn_phys->dn_secphys);
+		dn->dn_phys->dn_secphys += nsect;
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context.
+ * Be conservative (ie. OK to write less than this or free more than
+ * this, but don't write more or free less).
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+
+	if (space > 0)	/* only writes are converted; frees pass through */
+		space = spa_get_asize(os->os_spa, space); /* presumably asize */
+
+	if (ds)		/* an objset may have no dataset; then skip the dir */
+		dsl_dir_willuse_space(ds->ds_dir, space, tx);
+
+	dmu_tx_willuse_space(tx, space);
+}
+
+/*
+ * Search one level of dn's block tree from *offset for the next hole or data
+ * entry, advancing *offset.  Returns 0 if found, ESRCH if not, else an errno.
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
+	int lvl, uint64_t blkfill)
+{
+	dmu_buf_impl_t *db = NULL;
+	void *data = NULL;
+	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t epb = 1ULL << epbs;
+	uint64_t minfill, maxfill;
+	int i, error, span;
+
+	dprintf("probing object %llu offset %llx level %d of %u\n",
+	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+	if (lvl == dn->dn_phys->dn_nlevels) {
+		error = 0;	/* top of the tree: scan the dnode's own blkptrs */
+		epb = dn->dn_phys->dn_nblkptr;
+		data = dn->dn_phys->dn_blkptr;
+	} else {
+		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		if (error) {
+			if (error == ENOENT)	/* a missing block counts as a hole */
+				return (hole ? 0 : ESRCH);
+			return (error);
+		}
+		dbuf_read_havestruct(db);
+		data = db->db.db_data;
+	}
+
+	if (lvl == 0) {
+		dnode_phys_t *dnp = data;
+		span = DNODE_SHIFT;
+		ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+		for (i = (*offset >> span) & (blkfill - 1); i < blkfill; i++) {
+			if ((!dnp[i].dn_type) == (hole != 0))
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i == blkfill)
+			error = ESRCH;
+	} else {
+		blkptr_t *bp = data;
+		span = (lvl - 1) * epbs + dn->dn_datablkshift;
+		/* holes: fill below the maximum; data: fill of at least one */
+		minfill = hole ? 0 : 1;
+		maxfill = (blkfill << ((lvl - 1) * epbs)) - (hole ? 1 : 0);
+
+		for (i = (*offset >> span) & ((1ULL << epbs) - 1);
+		    i < epb; i++) {
+			if (bp[i].blk_fill >= minfill &&
+			    bp[i].blk_fill <= maxfill)
+				break;
+			*offset += 1ULL << span;
+		}
+		if (i >= epb)
+			error = ESRCH;
+	}
+
+	if (db)
+		dbuf_remove_ref(db, FTAG);
+
+	return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ * Examples:
+ *
+ * dnode_next_offset(dn, hole, offset, 1, 1);
+ *	Finds the next hole/data in a file.
+ *	Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, hole, offset, 0, DNODES_PER_BLOCK);
+ *	Finds the next free/allocated dnode in an objset's meta-dnode.
+ *	Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, TRUE, offset, 2, DNODES_PER_BLOCK >> 2);
+ *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ *	Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *offset,
+    int minlvl, uint64_t blkfill)
+{
+	int lvl, maxlvl;
+	int error = 0;
+	uint64_t initial_offset = *offset;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	if (dn->dn_phys->dn_nlevels == 0) {
+		rw_exit(&dn->dn_struct_rwlock);
+		return (ESRCH);
+	}
+
+	if (dn->dn_datablkshift == 0) {	/* handled as one block of datablksz */
+		if (*offset < dn->dn_datablksz) {
+			if (hole)	/* the first hole is just past that block */
+				*offset = dn->dn_datablksz;
+		} else {
+			error = ESRCH;
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		return (error);
+	}
+
+	maxlvl = dn->dn_phys->dn_nlevels;
+
+	for (lvl = minlvl; lvl <= maxlvl; lvl++) {	/* climb until a match */
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+		if (error == 0)
+			break;
+	}
+
+	while (--lvl >= minlvl && error == 0)	/* then descend to minlvl */
+		error = dnode_next_offset_level(dn, hole, offset, lvl, blkfill);
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (initial_offset > *offset)	/* *offset wrapped past UINT64_MAX */
+		return (ESRCH);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
new file mode 100644
index 000000000000..56fc3e19aea3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -0,0 +1,560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int i;
+	uint64_t txg = tx->tx_txg;
+
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	/* this dnode can't be paged out because it's dirty */
+
+	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
+		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+			break;
+	if (i != dn->dn_phys->dn_nblkptr) {	/* at least one non-hole blkptr */
+		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
+
+		dbuf_read_havestruct(db);
+		arc_release(db->db_buf, db);
+		/* copy dnode's block pointers to new indirect block */
+		ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
+		    db->db.db_size);
+		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+		    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+	}
+
+	dn->dn_phys->dn_nlevels += 1;	/* on-disk tree is now one level deeper */
+	dprintf("os=%p obj=%llu, increase to %d\n",
+		dn->dn_objset, dn->dn_object,
+		dn->dn_phys->dn_nlevels);
+
+	/* set dbuf's parent pointers to new indirect buf */
+	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) {
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i);
+		if (child == NULL)
+			continue;
+		/* NOTE(review): dbuf_find() appears to return with db_mtx held */
+		if (child->db_dnode == NULL) {
+			mutex_exit(&child->db_mtx);
+			continue;
+		}
+
+		if (child->db_parent == NULL ||
+		    child->db_parent == dn->dn_dbuf) {
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changing db_blkptr to new indirect %s", "");
+			child->db_parent = db;
+			dbuf_add_ref(db, child);
+			if (db->db.db_data) {
+				child->db_blkptr =
+				    (blkptr_t *)db->db.db_data + i;
+			} else {
+				child->db_blkptr = NULL;
+			}
+			dprintf_dbuf_bp(child, child->db_blkptr,
+			    "changed db_blkptr to new indirect %s", "");
+		}
+		ASSERT3P(child->db_parent, ==, db);
+
+		mutex_exit(&child->db_mtx);
+	}
+
+	bzero(dn->dn_phys->dn_blkptr,
+		sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+
+	dbuf_remove_ref(db, FTAG);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+	objset_impl_t *os = dn->dn_objset;
+	uint64_t freed = 0;	/* running total of asize released, in bytes */
+	int n;
+
+	dprintf("os=%p obj=%llx num=%d\n", os, dn->dn_object, num);
+
+	for (n = 0; n < num; n++) {
+		if (!BP_IS_HOLE(&bp[n])) {	/* holes are skipped */
+			freed += BP_GET_ASIZE(&bp[n]);
+			ASSERT3U(freed >> DEV_BSHIFT, <=,
+			    dn->dn_phys->dn_secphys);
+			dsl_dataset_block_kill(os->os_dsl_dataset, &bp[n], tx);
+		}
+	}
+	dnode_diduse_space(dn, -freed);
+}
+
+/*
+ * Debug-only check that the level-0 children of level-1 indirect 'db'
+ * with block ids start..end hold only zeroes; panics if one does not.
+ */
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+#ifdef ZFS_DEBUG
+	int off, num, i, err, epbs;
+	uint64_t txg = tx->tx_txg;
+
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid << epbs);	/* index of 'start' within db */
+	num = end - start + 1;
+
+	ASSERT(off >= 0 && num >= 0);	/* signed check: both are plain ints */
+	ASSERT3U(db->db_level, ==, 1);
+	ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+	ASSERT(db->db_blkptr != NULL);
+
+	for (i = off; i < off+num; i++) {
+		uint64_t *buf;
+		int j;
+		dmu_buf_impl_t *child;
+
+		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
+			(db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		rw_exit(&db->db_dnode->dn_struct_rwlock);
+		if (err == ENOENT)
+			continue;
+		ASSERT(err == 0);
+		ASSERT(child->db_level == 0);
+		ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK]));
+
+		/* db_data_old better be zeroed */
+		if (child->db_d.db_data_old[txg & TXG_MASK]) {
+			buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data;
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+
+		/*
+		 * db_data better be zeroed unless it's dirty in a
+		 * future txg.
+		 */
+		mutex_enter(&child->db_mtx);
+		buf = child->db.db_data;
+		if (buf != NULL && child->db_state != DB_FILL &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+1) & TXG_MASK]) &&
+		    !list_link_active(&child->db_dirty_node
+			[(txg+2) & TXG_MASK])) {
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    child, i, off, num);
+				}
+			}
+		}
+		mutex_exit(&child->db_mtx);
+
+		dbuf_remove_ref(child, FTAG);
+	}
+#endif
+}
+
+static int
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn = db->db_dnode;
+	blkptr_t *bp;
+	dmu_buf_impl_t *subdb;
+	uint64_t start, end, dbstart, dbend, i;
+	int epbs, shift, err;
+	int txg_index = tx->tx_txg&TXG_MASK;
+	int all = TRUE;	/* returned: nonzero lets the caller free db's blkptr */
+
+	dbuf_read(db);
+	arc_release(db->db_buf, db);
+	bp = (blkptr_t *)db->db.db_data;
+
+	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	shift = (db->db_level - 1) * epbs;
+	dbstart = db->db_blkid << epbs;	/* first child index covered by db */
+	start = blkid >> shift;
+	if (dbstart < start) {	/* range begins part-way into db */
+		bp += start - dbstart;
+		all = FALSE;
+	} else {
+		start = dbstart;
+	}
+	dbend = ((db->db_blkid + 1) << epbs) - 1;	/* last child index in db */
+	end = (blkid + nblks - 1) >> shift;
+	if (dbend <= end)
+		end = dbend;
+	else if (all)	/* range ends inside db: complete only if truncating */
+		all = trunc;
+	ASSERT3U(start, <=, end);
+
+	if (db->db_level == 1) {	/* children are data blocks: free directly */
+		free_verify(db, start, end, tx);
+		free_blocks(dn, bp, end-start+1, tx);
+		ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+		return (all);
+	}
+
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(subdb, blkid, nblks, trunc, tx)) {
+			ASSERT3P(subdb->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);	/* free the child indirect */
+		}
+		dbuf_remove_ref(subdb, FTAG);
+	}
+#ifdef ZFS_DEBUG
+	bp -= (end-start)+1;	/* rewind to the first blkptr visited above */
+	for (i = start; i <= end; i++, bp++) {
+		if (i == start && blkid != 0)
+			continue;
+		else if (i == end && !trunc)
+			continue;
+		ASSERT3U(bp->blk_birth, ==, 0);
+	}
+#endif
+	ASSERT(all || list_link_active(&db->db_dirty_node[txg_index]));
+	return (all);
+}
+
+/*
+ * dnode_sync_free_range: Traverse blocks blkid .. blkid+nblks-1 of the
+ * provided file and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+{
+	blkptr_t *bp = dn->dn_phys->dn_blkptr;
+	dmu_buf_impl_t *db;
+	int trunc, start, end, shift, i, err;
+	int dnlevel = dn->dn_phys->dn_nlevels;
+
+	if (blkid > dn->dn_phys->dn_maxblkid)	/* range is entirely past the end */
+		return;
+
+	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+	trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
+	if (trunc)	/* clamp the range to the last on-disk block */
+		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+
+	/* There are no indirect blocks in the object */
+	if (dnlevel == 1) {
+		if (blkid >= dn->dn_phys->dn_nblkptr) {
+			/* this range was never made persistent */
+			return;
+		}
+		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+		free_blocks(dn, bp + blkid, nblks, tx);
+		if (trunc) {
+			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+			/* NOTE(review): off is bytes, dn_maxblkid a block id */
+			ASSERT(off < dn->dn_phys->dn_maxblkid ||
+			    dn->dn_phys->dn_maxblkid == 0 ||
+			    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+		}
+		return;
+	}
+
+	shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+	start = blkid >> shift;	/* index into the dnode's own blkptr array */
+	ASSERT(start < dn->dn_phys->dn_nblkptr);
+	end = (blkid + nblks - 1) >> shift;
+	bp += start;
+	for (i = start; i <= end; i++, bp++) {
+		if (BP_IS_HOLE(bp))
+			continue;
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
+		ASSERT3U(err, ==, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+
+		if (free_children(db, blkid, nblks, trunc, tx)) {
+			ASSERT3P(db->db_blkptr, ==, bp);
+			free_blocks(dn, bp, 1, tx);	/* free the top indirect too */
+		}
+		dbuf_remove_ref(db, FTAG);
+	}
+	if (trunc) {
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+	}
+}
+
+static int
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* Undirty all buffers */
+	while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) {
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(&dn->dn_dirty_dbufs[txgoff], db);
+		if (db->db_level == 0) {
+			ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf);
+			if (db->db_d.db_overridden_by[txgoff])
+				dbuf_unoverride(db, tx->tx_txg);
+			db->db_d.db_data_old[txgoff] = NULL;
+		}
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg);
+	}
+
+	ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);	/* only the dirty hold */
+
+	/* Undirty next bits */
+	dn->dn_next_nlevels[txgoff] = 0;
+	dn->dn_next_indblkshift[txgoff] = 0;
+
+	/* free up all the blocks in the file. */
+	dbuf_free_range(dn, 0, -1, tx);
+	dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
+	ASSERT3U(dn->dn_phys->dn_secphys, ==, 0);
+
+	/*
+	 * All dbufs should be gone, since all holds are gone...
+	 */
+	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+	/* ASSERT(blkptrs are zero); */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	ASSERT(dn->dn_free_txg > 0);
+	if (dn->dn_allocated_txg != dn->dn_free_txg)
+		dbuf_will_dirty(dn->dn_dbuf, tx);
+	bzero(dn->dn_phys, sizeof (dnode_phys_t));	/* wipe the on-disk dnode */
+
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_type = DMU_OT_NONE;
+	dn->dn_dirtyblksz[txgoff] = 0;	/* no longer dirty in this txg */
+	dn->dn_maxblkid = 0;
+	dn->dn_allocated_txg = 0;
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(!IS_DNODE_DNODE(dn->dn_object));
+
+	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);	/* drop the dirty hold */
+	/*
+	 * Now that we've released our hold, the dnode may
+	 * be evicted, so we mustn't access it.
+	 */
+	return (1);	/* this function always returns 1 */
+}
+
+/*
+ * Write out the dnode's dirty buffers at the specified level.  This may
+ * create more dirty buffers at the next level up.  Returns the result of
+ * dnode_sync_free() if the dnode is being freed; otherwise 1 once level
+ * reaches dn_nlevels and 0 before that.  NOTE: the dnode is kept in memory
+ * by being dirty; once the dirty bit is cleared, it may be evicted!
+ */
+int
+dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
+{
+	free_range_t *rp;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	dnode_phys_t *dnp = dn->dn_phys;
+
+	/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(IS_DNODE_DNODE(dn->dn_object) ||
+	    dn->dn_dirtyblksz[txgoff] > 0);
+
+	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+	dnode_verify(dn);
+	/*
+	 * Make sure the dbuf for the dn_phys is released before we modify it.
+	 */
+	if (dn->dn_dbuf)
+		arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf);
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_allocated_txg == tx->tx_txg) {
+		/* The dnode is newly allocated or reallocated */
+		if (dnp->dn_type == DMU_OT_NONE) {
+			/* this is a first alloc, not a realloc */
+			/* XXX shouldn't the phys already be zeroed? */
+			bzero(dnp, DNODE_CORE_SIZE);
+			dnp->dn_datablkszsec = dn->dn_datablkszsec;
+			dnp->dn_indblkshift = dn->dn_indblkshift;
+			dnp->dn_nlevels = 1;
+		}
+
+		if (dn->dn_nblkptr > dnp->dn_nblkptr) {
+			/* zero the new blkptrs we are gaining */
+			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+			    sizeof (blkptr_t) *
+			    (dn->dn_nblkptr - dnp->dn_nblkptr));
+		}
+		dnp->dn_type = dn->dn_type;
+		dnp->dn_bonustype = dn->dn_bonustype;
+		dnp->dn_bonuslen = dn->dn_bonuslen;
+		dnp->dn_nblkptr = dn->dn_nblkptr;
+	}
+
+	if (dn->dn_dirtyblksz[txgoff]) {
+		ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff],
+		    SPA_MINBLOCKSIZE) == 0);
+		dnp->dn_datablkszsec =
+		    dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT;
+	}
+
+	if (dn->dn_next_indblkshift[txgoff]) {
+		ASSERT(dnp->dn_nlevels == 1);
+		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+		dn->dn_next_indblkshift[txgoff] = 0;
+	}
+
+	/*
+	 * Just take the live (open-context) values for checksum and compress.
+	 * Strictly speaking it's a future leak, but nothing bad happens if we
+	 * start using the new checksum or compress algorithm a little early.
+	 */
+	dnp->dn_checksum = dn->dn_checksum;
+	dnp->dn_compress = dn->dn_compress;
+
+	mutex_exit(&dn->dn_mtx);
+
+	/* process all the "freed" ranges in the file */
+	if (dn->dn_free_txg == 0 || dn->dn_free_txg > tx->tx_txg) {
+		for (rp = avl_first(&dn->dn_ranges[txgoff]); rp != NULL;
+		    rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp))
+			dnode_sync_free_range(dn,
+			    rp->fr_blkid, rp->fr_nblks, tx);
+	}
+	mutex_enter(&dn->dn_mtx);
+	for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+		free_range_t *last = rp;
+		rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
+		avl_remove(&dn->dn_ranges[txgoff], last);
+		kmem_free(last, sizeof (free_range_t));
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+		ASSERT3U(level, ==, 0);
+		return (dnode_sync_free(dn, tx));
+	}
+
+	if (dn->dn_next_nlevels[txgoff]) {
+		int new_lvl = dn->dn_next_nlevels[txgoff];
+
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		while (new_lvl > dnp->dn_nlevels)
+			dnode_increase_indirection(dn, tx);
+		rw_exit(&dn->dn_struct_rwlock);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
+	if (level == dnp->dn_nlevels) {
+		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+		/* we've already synced out all data and indirect blocks */
+		/* there are no more dirty dbufs under this dnode */
+		ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL);
+		ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg);
+
+		/* XXX this is expensive. remove once 6343073 is closed. */
+		/* NB: the "off < maxblkid" is to catch overflow */
+		/*
+		 * NB: if blocksize is changing, we could get confused,
+		 * so only bother if there are multiple blocks and thus
+		 * it can't be changing.
+		 */
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH);
+
+		dn->dn_dirtyblksz[txgoff] = 0;
+
+		/* Dirty dn_dbuf, then drop the hold tagged with this txg. */
+		if (!IS_DNODE_DNODE(dn->dn_object)) {
+			dbuf_will_dirty(dn->dn_dbuf, tx);
+			dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+		}
+
+		/*
+		 * Now that we've dropped the reference, the dnode may
+		 * be evicted, so we mustn't access it.
+		 */
+		return (1);
+	} else {
+		dmu_buf_impl_t *db, *db_next;
+		list_t *list = &dn->dn_dirty_dbufs[txgoff];
+		/*
+		 * Iterate over the list, removing and sync'ing dbufs
+		 * which are on the level we want, and leaving others.
+		 */
+		for (db = list_head(list); db; db = db_next) {
+			db_next = list_next(list, db);
+			if (db->db_level == level) {
+				list_remove(list, db);
+				dbuf_sync(db, zio, tx);
+			}
+		}
+		return (0);
+	}
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
new file mode 100644
index 000000000000..ab8dcfc3e3ef
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -0,0 +1,1463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+
+#define	DOS_REF_MAX	(1ULL << 62)	/* cap on the summed open weights */
+
+#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
+/* Uncompressed size: PSIZE for indirect/metadata blocks, otherwise LSIZE. */
+#define	BP_GET_UCSIZE(bp) \
+	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+/*
+ * We use weighted reference counts to express the various forms of exclusion
+ * between different open modes.  A STANDARD open is 1 point, an EXCLUSIVE open
+ * is DOS_REF_MAX, and a PRIMARY open is a little more than half an EXCLUSIVE.
+ * This makes the exclusion logic simple: the total refcnt for all opens cannot
+ * exceed DOS_REF_MAX.  For example, EXCLUSIVE opens are exclusive because their
+ * weight (DOS_REF_MAX) consumes the entire refcnt space.  PRIMARY opens consume
+ * just over half of the refcnt space, so there can't be more than one, but it
+ * can peacefully coexist with any number of STANDARD opens.
+ */
+static uint64_t ds_refcnt_weight[DS_MODE_LEVELS] = {
+	0,			/* DS_MODE_NONE - invalid		*/
+	1,			/* DS_MODE_STANDARD - unlimited number	*/
+	(DOS_REF_MAX >> 1) + 1,	/* DS_MODE_PRIMARY - only one of these	*/
+	DOS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
+};	/* indexed by DS_MODE_LEVEL(mode) */
+
+/* Charge newly born block bp to ds (to the MOS dsl_dir when ds is NULL). */
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	dprintf_bp(bp, "born, ds=%p\n", ds);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* It could have been compressed away to nothing */
+	if (BP_IS_HOLE(bp))
+		return;
+	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dsl_dir.
+		 */
+		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    used, compressed, uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_lock);	/* ds_lock covers the ds_phys counters */
+	ds->ds_phys->ds_used_bytes += used;
+	ds->ds_phys->ds_compressed_bytes += compressed;
+	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
+	ds->ds_phys->ds_unique_bytes += used;
+	mutex_exit(&ds->ds_lock);
+	dsl_dir_diduse_space(ds->ds_dir,
+	    used, compressed, uncompressed, tx);
+}
+
+void
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used = BP_GET_ASIZE(bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+	/* bp is dying: free it now or move it to ds's deadlist. */
+	ASSERT(dmu_tx_is_syncing(tx));
+	if (BP_IS_HOLE(bp))
+		return;
+
+	ASSERT(used > 0);
+	if (ds == NULL) {
+		/*
+		 * Account for the meta-objset space in its placeholder
+		 * dsl_dir.
+		 */
+		/* XXX this can fail, what do we do when it does? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+		bzero(bp, sizeof (blkptr_t));
+
+		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
+		    -used, -compressed, -uncompressed, tx);
+		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		return;
+	}
+	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+		dprintf_bp(bp, "freeing: %s", "");
+		/* XXX check return code? */
+		(void) arc_free(NULL, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+
+		mutex_enter(&ds->ds_lock);
+		/* XXX unique_bytes is not accurate for head datasets */
+		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+		ds->ds_phys->ds_unique_bytes -= used;
+		mutex_exit(&ds->ds_lock);
+		dsl_dir_diduse_space(ds->ds_dir,
+		    -used, -compressed, -uncompressed, tx);
+	} else {
+		dprintf_bp(bp, "putting on dead list: %s", "");
+		bplist_enqueue(&ds->ds_deadlist, bp, tx);
+		/* born after prev's own prev snap: now unique to prev */
+		if (ds->ds_phys->ds_prev_snap_obj != 0) {
+			ASSERT3U(ds->ds_prev->ds_object, ==,
+			    ds->ds_phys->ds_prev_snap_obj);
+			ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+			if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+			    ds->ds_object &&
+			    bp->blk_birth >
+			    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+				dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+				mutex_enter(&ds->ds_prev->ds_lock);
+				ds->ds_prev->ds_phys->ds_unique_bytes +=
+				    used;
+				mutex_exit(&ds->ds_prev->ds_lock);
+			}
+		}
+	}
+	bzero(bp, sizeof (blkptr_t));	/* the caller's bp is zeroed */
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
+	ds->ds_phys->ds_used_bytes -= used;
+	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
+	ds->ds_phys->ds_compressed_bytes -= compressed;
+	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
+	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	mutex_exit(&ds->ds_lock);
+}
+
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
+{
+	dsl_dir_t *dir;
+	uint64_t cutoff_txg;
+
+	/* A NULL ds always reports the block as freeable. */
+	if (ds == NULL)
+		return (TRUE);
+
+	/*
+	 * This is only a guess.  If a snapshot sync task is pending for an
+	 * earlier txg, use its txg as the cutoff; should that snapshot fail
+	 * we merely answer FALSE wrongly and overestimate the space an
+	 * operation will consume, which is OK.  A snapshot whose sync task
+	 * is set during the quiescing phase can be missed altogether.
+	 */
+	dir = ds->ds_dir;
+	mutex_enter(&dir->dd_lock);
+	cutoff_txg = ds->ds_phys->ds_prev_snap_txg;
+	if (dir->dd_sync_func == dsl_dataset_snapshot_sync &&
+	    dir->dd_sync_txg < tx->tx_txg)
+		cutoff_txg = dir->dd_sync_txg;
+	mutex_exit(&dir->dd_lock);
+
+	/* Freeable only if born after the latest (or pending) snapshot. */
+	return (blk_birth > cutoff_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+{
+	dsl_dataset_t *ds = dsv;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* open_refcount == DOS_REF_MAX when deleting */
+	ASSERT(ds->ds_open_refcount == 0 ||
+	    ds->ds_open_refcount == DOS_REF_MAX);
+
+	dprintf_ds(ds, "evicting %s\n", "");
+	/* Tear down everything dsl_dataset_open_obj() set up for ds. */
+	unique_remove(ds->ds_phys->ds_fsid_guid);
+
+	if (ds->ds_user_ptr != NULL)
+		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
+
+	if (ds->ds_prev) {
+		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+		ds->ds_prev = NULL;
+	}
+
+	bplist_close(&ds->ds_deadlist);
+	dsl_dir_close(ds->ds_dir, ds);
+
+	if (list_link_active(&ds->ds_synced_link))
+		list_remove(&dp->dp_synced_objsets, ds);
+
+	kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+static void
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *headphys;
+	int err;
+	dmu_buf_t *headdbuf;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	/* Fill in ds_snapname by searching the head's snapnames ZAP. */
+	if (ds->ds_snapname[0])
+		return;		/* name already filled in */
+	if (ds->ds_phys->ds_next_snap_obj == 0)
+		return;		/* no next snap: presumably a head dataset */
+
+	headdbuf = dmu_bonus_hold_tag(mos,
+	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
+	dmu_buf_read(headdbuf);
+	headphys = headdbuf->db_data;
+	err = zap_value_search(dp->dp_meta_objset,
+	    headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
+	ASSERT(err == 0);
+	dmu_buf_rele_tag(headdbuf, FTAG);
+}
+
+dsl_dataset_t *
+dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
+    int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_t *ds;
+	/* Open dataset dsobj in 'mode'; NULL means the open was refused. */
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
+	dmu_buf_read(dbuf);
+	ds = dmu_buf_get_user(dbuf);
+	if (ds == NULL) {
+		dsl_dataset_t *winner;
+
+		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+		ds->ds_dbuf = dbuf;
+		ds->ds_object = dsobj;
+		ds->ds_phys = dbuf->db_data;
+		ds->ds_dir = dsl_dir_open_obj(dp,
+		    ds->ds_phys->ds_dir_obj, NULL, ds);
+
+		bplist_open(&ds->ds_deadlist,
+		    mos, ds->ds_phys->ds_deadlist_obj);
+
+		if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
+			ds->ds_snapname[0] = '\0';
+			if (ds->ds_phys->ds_prev_snap_obj) {
+				ds->ds_prev =
+				    dsl_dataset_open_obj(dp,
+				    ds->ds_phys->ds_prev_snap_obj, NULL,
+				    DS_MODE_NONE, ds);
+			}
+		} else {
+			if (snapname) {
+#ifdef ZFS_DEBUG
+				dsl_dataset_phys_t *headphys;
+				int err;
+				dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
+				    ds->ds_dir->dd_phys->
+				    dd_head_dataset_obj, FTAG);
+				dmu_buf_read(headdbuf);
+				headphys = headdbuf->db_data;
+				uint64_t foundobj;
+				err = zap_lookup(dp->dp_meta_objset,
+				    headphys->ds_snapnames_zapobj,
+				    snapname, sizeof (foundobj), 1, &foundobj);
+				ASSERT3U(err, ==, 0);
+				ASSERT3U(foundobj, ==, dsobj);
+				dmu_buf_rele_tag(headdbuf, FTAG);
+#endif
+				(void) strcat(ds->ds_snapname, snapname);
+			} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
+				dsl_dataset_get_snapname(ds);
+			}
+		}
+		/* Attach ds to dbuf; on a lost race use the winner's. */
+		winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+		    dsl_dataset_evict);
+		if (winner) {
+			bplist_close(&ds->ds_deadlist);
+			if (ds->ds_prev) {
+				dsl_dataset_close(ds->ds_prev,
+				    DS_MODE_NONE, ds);
+			}
+			dsl_dir_close(ds->ds_dir, ds);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			ds = winner;
+		} else {
+			uint64_t new =
+			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			if (new != ds->ds_phys->ds_fsid_guid) {
+				/* XXX it won't necessarily be synced... */
+				ds->ds_phys->ds_fsid_guid = new;
+			}
+		}
+	}
+	ASSERT3P(ds->ds_dbuf, ==, dbuf);
+	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+	/* Refuse opens that exceed DOS_REF_MAX or bypass a restore. */
+	mutex_enter(&ds->ds_lock);
+	if ((DS_MODE_LEVEL(mode) == DS_MODE_PRIMARY &&
+	    ds->ds_phys->ds_restoring && !DS_MODE_IS_RESTORE(mode)) ||
+	    (ds->ds_open_refcount + weight > DOS_REF_MAX)) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		return (NULL);
+	}
+	ds->ds_open_refcount += weight;
+	mutex_exit(&ds->ds_lock);
+
+	return (ds);
+}
+
+int
+dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+    void *tag, dsl_dataset_t **dsp)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	const char *tail;
+	uint64_t obj;
+	dsl_dataset_t *ds = NULL;
+	int err = 0;
+	/* Open dataset or snapshot 'name' into *dsp; returns 0 or an errno. */
+	dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
+	if (dd == NULL)
+		return (ENOENT);
+
+	dp = dd->dd_pool;
+	obj = dd->dd_phys->dd_head_dataset_obj;
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	if (obj == 0) {
+		/* A dataset with no associated objset */
+		err = ENOENT;
+		goto out;
+	}
+
+	if (tail != NULL) {
+		objset_t *mos = dp->dp_meta_objset;
+		/* Briefly open the head to find its snapnames zapobj. */
+		ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
+		obj = ds->ds_phys->ds_snapnames_zapobj;
+		dsl_dataset_close(ds, DS_MODE_NONE, tag);
+		ds = NULL;
+
+		if (tail[0] != '@') {
+			err = ENOENT;
+			goto out;
+		}
+		tail++;
+
+		/* Look for a snapshot; snapshots must be opened read-only. */
+		if (!DS_MODE_IS_READONLY(mode)) {
+			err = EROFS;
+			goto out;
+		}
+		dprintf("looking for snapshot '%s'\n", tail);
+		err = zap_lookup(mos, obj, tail, 8, 1, &obj);
+		if (err)
+			goto out;
+	}
+	ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
+	if (ds == NULL)
+		err = EBUSY;
+
+out:
+	rw_exit(&dp->dp_config_rwlock);
+	dsl_dir_close(dd, FTAG);
+
+	ASSERT3U((err == 0), ==, (ds != NULL));
+	/* ASSERT(ds == NULL || strcmp(name, ds->ds_name) == 0); */
+
+	*dsp = ds;
+	return (err);
+}
+
+int
+dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp)
+{	/* thin wrapper: dsl_dataset_open_spa() with a NULL spa */
+	return (dsl_dataset_open_spa(NULL, name, mode, tag, dsp));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+	int held;
+
+	if (ds == NULL) {
+		(void) strcpy(name, "mos");	/* NULL ds: the meta-objset */
+		return;
+	}
+
+	dsl_dir_name(ds->ds_dir, name);
+	dsl_dataset_get_snapname(ds);
+	if (ds->ds_snapname[0] == '\0')
+		return;
+	(void) strcat(name, "@");
+
+	/* ds_lock may already be held, e.g. by a dprintf_ds() caller. */
+	held = MUTEX_HELD(&ds->ds_lock);
+	if (!held)
+		mutex_enter(&ds->ds_lock);
+	(void) strcat(name, ds->ds_snapname);
+	if (!held)
+		mutex_exit(&ds->ds_lock);
+}
+
+void
+dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
+{
+	uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(ds->ds_open_refcount, >=, weight);
+	ds->ds_open_refcount -= weight;	/* give back this open's weight */
+	dprintf_ds(ds, "closing mode %u refcount now 0x%llx\n",
+	    mode, ds->ds_open_refcount);
+	mutex_exit(&ds->ds_lock);
+	/* Drop the dbuf hold taken for 'tag' by dsl_dataset_open_obj(). */
+	dmu_buf_rele_tag(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+	dsl_dir_t *dd;
+	/* Create the root dsl_dir (*ddobjp) and its head dataset + objset. */
+	dsl_dir_create_root(mos, ddobjp, tx);
+	dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
+	ASSERT(dd != NULL);
+	/* Allocate the dataset object and fill in its bonus-buffer state. */
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	dmu_buf_rele(dbuf);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+	/* Open the new dataset just long enough to create its objset. */
+	ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
+	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+}
+
+int
+dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+    const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx)
+{
+	int err;
+	dsl_pool_t *dp = pds->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *dd;
+	/* Create dsl_dir 'lastname' under pds with a new (or cloned) head. */
+	if (clone_parent != NULL) {
+		/*
+		 * You can't clone across pools.
+		 */
+		if (clone_parent->ds_dir->dd_pool != dp)
+			return (EXDEV);
+
+		/*
+		 * You can only clone snapshots, not the head datasets.
+		 */
+		if (clone_parent->ds_phys->ds_num_children == 0)
+			return (EINVAL);
+	}
+
+	ASSERT(lastname[0] != '@');
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	err = dsl_dir_create_sync(pds, lastname, tx);
+	if (err)
+		return (err);
+	dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
+	ASSERT(dd != NULL);
+
+	/* This is the point of no (unsuccessful) return */
+
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (clone_parent) {
+		dsphys->ds_prev_snap_obj = clone_parent->ds_object;
+		dsphys->ds_prev_snap_txg =
+		    clone_parent->ds_phys->ds_creation_txg;
+		dsphys->ds_used_bytes =
+		    clone_parent->ds_phys->ds_used_bytes;
+		dsphys->ds_compressed_bytes =
+		    clone_parent->ds_phys->ds_compressed_bytes;
+		dsphys->ds_uncompressed_bytes =
+		    clone_parent->ds_phys->ds_uncompressed_bytes;
+		dsphys->ds_bp = clone_parent->ds_phys->ds_bp;
+		/* The snapshot gains a child; the new dir records it. */
+		dmu_buf_will_dirty(clone_parent->ds_dbuf, tx);
+		clone_parent->ds_phys->ds_num_children++;
+
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
+	}
+	dmu_buf_rele(dbuf);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_close(dd, FTAG);
+
+	return (0);
+}
+
+/* Destroy the snapshot 'fs@snap', or the dsl_dir and head dataset 'fs'. */
+int
+dsl_dataset_destroy(const char *name)
+{
+	int err;
+	dsl_pool_t *dp;
+	dsl_dir_t *dd;
+	const char *tail;
+
+	dd = dsl_dir_open(name, FTAG, &tail);
+	if (dd == NULL)
+		return (ENOENT);
+
+	dp = dd->dd_pool;
+	if (tail != NULL) {
+		if (tail[0] != '@') {
+			dsl_dir_close(dd, FTAG);
+			return (ENOENT);
+		}
+		tail++;
+		/* Just blow away the snapshot */
+		do {	/* rerun while the sync task returns EAGAIN */
+			txg_wait_synced(dp, 0);
+			err = dsl_dir_sync_task(dd,
+			    dsl_dataset_destroy_sync, (void*)tail, 0);
+		} while (err == EAGAIN);
+		dsl_dir_close(dd, FTAG);
+	} else {
+		char buf[MAXNAMELEN];
+		char *cp;
+
+		dsl_dir_t *pds;
+		if (dd->dd_phys->dd_parent_obj == 0) {
+			dsl_dir_close(dd, FTAG);
+			return (EINVAL);	/* no parent dir to run from */
+		}
+		/*
+		 * Make sure it's not dirty before we destroy it.
+		 */
+		txg_wait_synced(dd->dd_pool, 0);
+		/*
+		 * Blow away the dsl_dir + head dataset.
+		 * dsl_dir_destroy_sync() will call
+		 * dsl_dataset_destroy_sync() to destroy the head dataset.
+		 */
+		rw_enter(&dp->dp_config_rwlock, RW_READER);
+		pds = dsl_dir_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_parent_obj, NULL, FTAG);
+		dsl_dir_close(dd, FTAG);
+		rw_exit(&dp->dp_config_rwlock);
+
+		(void) strcpy(buf, name);
+		cp = strrchr(buf, '/') + 1;	/* last component of name */
+		ASSERT(cp[0] != '\0');
+		do {	/* rerun while the sync task returns EAGAIN */
+			txg_wait_synced(dp, 0);
+			err = dsl_dir_sync_task(pds,
+			    dsl_dir_destroy_sync, cp, 0);
+		} while (err == EAGAIN);
+		dsl_dir_close(pds, FTAG);
+	}
+
+	return (err);
+}
+
+int
+dsl_dataset_rollback(const char *name)
+{
+	const char *tail;
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(name, FTAG, &tail);
+	if (dd == NULL)
+		return (ENOENT);
+
+	if (tail == NULL) {
+		/* Rerun the sync task for as long as it returns EAGAIN. */
+		do {
+			txg_wait_synced(dd->dd_pool, 0);
+			err = dsl_dir_sync_task(dd,
+			    dsl_dataset_rollback_sync, NULL, 0);
+		} while (err == EAGAIN);
+	} else {
+		err = EINVAL;	/* leftover name component, e.g. "@snap" */
+	}
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+void *
+dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+    void *p, dsl_dataset_evict_func_t func)
+{
+	void *prev;
+
+	/* First caller wins: install p/func only if nothing is set yet. */
+	mutex_enter(&ds->ds_lock);
+	if ((prev = ds->ds_user_ptr) == NULL) {
+		ds->ds_user_ptr = p;
+		ds->ds_user_evict_func = func;
+	}
+	mutex_exit(&ds->ds_lock);
+	return (prev);
+}
+
+void *
+dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
+{	/* reads ds_user_ptr without taking ds_lock */
+	return (ds->ds_user_ptr);
+}
+
+/* Copy the dataset's current root block pointer (ds_bp) into *bp. */
+void
+dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
+{
+	*bp = ds->ds_phys->ds_bp;
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	if (ds != NULL) {
+		/* Ordinary dataset: dirty its bonus buffer, then store bp. */
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_bp = *bp;
+		return;
+	}
+	tx->tx_pool->dp_meta_rootbp = *bp;	/* NULL ds is the meta-objset */
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{	/* the spa of the pool that ds's dsl_dir belongs to */
+	return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	/* The meta-objset (NULL ds) is not put on the dirty list. */
+	if (ds == NULL)
+		return;
+
+	ASSERT(ds->ds_user_ptr != NULL);
+	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+	/* Nonzero presumably means ds was already listed for this txg. */
+	if (txg_list_add(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+	    ds, tx->tx_txg) != 0)
+		return;
+
+	/* Newly listed: up the hold count until we can be written out. */
+	dmu_buf_add_ref(ds->ds_dbuf, ds);
+}
+
+struct killarg {	/* kill_blkptr() callback state */
+	uint64_t *usedp;		/* += BP_GET_ASIZE of each bp */
+	uint64_t *compressedp;		/* += BP_GET_PSIZE of each bp */
+	uint64_t *uncompressedp;	/* += BP_GET_UCSIZE of each bp */
+	zio_t *zio;			/* zio handed to arc_free() */
+	dmu_tx_t *tx;			/* its tx_txg goes to arc_free() */
+};
+
+static int
+kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+{
+	blkptr_t *victim = &bc->bc_blkptr;
+	struct killarg *totals = arg;
+
+	ASSERT3U(bc->bc_errno, ==, 0);
+
+	/* Not called concurrently, so the totals need no locking. */
+	*totals->usedp += BP_GET_ASIZE(victim);
+	*totals->compressedp += BP_GET_PSIZE(victim);
+	*totals->uncompressedp += BP_GET_UCSIZE(victim);
+	/*
+	 * Issue the free under the caller's zio without waiting for it.
+	 * XXX check for EIO?
+	 */
+	(void) arc_free(totals->zio, spa, totals->tx->tx_txg, victim,
+	    NULL, NULL, ARC_NOWAIT);
+	return (0);
+}
+
+/* ARGSUSED */
+int
+dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_dataset_t *ds;
+	/* Sync task: revert dd's head dataset to its previous snapshot. */
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		return (EINVAL);
+	ds = dsl_dataset_open_obj(dd->dd_pool,
+	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+
+	if (ds->ds_phys->ds_prev_snap_txg == 0) {
+		/*
+		 * There's no previous snapshot.  I suppose we could
+		 * roll it back to being empty (and re-initialize the
+		 * upper (ZPL) layer).  But for now there's no way to do
+		 * this via the user interface.
+		 */
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		return (EINVAL);
+	}
+
+	mutex_enter(&ds->ds_lock);
+	if (ds->ds_open_refcount > 0) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		return (EBUSY);
+	}
+
+	/*
+	 * If we made changes this txg, traverse_dsl_dataset won't find
+	 * them.  Try again.
+	 */
+	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		return (EAGAIN);
+	}
+
+	/* THE POINT OF NO (unsuccessful) RETURN */
+	ds->ds_open_refcount = DOS_REF_MAX;	/* refuse any weighted open */
+	mutex_exit(&ds->ds_lock);
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	/* Zero out the deadlist. */
+	dprintf("old deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
+	bplist_close(&ds->ds_deadlist);
+	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+	ds->ds_phys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
+
+	{
+		/* Free blkptrs that we gave birth to */
+		zio_t *zio;
+		uint64_t used = 0, compressed = 0, uncompressed = 0;
+		struct killarg ka;
+		/* One root zio, so all the frees are waited on together. */
+		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
+		    ZIO_FLAG_MUSTSUCCEED);
+		ka.usedp = &used;
+		ka.compressedp = &compressed;
+		ka.uncompressedp = &uncompressed;
+		ka.zio = zio;
+		ka.tx = tx;
+		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+		    ADVANCE_POST, kill_blkptr, &ka);
+		(void) zio_wait(zio);
+
+		dsl_dir_diduse_space(dd,
+		    -used, -compressed, -uncompressed, tx);
+	}
+
+	/* Change our contents to that of the prev snapshot (finally!) */
+	ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj);
+	ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
+	ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes;
+	ds->ds_phys->ds_compressed_bytes =
+	    ds->ds_prev->ds_phys->ds_compressed_bytes;
+	ds->ds_phys->ds_uncompressed_bytes =
+	    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
+	ds->ds_phys->ds_restoring = ds->ds_prev->ds_phys->ds_restoring;
+	ds->ds_phys->ds_unique_bytes = 0;
+
+	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+	ds->ds_prev->ds_phys->ds_unique_bytes = 0;
+
+	dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
+	ds->ds_open_refcount = 0;	/* undo the DOS_REF_MAX set above */
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+
+	return (0);
+}
+
+int
+dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	const char *snapname = arg;
+	uint64_t used = 0, compressed = 0, uncompressed = 0;
+	blkptr_t bp;
+	zio_t *zio;
+	int err;
+	int after_branch_point = FALSE;
+	int drop_lock = FALSE;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_t *ds, *ds_prev = NULL;
+	uint64_t obj;
+
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		return (EINVAL);
+
+	if (!RW_WRITE_HELD(&dp->dp_config_rwlock)) {
+		rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+		drop_lock = TRUE;
+	}
+
+	ds = dsl_dataset_open_obj(dd->dd_pool,
+	    dd->dd_phys->dd_head_dataset_obj, NULL,
+	    snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
+
+	if (snapname) {
+		err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+		    snapname, 8, 1, &obj);
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		if (err) {
+			if (drop_lock)
+				rw_exit(&dp->dp_config_rwlock);
+			return (err);
+		}
+
+		ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
+		    DS_MODE_EXCLUSIVE, FTAG);
+	}
+	if (ds == NULL) {
+		if (drop_lock)
+			rw_exit(&dp->dp_config_rwlock);
+		return (EBUSY);
+	}
+
+	obj = ds->ds_object;
+
+	/* Can't delete a branch point. */
+	if (ds->ds_phys->ds_num_children > 1) {
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+		if (drop_lock)
+			rw_exit(&dp->dp_config_rwlock);
+		return (EINVAL);
+	}
+
+	/*
+	 * Can't delete a head dataset if there are snapshots of it.
+	 * (Except if the only snapshots are from the branch we cloned
+	 * from.)
+	 */
+	if (ds->ds_prev != NULL &&
+	    ds->ds_prev->ds_phys->ds_next_snap_obj == obj) {
+		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+		if (drop_lock)
+			rw_exit(&dp->dp_config_rwlock);
+		return (EINVAL);
+	}
+
+	/*
+	 * If we made changes this txg, traverse_dsl_dataset won't find
+	 * them.  Try again.
+	 */
+	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
+		mutex_exit(&ds->ds_lock);
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		return (EAGAIN);
+	}
+
+	/* THE POINT OF NO (unsuccessful) RETURN */
+
+	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+		if (ds->ds_prev) {
+			ds_prev = ds->ds_prev;
+		} else {
+			ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+			    ds->ds_phys->ds_prev_snap_obj, NULL,
+			    DS_MODE_NONE, FTAG);
+		}
+		after_branch_point =
+		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
+
+		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+		if (after_branch_point &&
+		    ds->ds_phys->ds_next_snap_obj == 0) {
+			/* This clone is toast. */
+			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
+			ds_prev->ds_phys->ds_num_children--;
+		} else if (!after_branch_point) {
+			ds_prev->ds_phys->ds_next_snap_obj =
+			    ds->ds_phys->ds_next_snap_obj;
+		}
+	}
+
+	ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
+	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+
+	if (ds->ds_phys->ds_next_snap_obj != 0) {
+		dsl_dataset_t *ds_next;
+		uint64_t itor = 0;
+
+		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+		ds_next = dsl_dataset_open_obj(dd->dd_pool,
+		    ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
+		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+
+		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+		ds_next->ds_phys->ds_prev_snap_obj =
+		    ds->ds_phys->ds_prev_snap_obj;
+		ds_next->ds_phys->ds_prev_snap_txg =
+		    ds->ds_phys->ds_prev_snap_txg;
+		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+
+		/*
+		 * Transfer to our deadlist (which will become next's
+		 * new deadlist) any entries from next's current
+		 * deadlist which were born before prev, and free the
+		 * other entries.
+		 *
+		 * XXX we're doing this long task with the config lock held
+		 */
+		while (bplist_iterate(&ds_next->ds_deadlist, &itor,
+		    &bp) == 0) {
+			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
+				bplist_enqueue(&ds->ds_deadlist, &bp, tx);
+				if (ds_prev && !after_branch_point &&
+				    bp.blk_birth >
+				    ds_prev->ds_phys->ds_prev_snap_txg) {
+					ds_prev->ds_phys->ds_unique_bytes +=
+					    BP_GET_ASIZE(&bp);
+				}
+			} else {
+				used += BP_GET_ASIZE(&bp);
+				compressed += BP_GET_PSIZE(&bp);
+				uncompressed += BP_GET_UCSIZE(&bp);
+				/* XXX check return value? */
+				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+				    &bp, NULL, NULL, ARC_NOWAIT);
+			}
+		}
+
+		/* free next's deadlist */
+		bplist_close(&ds_next->ds_deadlist);
+		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
+
+		/* set next's deadlist to our deadlist */
+		ds_next->ds_phys->ds_deadlist_obj =
+		    ds->ds_phys->ds_deadlist_obj;
+		bplist_open(&ds_next->ds_deadlist, mos,
+		    ds_next->ds_phys->ds_deadlist_obj);
+		ds->ds_phys->ds_deadlist_obj = 0;
+
+		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
+			/*
+			 * Update next's unique to include blocks which
+			 * were previously shared by only this snapshot
+			 * and it.  Those blocks will be born after the
+			 * prev snap and before this snap, and will have
+			 * died after the next snap and before the one
+			 * after that (ie. be on the snap after next's
+			 * deadlist).
+			 *
+			 * XXX we're doing this long task with the
+			 * config lock held
+			 */
+			dsl_dataset_t *ds_after_next;
+
+			ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
+			    ds_next->ds_phys->ds_next_snap_obj, NULL,
+			    DS_MODE_NONE, FTAG);
+			itor = 0;
+			while (bplist_iterate(&ds_after_next->ds_deadlist,
+			    &itor, &bp) == 0) {
+				if (bp.blk_birth >
+				    ds->ds_phys->ds_prev_snap_txg &&
+				    bp.blk_birth <=
+				    ds->ds_phys->ds_creation_txg) {
+					ds_next->ds_phys->ds_unique_bytes +=
+					    BP_GET_ASIZE(&bp);
+				}
+			}
+
+			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
+			ASSERT3P(ds_next->ds_prev, ==, NULL);
+		} else {
+			/*
+			 * It would be nice to update the head dataset's
+			 * unique.  To do so we would have to traverse
+			 * it for blocks born after ds_prev, which is
+			 * pretty expensive just to maintain something
+			 * for debugging purposes.
+			 */
+			ASSERT3P(ds_next->ds_prev, ==, ds);
+			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
+			    ds_next);
+			if (ds_prev) {
+				ds_next->ds_prev = dsl_dataset_open_obj(
+				    dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
+				    NULL, DS_MODE_NONE, ds_next);
+			} else {
+				ds_next->ds_prev = NULL;
+			}
+		}
+		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
+
+		/*
+		 * NB: unique_bytes is not accurate for head objsets
+		 * because we don't update it when we delete the most
+		 * recent snapshot -- see above comment.
+		 */
+		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
+	} else {
+		/*
+		 * There's no next snapshot, so this is a head dataset.
+		 * Destroy the deadlist.  Unless it's a clone, the
+		 * deadlist should be empty.  (If it's a clone, it's
+		 * safe to ignore the deadlist contents.)
+		 */
+		struct killarg ka;
+
+		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
+		bplist_close(&ds->ds_deadlist);
+		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+		ds->ds_phys->ds_deadlist_obj = 0;
+
+		/*
+		 * Free everything that we point to (that's born after
+		 * the previous snapshot, if we are a clone)
+		 *
+		 * XXX we're doing this long task with the config lock held
+		 */
+		ka.usedp = &used;
+		ka.compressedp = &compressed;
+		ka.uncompressedp = &uncompressed;
+		ka.zio = zio;
+		ka.tx = tx;
+		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+		    ADVANCE_POST, kill_blkptr, &ka);
+		ASSERT3U(err, ==, 0);
+	}
+
+	err = zio_wait(zio);
+	ASSERT3U(err, ==, 0);
+
+	dsl_dir_diduse_space(dd, -used, -compressed, -uncompressed, tx);
+
+	if (ds->ds_phys->ds_snapnames_zapobj) {
+		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
+		ASSERT(err == 0);
+	}
+
+	if (dd->dd_phys->dd_head_dataset_obj == ds->ds_object) {
+		/* Erase the link in the dataset */
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dd->dd_phys->dd_head_dataset_obj = 0;
+		/*
+		 * dsl_dir_sync_destroy() called us, they'll destroy
+		 * the dataset.
+		 */
+	} else {
+		/* remove from snapshot namespace */
+		dsl_dataset_t *ds_head;
+		ds_head = dsl_dataset_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+#ifdef ZFS_DEBUG
+		{
+			uint64_t val;
+			err = zap_lookup(mos,
+			    ds_head->ds_phys->ds_snapnames_zapobj,
+			    snapname, 8, 1, &val);
+			ASSERT3U(err, ==, 0);
+			ASSERT3U(val, ==, obj);
+		}
+#endif
+		err = zap_remove(mos, ds_head->ds_phys->ds_snapnames_zapobj,
+		    snapname, tx);
+		ASSERT(err == 0);
+		dsl_dataset_close(ds_head, DS_MODE_NONE, FTAG);
+	}
+
+	if (ds_prev && ds->ds_prev != ds_prev)
+		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+
+	err = dmu_object_free(mos, obj, tx);
+	ASSERT(err == 0);
+
+	/*
+	 * Close the objset with mode NONE, thus leaving it with
+	 * DOS_REF_MAX set, so that noone can access it.
+	 */
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+
+	if (drop_lock)
+		rw_exit(&dp->dp_config_rwlock);
+	return (0);
+}
+
+/*
+ * Sync-context function that takes a snapshot of dd's head dataset.
+ * 'arg' is the snapshot name, i.e. the key it is entered under in the
+ * head dataset's snapshot-name ZAP.
+ *
+ * A new dataset object is allocated and initialized as a copy of the
+ * head's current state; it is then linked in between the head and the
+ * head's previous snapshot (if any).  The head gets a new, empty
+ * deadlist and the snapshot takes over the old one.
+ *
+ * Returns EINVAL if dd has no head dataset, EEXIST if a snapshot with
+ * this name already exists, and 0 on success.
+ */
+int
+dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	const char *snapname = arg;
+	dsl_pool_t *dp = dd->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj, value;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_t *ds;
+	int err;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	if (dd->dd_phys->dd_head_dataset_obj == 0)
+		return (EINVAL);
+	ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
+	    DS_MODE_NONE, FTAG);
+
+	/* refuse to reuse an existing snapshot name */
+	err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
+	    snapname, 8, 1, &value);
+	if (err == 0) {
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		return (EEXIST);
+	}
+	ASSERT(err == ENOENT);
+
+	/* The point of no (unsuccessful) return */
+
+	dprintf_dd(dd, "taking snapshot %s in txg %llu\n",
+	    snapname, tx->tx_txg);
+
+	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	/*
+	 * Allocate the snapshot's dataset object and fill in its bonus
+	 * buffer.  It inherits the head's previous-snapshot link, space
+	 * accounting, deadlist object and root block pointer, and its
+	 * "next snapshot" is the head itself.
+	 */
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_OBJSET, 0,
+	    DMU_OT_DSL_OBJSET, sizeof (dsl_dataset_phys_t), tx);
+	dbuf = dmu_bonus_hold(mos, dsobj);
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
+	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+	dsphys->ds_next_snap_obj = ds->ds_object;
+	dsphys->ds_num_children = 1;
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
+	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
+	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
+	dsphys->ds_restoring = ds->ds_phys->ds_restoring;
+	dsphys->ds_bp = ds->ds_phys->ds_bp;
+	dmu_buf_rele(dbuf);
+
+	/*
+	 * If the head's previous snapshot pointed forward at the head,
+	 * repoint it at the new snapshot.  Otherwise (asserted below) the
+	 * previous snapshot has more than one child and its next pointer
+	 * is left alone.
+	 */
+	if (ds->ds_phys->ds_prev_snap_obj != 0) {
+		dsl_dataset_t *ds_prev;
+
+		ds_prev = dsl_dataset_open_obj(dp,
+		    ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
+		ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
+		    ds->ds_object ||
+		    ds_prev->ds_phys->ds_num_children > 1);
+		if (ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+			dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
+			    ds_prev->ds_phys->ds_creation_txg);
+			ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+		}
+		dsl_dataset_close(ds_prev, DS_MODE_NONE, FTAG);
+	} else {
+		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 0);
+	}
+
+	/*
+	 * Update the head: its previous snapshot is now the new one, it
+	 * has no unique bytes yet, and it starts a fresh deadlist (the
+	 * old deadlist object now belongs to the snapshot).
+	 */
+	bplist_close(&ds->ds_deadlist);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
+	ds->ds_phys->ds_prev_snap_obj = dsobj;
+	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
+	ds->ds_phys->ds_unique_bytes = 0;
+	ds->ds_phys->ds_deadlist_obj =
+	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+
+	/* enter the snapshot in the head's snapshot namespace */
+	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+	    snapname, 8, 1, &dsobj, tx);
+	ASSERT(err == 0);
+
+	/*
+	 * Swap the in-core ds_prev hold (tagged with ds) over to the new
+	 * snapshot.
+	 */
+	if (ds->ds_prev)
+		dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
+	ds->ds_prev = dsl_dataset_open_obj(dp,
+	    ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);
+
+	rw_exit(&dp->dp_config_rwlock);
+	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+
+	return (0);
+}
+
+/*
+ * Write out a dataset in syncing context.  The dataset must have an
+ * objset attached (ds_user_ptr) and must not have a next snapshot
+ * (both asserted).  Syncs the objset, dirties the containing dsl_dir
+ * for this txg, closes the deadlist, and drops a reference on ds_dbuf
+ * that is tagged with ds.
+ */
+void
+dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(ds->ds_user_ptr != NULL);
+	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+
+	dmu_objset_sync(ds->ds_user_ptr, tx);
+	dsl_dir_dirty(ds->ds_dir, tx);
+	bplist_close(&ds->ds_deadlist);
+
+	/*
+	 * NOTE(review): presumably releases the hold taken when this
+	 * dataset was marked dirty -- confirm against the dirtying code.
+	 */
+	dmu_buf_remove_ref(ds->ds_dbuf, ds);
+}
+
+/*
+ * Fill in *dds with statistics for dataset ds.  dsl_dir_stats() runs
+ * first (it zeroes *dds and fills in the dir-level values); the
+ * dataset-specific fields are then set, or overridden, from ds_phys.
+ */
+void
+dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
+{
+	/* start from the containing dir's stats and properties */
+	dsl_dir_stats(ds->ds_dir, dds);
+
+	/*
+	 * Snapshots are created with ds_num_children == 1 (see
+	 * dsl_dataset_snapshot_sync()), so a nonzero count marks a
+	 * snapshot and the excess over 1 is reported as its clone count.
+	 */
+	if (ds->ds_phys->ds_num_children != 0) {
+		dds->dds_is_snapshot = TRUE;
+		dds->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+	}
+
+	dds->dds_last_txg = ds->ds_phys->ds_bp.blk_birth;
+
+	dds->dds_objects_used = ds->ds_phys->ds_bp.blk_fill;
+	dds->dds_objects_avail = DN_MAX_OBJECT - dds->dds_objects_used;
+
+	/* We override the dataset's creation time... they should be the same */
+	dds->dds_creation_time = ds->ds_phys->ds_creation_time;
+	dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
+	dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
+	dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
+	dds->dds_guid = ds->ds_phys->ds_guid;
+
+	if (ds->ds_phys->ds_next_snap_obj) {
+		/*
+		 * This is a snapshot; override the dd's space used with
+		 * our unique space
+		 */
+		dds->dds_space_used = ds->ds_phys->ds_unique_bytes;
+		dds->dds_compressed_bytes =
+		    ds->ds_phys->ds_compressed_bytes;
+		dds->dds_uncompressed_bytes =
+		    ds->ds_phys->ds_uncompressed_bytes;
+	}
+
+	dds->dds_objset_obj = ds->ds_object;
+}
+
+/*
+ * Return the pool that ds belongs to, as recorded in its dsl_dir.
+ */
+dsl_pool_t *
+dsl_dataset_pool(dsl_dataset_t *ds)
+{
+	return (ds->ds_dir->dd_pool);
+}
+
+/*
+ * Argument bundle passed from dsl_dataset_rename() to
+ * dsl_dataset_snapshot_rename_sync().
+ */
+struct osrenamearg {
+	const char *oldname;	/* full current name of the snapshot */
+	const char *newname;	/* full requested name */
+};
+
+/*
+ * Sync task (dispatched by dsl_dataset_rename()) that renames a
+ * snapshot.  'arg' is a struct osrenamearg holding the full old and new
+ * names.
+ *
+ * The old name must be a snapshot whose dsl_dir is 'dd'; the new name
+ * must resolve to the same dsl_dir, be of the form "...@snap", and not
+ * already be present in the head dataset's snapshot-name ZAP.
+ *
+ * Returns 0 on success; the error from opening the old name; EINVAL if
+ * the old name is not a snapshot in dd, or the new name is in another
+ * dir or is not a snapshot name; ENOENT if the new name's dir cannot be
+ * opened; EEXIST if the new snapshot name is already in use; or the
+ * zap_lookup() error if the in-use check itself fails.
+ */
+static int
+dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct osrenamearg *ora = arg;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_dir_t *nds;
+	const char *tail;
+	int err;
+	dsl_dataset_t *snds, *fsds;
+	uint64_t val;
+
+	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, ora->oldname,
+	    DS_MODE_READONLY | DS_MODE_STANDARD, FTAG, &snds);
+	if (err)
+		return (err);
+
+	if (snds->ds_dir != dd) {
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (EINVAL);
+	}
+
+	/* better be changing a snapshot */
+	if (snds->ds_phys->ds_next_snap_obj == 0) {
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (EINVAL);
+	}
+
+	/* new fs better exist */
+	nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
+	if (nds == NULL) {
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (ENOENT);
+	}
+
+	/*
+	 * We only need nds for the identity comparison below; dd itself
+	 * stays held by our caller, so the hold can be dropped right away.
+	 */
+	dsl_dir_close(nds, FTAG);
+
+	/* new name better be in same fs */
+	if (nds != dd) {
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (EINVAL);
+	}
+
+	/* new name better be a snapshot */
+	if (tail == NULL || tail[0] != '@') {
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (EINVAL);
+	}
+
+	/* skip the '@'; ZAP keys are bare snapshot names */
+	tail++;
+
+	fsds = dsl_dataset_open_obj(dd->dd_pool,
+	    dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+
+	/* new name better not be in use */
+	err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
+	    tail, 8, 1, &val);
+	if (err != ENOENT) {
+		/*
+		 * A successful lookup means the name is taken; any other
+		 * error is a real lookup failure and is reported as-is
+		 * rather than being disguised as EEXIST.
+		 */
+		if (err == 0)
+			err = EEXIST;
+		dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+		dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+		return (err);
+	}
+
+	/* The point of no (unsuccessful) return */
+
+	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
+	/* make sure ds_snapname holds the current name before removing it */
+	dsl_dataset_get_snapname(snds);
+	err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
+	    snds->ds_snapname, tx);
+	ASSERT3U(err, ==, 0);
+	mutex_enter(&snds->ds_lock);
+	(void) strcpy(snds->ds_snapname, tail);
+	mutex_exit(&snds->ds_lock);
+	err = zap_add(mos, fsds->ds_phys->ds_snapnames_zapobj,
+	    snds->ds_snapname, 8, 1, &snds->ds_object, tx);
+	ASSERT3U(err, ==, 0);
+	rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+	dsl_dataset_close(fsds, DS_MODE_NONE, FTAG);
+	dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+	return (0);
+}
+
+/*
+ * Rename the object set 'osname' to 'newname'.  Also exported under the
+ * weak alias dmu_objset_rename.
+ *
+ * If osname names a dsl_dir outright, the rename is handed to
+ * dsl_dir_rename_sync(); if it names a snapshot ("...@snap"), it is
+ * handed to dsl_dataset_snapshot_rename_sync().  Returns ENOENT if the
+ * name cannot be opened or ends in a component that does not exist,
+ * otherwise the result of the sync task.
+ */
+#pragma weak dmu_objset_rename = dsl_dataset_rename
+int
+dsl_dataset_rename(const char *osname, const char *newname)
+{
+	struct osrenamearg ora;
+	const char *leftover;
+	dsl_dir_t *dd;
+	int err;
+
+	dd = dsl_dir_open(osname, FTAG, &leftover);
+	if (dd == NULL)
+		return (ENOENT);
+
+	if (leftover == NULL) {
+		/* the whole name resolved to a dir: rename the dir */
+		err = dsl_dir_sync_task(dd,
+		    dsl_dir_rename_sync, (void *)newname, 1<<12);
+	} else if (leftover[0] == '@') {
+		/* trailing "@snap": rename the snapshot */
+		ora.oldname = osname;
+		ora.newname = newname;
+		err = dsl_dir_sync_task(dd,
+		    dsl_dataset_snapshot_rename_sync, &ora, 1<<12);
+	} else {
+		/* the name ended in a nonexistent component */
+		err = ENOENT;
+	}
+
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
new file mode 100644
index 000000000000..3b0d32de7045
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -0,0 +1,1217 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Forward declarations of file-local (static) helpers that are used
+ * before their definitions.
+ */
+static uint64_t dsl_dir_space_accounted(dsl_dir_t *dd);
+static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static int dsl_dir_set_reservation_sync(dsl_dir_t *dd,
+    void *arg, dmu_tx_t *tx);
+static uint64_t dsl_dir_space_available(dsl_dir_t *dd,
+    dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
+
+
+/*
+ * Eviction callback registered on a dsl_dir's bonus buffer by
+ * dsl_dir_open_obj().  'arg' is the in-core dsl_dir_t.  After checking
+ * that nothing is outstanding against the dir, drop the holds it took
+ * when it was instantiated (on its parent and on the spa) and free it.
+ */
+/* ARGSUSED */
+static void
+dsl_dir_evict(dmu_buf_t *db, void *arg)
+{
+	dsl_dir_t *dd = arg;
+	dsl_pool_t *dp = dd->dd_pool;
+	int txgidx;
+
+	/* nothing dirty, reserved or pending in any txg slot */
+	for (txgidx = 0; txgidx < TXG_SIZE; txgidx++) {
+		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, txgidx));
+		ASSERT(dd->dd_tempreserved[txgidx] == 0);
+		ASSERT(dd->dd_space_towrite[txgidx] == 0);
+	}
+
+	/* in-core accounting matches what is on disk; no sync task queued */
+	ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes);
+	ASSERT(dd->dd_sync_txg == 0);
+
+	/* release the instantiate-to-evict holds, both tagged with dd */
+	if (dd->dd_parent != NULL)
+		dsl_dir_close(dd->dd_parent, dd);
+	spa_close(dp->dp_spa, dd);
+
+	/*
+	 * Property callbacks keep the dir open, so by the time we are
+	 * evicted their list should be empty.
+	 */
+	list_destroy(&dd->dd_prop_cbs);
+	kmem_free(dd, sizeof (dsl_dir_t));
+}
+
+/*
+ * Open the dsl_dir stored in MOS object ddobj and return its in-core
+ * dsl_dir_t, instantiating it if no one has yet.
+ *
+ * The caller must hold dp_config_rwlock or be in sync context
+ * (asserted).  'tail', if non-NULL, is this dir's own name; it is
+ * copied into dd_myname when the dsl_dir_t is first instantiated
+ * (otherwise the name is found by searching the parent's child ZAP for
+ * ddobj).  'tag' identifies the hold; release it with
+ * dsl_dir_close(dd, tag).
+ */
+dsl_dir_t *
+dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+    const char *tail, void *tag)
+{
+	dmu_buf_t *dbuf;
+	dsl_dir_t *dd;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+
+	/* the in-core dsl_dir_t, if any, hangs off the bonus buffer */
+	dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
+	dmu_buf_read(dbuf);
+	dd = dmu_buf_get_user(dbuf);
+#ifdef ZFS_DEBUG
+	{
+		dmu_object_info_t doi;
+		dmu_object_info_from_db(dbuf, &doi);
+		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DATASET);
+	}
+#endif
+	/* XXX assert bonus buffer size is correct */
+	if (dd == NULL) {
+		dsl_dir_t *winner;
+		int err;
+
+		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+		dd->dd_object = ddobj;
+		dd->dd_dbuf = dbuf;
+		dd->dd_pool = dp;
+		dd->dd_phys = dbuf->db_data;
+		dd->dd_used_bytes = dd->dd_phys->dd_used_bytes;
+
+		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
+		    offsetof(dsl_prop_cb_record_t, cbr_node));
+
+		if (dd->dd_phys->dd_parent_obj) {
+			/* hold the parent for as long as this dd exists */
+			dd->dd_parent = dsl_dir_open_obj(dp,
+			    dd->dd_phys->dd_parent_obj, NULL, dd);
+			if (tail) {
+#ifdef ZFS_DEBUG
+				uint64_t foundobj;
+
+				/* verify the caller-supplied name */
+				err = zap_lookup(dp->dp_meta_objset,
+				    dd->dd_parent->dd_phys->
+				    dd_child_dir_zapobj,
+				    tail, sizeof (foundobj), 1, &foundobj);
+				ASSERT3U(err, ==, 0);
+				ASSERT3U(foundobj, ==, ddobj);
+#endif
+				(void) strcpy(dd->dd_myname, tail);
+			} else {
+				/* no name given: reverse-lookup by object */
+				err = zap_value_search(dp->dp_meta_objset,
+				    dd->dd_parent->dd_phys->
+				    dd_child_dir_zapobj,
+				    ddobj, dd->dd_myname);
+				/*
+				 * The caller should be protecting this ddobj
+				 * from being deleted concurrently
+				 */
+				ASSERT(err == 0);
+			}
+		} else {
+			/* the root dir is named after the pool */
+			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
+		}
+
+		/*
+		 * Try to attach our dsl_dir_t to the dbuf.  A non-NULL
+		 * return is a dsl_dir_t that someone else attached first:
+		 * throw ours away and use theirs.
+		 */
+		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
+		    dsl_dir_evict);
+		if (winner) {
+			if (dd->dd_parent)
+				dsl_dir_close(dd->dd_parent, dd);
+			kmem_free(dd, sizeof (dsl_dir_t));
+			dd = winner;
+		} else {
+			/* instantiate-to-evict hold; see dsl_dir_evict() */
+			spa_open_ref(dp->dp_spa, dd);
+		}
+	}
+
+	/*
+	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
+	 * holds on the spa.  We need the open-to-close holds because
+	 * otherwise the spa_refcnt wouldn't change when we open a
+	 * dir which the spa also has open, so we could incorrectly
+	 * think it was OK to unload/export/destroy the pool.  We need
+	 * the instantiate-to-evict hold because the dsl_dir_t has a
+	 * pointer to the dd_pool, which has a pointer to the spa_t.
+	 */
+	spa_open_ref(dp->dp_spa, tag);
+	ASSERT3P(dd->dd_pool, ==, dp);
+	ASSERT3U(dd->dd_object, ==, ddobj);
+	ASSERT3P(dd->dd_dbuf, ==, dbuf);
+	return (dd);
+}
+
+/*
+ * Release a hold on dd obtained with the same tag: drops the spa
+ * reference and the bonus-buffer hold that dsl_dir_open_obj() took.
+ */
+void
+dsl_dir_close(dsl_dir_t *dd, void *tag)
+{
+	dprintf_dd(dd, "%s\n", "");
+	spa_close(dd->dd_pool->dp_spa, tag);
+	dmu_buf_rele_tag(dd->dd_dbuf, tag);
+}
+
+/*
+ * Write dd's full name into buf by recursing up to the root dir and
+ * appending each component, separated by '/', on the way back down.
+ * buf must be long enough (MAXNAMELEN should do).
+ */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+	int lock_here;
+
+	if (dd->dd_parent == NULL) {
+		buf[0] = '\0';
+	} else {
+		dsl_dir_name(dd->dd_parent, buf);
+		(void) strcat(buf, "/");
+	}
+
+	/*
+	 * Behave like a recursive mutex so that dprintf_dd() can be
+	 * used with dd_lock held: only take the lock if this thread
+	 * does not already own it.
+	 */
+	lock_here = !MUTEX_HELD(&dd->dd_lock);
+	if (lock_here)
+		mutex_enter(&dd->dd_lock);
+	(void) strcat(buf, dd->dd_myname);
+	if (lock_here)
+		mutex_exit(&dd->dd_lock);
+}
+
+/*
+ * Return TRUE if dd, or any of its ancestors, has a name that
+ * dataset_name_hidden() reports as hidden; FALSE otherwise.
+ */
+int
+dsl_dir_is_private(dsl_dir_t *dd)
+{
+	int inherited = FALSE;
+
+	/* ancestors first, then our own name */
+	if (dd->dd_parent != NULL)
+		inherited = dsl_dir_is_private(dd->dd_parent);
+
+	if (dataset_name_hidden(dd->dd_myname))
+		return (TRUE);
+
+	return (inherited ? TRUE : FALSE);
+}
+
+
+/*
+ * Split the first component off a dataset path.
+ *
+ * Copies the leading component of 'path' (everything up to the first
+ * '/' or '@') into 'component', which must have room for MAXNAMELEN
+ * bytes, and sets *nextp to the remainder:
+ *   - NULL if the component was the last one;
+ *   - the text after the '/' if a '/' followed the component;
+ *   - the '@' itself if an '@' followed the component.
+ * If path begins with '@', the whole "@snap" string is the component.
+ *
+ * Returns 0 on success, ENOENT if path is NULL, EINVAL if the path is
+ * malformed, and ENAMETOOLONG if the component would not fit.  On
+ * failure neither 'component' nor *nextp is meaningful.
+ */
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+	char *p;
+
+	/*
+	 * There is nothing to parse.  Report an error rather than
+	 * "success" with component and *nextp left uninitialized.
+	 */
+	if (path == NULL)
+		return (ENOENT);
+	/* This would be a good place to reserve some namespace... */
+	p = strpbrk(path, "/@");
+	if (p && (p[1] == '/' || p[1] == '@')) {
+		/* two separators in a row */
+		return (EINVAL);
+	}
+	if (p == NULL || p == path) {
+		/*
+		 * if the first thing is an @ or /, it had better be an
+		 * @ and it had better not have any more ats or slashes,
+		 * and it had better have something after the @.
+		 */
+		if (p != NULL &&
+		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+			return (EINVAL);
+		if (strlen(path) >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		/* the rest of the path is a single component */
+		(void) strcpy(component, path);
+		p = NULL;
+	} else if (p[0] == '/') {
+		if (p-path >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strncpy(component, path, p - path);
+		component[p-path] = '\0';
+		/* step over the '/' so *nextp starts at the next component */
+		p++;
+	} else if (p[0] == '@') {
+		/*
+		 * if the next separator is an @, there better not be
+		 * any more slashes.
+		 */
+		if (strchr(path, '/'))
+			return (EINVAL);
+		if (p-path >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strncpy(component, path, p - path);
+		component[p-path] = '\0';
+		/* leave p on the '@' so the caller can see it's a snapshot */
+	} else {
+		/* strpbrk() only ever stops on '/' or '@' */
+		ASSERT(!"invalid p");
+	}
+	*nextp = p;
+	return (0);
+}
+
+/*
+ * Same as dsl_dir_open(), except that when 'spa' is non-NULL the first
+ * component of 'name' (the pool name) is ignored and that spa is used
+ * instead of opening the pool by name.
+ *
+ * Walks 'name' one component at a time starting from the pool's root
+ * dir.  On success returns the deepest dsl_dir found, held with 'tag'
+ * (release it with dsl_dir_close(dd, tag)), and, if tailp is non-NULL,
+ * sets *tailp to whatever part of the name was not consumed (NULL if
+ * all of it was; it starts with '@' for a snapshot).
+ *
+ * Returns NULL if name is NULL or malformed, if the pool cannot be
+ * opened, if more than one component is left over, or if anything is
+ * left over and tailp is NULL.
+ */
+dsl_dir_t *
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
+{
+	char buf[MAXNAMELEN];
+	const char *next, *nextnext = NULL;
+	int err;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp;
+	uint64_t ddobj;
+	int openedspa = FALSE;
+
+	if (name == NULL)
+		return (NULL);
+
+	/* only format the name once we know it isn't NULL */
+	dprintf("%s\n", name);
+
+	/* the first component is the pool name */
+	err = getcomponent(name, buf, &next);
+	if (err)
+		return (NULL);
+	if (spa == NULL) {
+		err = spa_open(buf, &spa, FTAG);
+		if (err) {
+			dprintf("spa_open(%s) failed\n", buf);
+			return (NULL);
+		}
+		openedspa = TRUE;
+
+		/* XXX this assertion belongs in spa_open */
+		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
+	}
+
+	dp = spa_get_dsl(spa);
+
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
+	while (next != NULL) {
+		dsl_dir_t *child_ds;
+		err = getcomponent(next, buf, &nextnext);
+		if (err) {
+			/*
+			 * Malformed name.  Every exit from this loop must
+			 * drop the config lock taken above.
+			 */
+			rw_exit(&dp->dp_config_rwlock);
+			dsl_dir_close(dd, tag);
+			if (openedspa)
+				spa_close(spa, FTAG);
+			return (NULL);
+		}
+		ASSERT(next[0] != '\0');
+		/* a snapshot name is never a dir; leave it as the tail */
+		if (next[0] == '@')
+			break;
+		if (dd->dd_phys->dd_child_dir_zapobj == 0)
+			break;
+		dprintf("looking up %s in obj%lld\n",
+		    buf, dd->dd_phys->dd_child_dir_zapobj);
+
+		err = zap_lookup(dp->dp_meta_objset,
+		    dd->dd_phys->dd_child_dir_zapobj,
+		    buf, sizeof (ddobj), 1, &ddobj);
+		if (err == ENOENT) {
+			break;
+		}
+		ASSERT(err == 0);
+
+		/* descend: hold the child before letting go of the parent */
+		child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
+		dsl_dir_close(dd, tag);
+		dd = child_ds;
+		next = nextnext;
+	}
+	rw_exit(&dp->dp_config_rwlock);
+
+	/*
+	 * It's an error if there's more than one component left, or
+	 * tailp==NULL and there's any component left.
+	 */
+	if (next != NULL &&
+	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+		/* bad path name */
+		dsl_dir_close(dd, tag);
+		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+		next = NULL;
+		dd = NULL;
+	}
+	if (tailp)
+		*tailp = next;
+	if (openedspa)
+		spa_close(spa, FTAG);
+	return (dd);
+}
+
+/*
+ * Open the dsl_dir named by 'name', opening the pool by its name (the
+ * first component).  Returns the dsl_dir_t, held with 'tag', and
+ * possibly the last component which couldn't be found in *tailp.
+ * Returns NULL if the path is bogus, or if tailp==NULL and we couldn't
+ * parse the whole name.  (*tailp)[0] == '@' means that the last
+ * component is a snapshot.
+ */
+dsl_dir_t *
+dsl_dir_open(const char *name, void *tag, const char **tailp)
+{
+	return (dsl_dir_open_spa(NULL, name, tag, tailp));
+}
+
+/*
+ * Create a new, empty dsl_dir called 'name' as a child of pds.  Must be
+ * called in syncing context.
+ *
+ * Returns 0 on success, EEXIST if pds already has a child with that
+ * name, or the zap_lookup() error if the existence check fails.
+ */
+int
+dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = pds->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_phys_t *ddphys;
+	dmu_buf_t *db;
+	uint64_t ddobj;
+	int err;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/* give the parent a child-dir ZAP if it doesn't have one yet */
+	if (pds->dd_phys->dd_child_dir_zapobj == 0) {
+		dmu_buf_will_dirty(pds->dd_dbuf, tx);
+		pds->dd_phys->dd_child_dir_zapobj = zap_create(mos,
+		    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	}
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	/* the name must not already be taken */
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj);
+	if (err != ENOENT) {
+		rw_exit(&dp->dp_config_rwlock);
+		if (err == 0)
+			return (EEXIST);
+		return (err);
+	}
+
+	/* allocate the dir object and link it into the parent */
+	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+	err = zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+	    name, sizeof (uint64_t), 1, &ddobj, tx);
+	ASSERT3U(err, ==, 0);
+	dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
+	    name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
+
+	/* initialize the new dir's on-disk state in its bonus buffer */
+	db = dmu_bonus_hold(mos, ddobj);
+	dmu_buf_will_dirty(db, tx);
+	ddphys = db->db_data;
+
+	ddphys->dd_creation_time = gethrestime_sec();
+	ddphys->dd_parent_obj = pds->dd_object;
+	ddphys->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	dmu_buf_rele(db);
+
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (0);
+}
+
+/*
+ * Sync function that destroys the child dsl_dir called 'arg' (a name)
+ * of pds, together with its head dataset, if it has one.
+ *
+ * Returns the zap_lookup() error if pds has no such child, EBUSY if
+ * anyone else holds the child's bonus buffer, EEXIST if the child still
+ * has child dirs of its own, the error from dsl_dataset_destroy_sync()
+ * if the head dataset can't be destroyed, and 0 on success.
+ */
+int
+dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
+{
+	const char *name = arg;
+	dsl_dir_t *dd = NULL;
+	dsl_pool_t *dp = pds->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t val, obj, child_zapobj, props_zapobj;
+	int t, err;
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	err = zap_lookup(mos, pds->dd_phys->dd_child_dir_zapobj, name,
+	    8, 1, &obj);
+	if (err)
+		goto out;
+
+	dd = dsl_dir_open_obj(dp, obj, name, FTAG);
+	ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
+
+	/* our own hold accounts for one reference; any more means in use */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
+		err = EBUSY;
+		goto out;
+	}
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		/*
+		 * if they were dirty, they'd also be open.
+		 * dp_config_rwlock ensures that it stays that way.
+		 */
+		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+	}
+
+	/* remember these now; dd is closed before they are destroyed */
+	child_zapobj = dd->dd_phys->dd_child_dir_zapobj;
+	props_zapobj = dd->dd_phys->dd_props_zapobj;
+
+	if (child_zapobj != 0) {
+		uint64_t count;
+		/*
+		 * err is preset for the "goto out" below; if the dir has
+		 * no children it is overwritten further down.
+		 * NOTE(review): zap_count()'s return value is ignored, so
+		 * 'count' is only meaningful if it succeeded.
+		 */
+		err = EEXIST;
+		(void) zap_count(mos, child_zapobj, &count);
+		if (count != 0)
+			goto out;
+	}
+
+	if (dd->dd_phys->dd_head_dataset_obj != 0) {
+		err = dsl_dataset_destroy_sync(dd, NULL, tx);
+		if (err)
+			goto out;
+	}
+	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+
+	/* The point of no (unsuccessful) return */
+
+	/* Make sure parent's used gets updated */
+	val = 0;
+	err = dsl_dir_set_reservation_sync(dd, &val, tx);
+	ASSERT(err == 0);
+	ASSERT3U(dd->dd_used_bytes, ==, 0);
+	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
+	dsl_dir_close(dd, FTAG);
+	dd = NULL;
+
+	/* free the dir object, its ZAPs, and its entry in the parent */
+	err = dmu_object_free(mos, obj, tx);
+	ASSERT(err == 0);
+
+	if (child_zapobj)
+		err = zap_destroy(mos, child_zapobj, tx);
+	ASSERT(err == 0);
+
+	if (props_zapobj)
+		err = zap_destroy(mos, props_zapobj, tx);
+	ASSERT(err == 0);
+
+	err = zap_remove(mos, pds->dd_phys->dd_child_dir_zapobj, name, tx);
+	ASSERT(err == 0);
+
+out:
+	/* shared exit: dd is non-NULL only on the failure paths */
+	rw_exit(&dp->dp_config_rwlock);
+	if (dd)
+		dsl_dir_close(dd, FTAG);
+
+	return (err);
+}
+
+/*
+ * Create the pool's root dsl_dir in the meta-objset 'mos'.  The new
+ * object's number is returned through *ddobjp and is also recorded in
+ * the pool directory object under DMU_POOL_ROOT_DATASET.
+ */
+void
+dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+{
+	dsl_dir_phys_t *ddphys;
+	dmu_buf_t *db;
+	int err;
+
+	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dir_phys_t), tx);
+
+	/* make the root dir findable from the pool directory */
+	err = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
+	    sizeof (uint64_t), 1, ddobjp, tx);
+	ASSERT3U(err, ==, 0);
+
+	/* initialize the on-disk dir; it has no parent object */
+	db = dmu_bonus_hold(mos, *ddobjp);
+	dmu_buf_will_dirty(db, tx);
+	ddphys = db->db_data;
+
+	ddphys->dd_creation_time = gethrestime_sec();
+	ddphys->dd_props_zapobj = zap_create(mos,
+	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
+	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+
+	dmu_buf_rele(db);
+}
+
+/*
+ * Fill in *dds with the dir-level statistics and properties for dd.
+ * *dds is zeroed first, so any field not set here is left as 0.
+ */
+void
+dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
+{
+	bzero(dds, sizeof (dmu_objset_stats_t));
+
+	dds->dds_dir_obj = dd->dd_object;
+	/* on-disk numbers only (ondiskonly == TRUE), no pending changes */
+	dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
+
+	/* space accounting, quota and reservation are read under dd_lock */
+	mutex_enter(&dd->dd_lock);
+	dds->dds_space_used = dd->dd_used_bytes;
+	dds->dds_compressed_bytes = dd->dd_phys->dd_compressed_bytes;
+	dds->dds_uncompressed_bytes = dd->dd_phys->dd_uncompressed_bytes;
+	dds->dds_quota = dd->dd_phys->dd_quota;
+	dds->dds_reserved = dd->dd_phys->dd_reserved;
+	mutex_exit(&dd->dd_lock);
+
+	dds->dds_creation_time = dd->dd_phys->dd_creation_time;
+
+	/* a dir with no head dataset is reported as a placeholder */
+	dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
+
+	/* for a clone, report the name and object of what it was cloned from */
+	if (dd->dd_phys->dd_clone_parent_obj) {
+		dsl_dataset_t *ds;
+
+		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+		ds = dsl_dataset_open_obj(dd->dd_pool,
+		    dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
+		dsl_dataset_name(ds, dds->dds_clone_of);
+		dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
+		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		rw_exit(&dd->dd_pool->dp_config_rwlock);
+	}
+
+	/* property values, each with the name of where it was set */
+	VERIFY(dsl_prop_get_ds_integer(dd, "checksum",
+	    &dds->dds_checksum, dds->dds_checksum_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "compression",
+	    &dds->dds_compression, dds->dds_compression_setpoint) == 0);
+
+	VERIFY(dsl_prop_get_ds_integer(dd, "zoned",
+	    &dds->dds_zoned, dds->dds_zoned_setpoint) == 0);
+
+	spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
+	    sizeof (dds->dds_altroot));
+}
+
+/*
+ * Run func(dd, arg, tx) in syncing context and return its result.
+ *
+ * A transaction holding 'space' bytes is assigned against dd (falling
+ * back to the root dir if dd is out of space or over quota), func and
+ * arg are recorded in dd, and dd is dirtied so that dsl_dir_sync()
+ * calls func when that txg syncs.  This function blocks until the txg
+ * has synced.  Only one sync task can be pending per dir at a time; if
+ * one already is, we wait and try again.
+ *
+ * Returns the tx assignment error if no tx could be assigned,
+ * otherwise whatever func returned.
+ */
+int
+dsl_dir_sync_task(dsl_dir_t *dd,
+    int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space)
+{
+	dmu_tx_t *tx;
+	dsl_pool_t *dp = dd->dd_pool;
+	int err = 0;
+	uint64_t txg;
+
+	dprintf_dd(dd, "func=%p space=%llu\n", func, space);
+
+again:
+	tx = dmu_tx_create_ds(dd);
+	dmu_tx_hold_space(tx, space);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == ENOSPC || err == EDQUOT) {
+		dsl_dir_t *rds;
+		/*
+		 * They can get their space from either this dd, or the
+		 * root dd.
+		 */
+		for (rds = dd; rds->dd_parent; rds = rds->dd_parent)
+			continue;
+		dmu_tx_abort(tx);
+		tx = dmu_tx_create_ds(rds);
+		dmu_tx_hold_space(tx, space);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+	}
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	txg = dmu_tx_get_txg(tx);
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_sync_txg != 0) {
+		/*
+		 * Another sync task is already pending on this dir.
+		 * Give up this tx, wait for a sync, and start over.
+		 * NOTE(review): a txg argument of 0 presumably means
+		 * "wait for the current txg" -- confirm.
+		 */
+		mutex_exit(&dd->dd_lock);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dp, 0);
+		goto again;
+	}
+
+	/* We're good to go */
+
+	dd->dd_sync_txg = txg;
+	dd->dd_sync_func = func;
+	dd->dd_sync_arg = arg;
+
+	mutex_exit(&dd->dd_lock);
+
+	/* get dd onto the dirty list so dsl_dir_sync() runs for this txg */
+	dsl_dir_dirty(dd, tx);
+	dmu_tx_commit(tx);
+
+	txg_wait_synced(dp, txg);
+
+	/* dsl_dir_sync() clears dd_sync_func once it has run the task */
+	mutex_enter(&dd->dd_lock);
+	ASSERT(dd->dd_sync_txg == txg);
+	ASSERT(dd->dd_sync_func == NULL);
+	err = dd->dd_sync_err;
+	dd->dd_sync_txg = 0;
+	mutex_exit(&dd->dd_lock);
+
+	return (err);
+}
+
+/*
+ * Put dd on the pool's dirty-dir list for tx's txg.  When
+ * txg_list_add() returns 0, an extra hold is taken on dd's bonus
+ * buffer; dsl_dir_sync() releases it.
+ */
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	ASSERT(dd->dd_phys);
+
+	if (txg_list_add(&dd->dd_pool->dp_dirty_dirs, dd, tx->tx_txg) != 0)
+		return;
+
+	/* up the hold count until we can be written out */
+	dmu_buf_add_ref(dd->dd_dbuf, dd);
+}
+
+/*
+ * Return the change in the amount of space dd has accounted for -- the
+ * larger of its used space and its reservation -- when 'used' changes
+ * by 'delta'.
+ */
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+	uint64_t used_after = used + delta;
+	uint64_t before, after;
+
+	before = MAX(used, dd->dd_phys->dd_reserved);
+	after = MAX(used_after, dd->dd_phys->dd_reserved);
+
+	return (after - before);
+}
+
+/*
+ * Write out dd in syncing context for tx's txg.  If a sync task was
+ * queued for this txg by dsl_dir_sync_task(), run it first and record
+ * its result.  Then push the in-core space accounting to the on-disk
+ * structure, clear this txg's pending-write estimate, and drop the
+ * hold taken by dsl_dir_dirty().
+ */
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	if (dd->dd_sync_txg == tx->tx_txg && dd->dd_sync_func) {
+		dd->dd_sync_err = dd->dd_sync_func(dd, dd->dd_sync_arg, tx);
+		/* NULL tells dsl_dir_sync_task() that the task has run */
+		dd->dd_sync_func = NULL;
+	}
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
+	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
+	dd->dd_phys->dd_used_bytes = dd->dd_used_bytes;
+	mutex_exit(&dd->dd_lock);
+
+	/* release the hold from dsl_dir_dirty */
+	dmu_buf_remove_ref(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_estimated_space(dsl_dir_t *dd)
+{
+	int64_t est;
+	int t = 0;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+	/* current usage plus the space queued to be written in each txg */
+	est = dd->dd_used_bytes;
+	ASSERT(est >= 0);
+	for (; t < TXG_SIZE; t++) {
+		est += dd->dd_space_towrite[t & TXG_MASK];
+		ASSERT3U(dd->dd_space_towrite[t & TXG_MASK], >=, 0);
+	}
+	return (est);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it?  If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+static uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+	uint64_t parentspace, myspace, quota, used;
+
+	/*
+	 * If there are no restrictions otherwise, assume we have
+	 * unlimited space available.
+	 */
+	quota = UINT64_MAX;
+	parentspace = UINT64_MAX;
+
+	if (dd->dd_parent != NULL) {
+		parentspace = dsl_dir_space_available(dd->dd_parent,
+		    ancestor, delta, ondiskonly);
+	}
+
+	mutex_enter(&dd->dd_lock);
+	if (dd->dd_phys->dd_quota != 0)
+		quota = dd->dd_phys->dd_quota;
+	if (ondiskonly) {
+		used = dd->dd_used_bytes;
+	} else {
+		used = dsl_dir_estimated_space(dd);
+	}
+	if (dd == ancestor)
+		used += delta;	/* apply the hypothetical change here */
+
+	if (dd->dd_parent == NULL) {
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE);
+		quota = MIN(quota, poolsize);	/* root: pool size caps it */
+	}
+
+	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+		/*
+		 * We have some space reserved, in addition to what our
+		 * parent gave us.
+		 */
+		parentspace += dd->dd_phys->dd_reserved - used;
+	}
+
+	if (used > quota) {
+		/* over quota: nothing is available at this level */
+		myspace = 0;
+#ifdef ZFS_DEBUG
+		{
+			/*
+			 * While it's OK to be a little over quota, if
+			 * we think we are using more space than there
+			 * is in the pool (which is already 6% more than
+			 * dsl_pool_adjustedsize()), something is very
+			 * wrong.
+			 */
+			uint64_t space = spa_get_space(dd->dd_pool->dp_spa);
+			ASSERT3U(used, <=, space);
+		}
+#endif
+	} else {
+		/*
+		 * the lesser of parent's space and the space
+		 * left in our quota
+		 */
+		myspace = MIN(parentspace, quota - used);
+	}
+
+	mutex_exit(&dd->dd_lock);
+
+	return (myspace);
+}
+
+struct tempreserve {
+	list_node_t tr_node;	/* linkage on the cookie's list */
+	dsl_dir_t *tr_ds;	/* dir charged, or NULL for the ARC entry */
+	uint64_t tr_size;	/* amount reserved, undone on clear */
+};
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and thus
+ * dsl_dir_willuse_space() has been called), the reservation should
+ * be canceled, using dsl_dir_tempreserve_clear().
+ */
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd,
+    uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+{
+	uint64_t txg = tx->tx_txg;
+	uint64_t est_used, quota, parent_rsrv;
+	int edquot = EDQUOT;	/* errno to fail with when over quota */
+	int txgidx = txg & TXG_MASK;
+	int i;
+	struct tempreserve *tr;
+
+	ASSERT3U(txg, !=, 0);
+
+	mutex_enter(&dd->dd_lock);
+	/*
+	 * Check against the dsl_dir's quota.  We don't add in the delta
+	 * when checking for over-quota because they get one free hit.
+	 */
+	est_used = dsl_dir_estimated_space(dd);
+	for (i = 0; i < TXG_SIZE; i++)
+		est_used += dd->dd_tempreserved[i];
+
+	quota = UINT64_MAX;
+
+	if (dd->dd_phys->dd_quota)
+		quota = dd->dd_phys->dd_quota;
+
+	/*
+	 * If this transaction will result in a net free of space, we want
+	 * to let it through, but we have to be careful: the space that it
+	 * frees won't become available until *after* this txg syncs.
+	 * Therefore, to ensure that it's possible to remove files from
+	 * a full pool without inducing transient overcommits, we throttle
+	 * netfree transactions against a quota that is slightly larger,
+	 * but still within the pool's allocation slop.  In cases where
+	 * we're very close to full, this will allow a steady trickle of
+	 * removes to get through.
+	 */
+	if (dd->dd_parent == NULL) {
+		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
+		if (poolsize < quota) {
+			quota = poolsize;
+			edquot = ENOSPC;
+		}
+	} else if (netfree) {
+		quota = UINT64_MAX;
+	}
+
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota.  They get to try again unless the actual
+	 * on-disk is over quota.
+	 */
+	if (asize > 0 && est_used > quota) {
+		if (dd->dd_used_bytes < quota)
+			edquot = ERESTART;
+		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+		    "quota=%lluK tr=%lluK err=%d\n",
+		    dd->dd_used_bytes>>10, est_used>>10,
+		    quota>>10, asize>>10, edquot);
+		mutex_exit(&dd->dd_lock);
+		return (edquot);
+	}
+
+	/* We need to up our estimated delta before dropping dd_lock */
+	dd->dd_tempreserved[txgidx] += asize;
+
+	parent_rsrv = parent_delta(dd, est_used, asize);
+	mutex_exit(&dd->dd_lock);
+
+	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+	tr->tr_ds = dd;
+	tr->tr_size = asize;
+	list_insert_tail(tr_list, tr);	/* for dsl_dir_tempreserve_clear() */
+
+	/* see if it's OK with our parent, for the part it would be charged */
+	if (dd->dd_parent && parent_rsrv) {
+		return (dsl_dir_tempreserve_impl(dd->dd_parent,
+		    parent_rsrv, netfree, tr_list, tx));
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * Reserve asize bytes in dd (and ancestors, as needed) plus lsize in the
+ * ARC, for this tx's txg; the cookie is returned in *tr_cookiep.  After
+ * the space has been dirtied (and thus dsl_dir_willuse_space() has been
+ * called), cancel the reservation with dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
+    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+{
+	boolean_t netfree = (fsize >= asize);
+	list_t *reservations;
+	struct tempreserve *arc_tr;
+	int error;
+
+	reservations = kmem_alloc(sizeof (list_t), KM_SLEEP);
+	list_create(reservations, sizeof (struct tempreserve),
+	    offsetof(struct tempreserve, tr_node));
+
+	/* charge the dir hierarchy first, then the ARC */
+	error = dsl_dir_tempreserve_impl(dd, asize, netfree, reservations, tx);
+	if (error == 0)
+		error = arc_tempreserve_space(lsize);
+	if (error != 0) {
+		/* undo whatever was reserved before the failure */
+		dsl_dir_tempreserve_clear(reservations, tx);
+		return (error);
+	}
+
+	/* a NULL tr_ds marks the entry that releases the ARC reservation */
+	arc_tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
+	arc_tr->tr_ds = NULL;
+	arc_tr->tr_size = lsize;
+	list_insert_tail(reservations, arc_tr);
+
+	*tr_cookiep = reservations;
+	return (0);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+	list_t *reservations = tr_cookie;
+	int txgidx = tx->tx_txg & TXG_MASK;
+	struct tempreserve *entry;
+
+	ASSERT3U(tx->tx_txg, !=, 0);
+
+	while ((entry = list_head(reservations)) != NULL) {
+		if (entry->tr_ds != NULL) {
+			mutex_enter(&entry->tr_ds->dd_lock);
+			ASSERT3U(entry->tr_ds->dd_tempreserved[txgidx], >=,
+			    entry->tr_size);
+			entry->tr_ds->dd_tempreserved[txgidx] -= entry->tr_size;
+			mutex_exit(&entry->tr_ds->dd_lock);
+		} else {
+			arc_tempreserve_clear(entry->tr_size);
+		}
+		list_remove(reservations, entry);
+		kmem_free(entry, sizeof (struct tempreserve));
+	}
+
+	kmem_free(reservations, sizeof (list_t));
+}
+
+/*
+ * Call in open context when we think we're going to write/free space,
+ * eg. when dirtying data.  Be conservative (ie. OK to write less than
+ * this or free more than this, but don't write more or free less).
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+	int txgidx = tx->tx_txg & TXG_MASK;
+	int64_t to_parent;
+
+	mutex_enter(&dd->dd_lock);
+	/* only growth is tracked here; frees are not subtracted */
+	if (space > 0)
+		dd->dd_space_towrite[txgidx] += space;
+	to_parent = parent_delta(dd, dsl_dir_estimated_space(dd), space);
+	mutex_exit(&dd->dd_lock);
+
+	/* Make sure that we clean up dd_space_to* */
+	dsl_dir_dirty(dd, tx);
+
+	/* XXX this is potentially expensive and unnecessary... */
+	if (dd->dd_parent == NULL || to_parent == 0)
+		return;
+	dsl_dir_willuse_space(dd->dd_parent, to_parent, tx);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd,
+    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+	int64_t parent_used;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	dsl_dir_dirty(dd, tx);
+
+	mutex_enter(&dd->dd_lock);
+	/* must be computed against dd_used_bytes before it is updated */
+	parent_used = parent_delta(dd, dd->dd_used_bytes, used);
+	ASSERT(used >= 0 || dd->dd_used_bytes >= -used);
+	ASSERT(compressed >= 0 ||
+	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	ASSERT(uncompressed >= 0 ||
+	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
+	if (used > 0)
+		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
+	dd->dd_used_bytes += used;
+	dd->dd_phys->dd_compressed_bytes += compressed;
+	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent == NULL)
+		return;
+	dsl_dir_diduse_space(dd->dd_parent,
+	    parent_used, compressed, uncompressed, tx);
+}
+
+static int
+dsl_dir_set_quota_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t new_quota = *(uint64_t *)arg;
+	int err = ENOSPC;
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	mutex_enter(&dd->dd_lock);
+	/* 0 lifts the quota; else it must cover reservation and usage */
+	if (new_quota == 0 || (new_quota >= dd->dd_phys->dd_reserved &&
+	    new_quota >= dsl_dir_estimated_space(dd))) {
+		dd->dd_phys->dd_quota = new_quota;
+		err = 0;
+	}
+	mutex_exit(&dd->dd_lock);
+
+	return (err);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, uint64_t quota)
+{
+	dsl_dir_t *dir = dsl_dir_open(ddname, FTAG, NULL);
+	int error;
+
+	if (dir == NULL)
+		return (ENOENT);
+
+	/*
+	 * If someone removes a file, then tries to set the quota, we
+	 * want to make sure the file freeing takes effect.
+	 */
+	txg_wait_open(dir->dd_pool, 0);
+	error = dsl_dir_sync_task(dir, dsl_dir_set_quota_sync, &quota, 0);
+
+	dsl_dir_close(dir, FTAG);
+	return (error);
+}
+
+static int
+dsl_dir_set_reservation_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	uint64_t *reservationp = arg;
+	uint64_t new_reservation = *reservationp;
+	uint64_t used, avail;
+	int64_t delta;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	mutex_enter(&dd->dd_lock);
+	used = dd->dd_used_bytes;
+	delta = MAX(used, new_reservation) -
+	    MAX(used, dd->dd_phys->dd_reserved);
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_parent) {
+		avail = dsl_dir_space_available(dd->dd_parent,
+		    NULL, 0, FALSE);
+	} else {
+		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
+	}
+
+	if (delta > 0 && delta > avail)	/* only growth is checked */
+		return (ENOSPC);
+	if (delta > 0 && dd->dd_phys->dd_quota > 0 &&
+	    new_reservation > dd->dd_phys->dd_quota)
+		return (ENOSPC);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dd->dd_phys->dd_reserved = new_reservation;
+
+	if (dd->dd_parent != NULL) {
+		/* Roll up this change in accounted space into our ancestors */
+		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
+	}
+	return (0);
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+{
+	dsl_dir_t *dir = dsl_dir_open(ddname, FTAG, NULL);
+	int error;
+
+	if (dir == NULL)
+		return (ENOENT);
+	/* validation and the update both happen in the sync function */
+	error = dsl_dir_sync_task(dir,
+	    dsl_dir_set_reservation_sync, &reservation, 0);
+	dsl_dir_close(dir, FTAG);
+	return (error);
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+	dsl_dir_t *a, *b;
+	/* for each dir from ds1 upward, nearest first, scan ds2's ancestry */
+	for (a = ds1; a != NULL; a = a->dd_parent) {
+		for (b = ds2; b != NULL; b = b->dd_parent)
+			if (a == b)
+				return (a);
+	}
+	return (NULL);
+}
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor?  Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+	/* climb from dd to ancestor, filtering delta through each level */
+	for (; dd != ancestor; dd = dd->dd_parent) {
+		mutex_enter(&dd->dd_lock);
+		delta = parent_delta(dd, dd->dd_used_bytes, delta);
+		mutex_exit(&dd->dd_lock);
+	}
+	return (delta);
+}
+
+/*
+ * Rename dd to the full path in arg, reparenting it and moving its space
+ * accounting when the new parent differs.  Returns 0 or an errno.
+ */
+int
+dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	const char *newname = arg;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dir_t *newpds;
+	const char *tail;
+	int err, len;
+
+	/* can't rename to different pool: newname must start with "<root>/" */
+	len = strlen(dp->dp_root_dir->dd_myname);
+	if (strncmp(dp->dp_root_dir->dd_myname, newname, len) != 0 ||
+	    newname[len] != '/') {
+		return (ENXIO);
+	}
+
+	newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
+
+	/* new parent should exist */
+	if (newpds == NULL)
+		return (ENOENT);
+
+	/* new name should not already exist */
+	if (tail == NULL) {
+		dsl_dir_close(newpds, FTAG);
+		return (EEXIST);
+	}
+
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+
+	/* There should be 2 references: the open and the dirty */
+	if (dmu_buf_refcount(dd->dd_dbuf) > 2) {
+		rw_exit(&dp->dp_config_rwlock);
+		dsl_dir_close(newpds, FTAG);
+		return (EBUSY);
+	}
+
+	if (newpds != dd->dd_parent) {
+		dsl_dir_t *ancestor = closest_common_ancestor(dd, newpds);
+		int64_t adelta;
+		uint64_t myspace, avail;
+
+		/* no rename into our descendent */
+		if (ancestor == dd) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (EINVAL);
+		}
+
+		myspace = MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved);
+		adelta = would_change(dd->dd_parent, -myspace, ancestor);
+		avail = dsl_dir_space_available(newpds,
+		    ancestor, adelta, FALSE);
+		if (avail < myspace) {
+			dsl_dir_close(newpds, FTAG);
+			rw_exit(&dp->dp_config_rwlock);
+			return (ENOSPC);
+		}
+
+		/* The point of no (unsuccessful) return */
+		dsl_dir_diduse_space(dd->dd_parent, -myspace,
+		    -dd->dd_phys->dd_compressed_bytes,
+		    -dd->dd_phys->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(newpds, myspace,
+		    dd->dd_phys->dd_compressed_bytes,
+		    dd->dd_phys->dd_uncompressed_bytes, tx);
+	}
+
+	/* The point of no (unsuccessful) return */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+	/* remove from old parent zapobj */
+	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, tx);
+	ASSERT3U(err, ==, 0);
+
+	(void) strcpy(dd->dd_myname, tail);
+	dsl_dir_close(dd->dd_parent, dd);
+	dd->dd_phys->dd_parent_obj = newpds->dd_object;
+	dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
+	    newpds->dd_object, NULL, dd);
+
+	/* add to new parent zapobj */
+	err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
+	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
+	ASSERT3U(err, ==, 0);
+
+	dsl_dir_close(newpds, FTAG);
+	rw_exit(&dp->dp_config_rwlock);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
new file mode 100644
index 000000000000..5b71ccfaa925
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+
+/* internal reserved dir name */
+#define	MOS_DIR_NAME "$MOS"
+
+static dsl_dir_t *
+dsl_pool_open_mos_dir(dsl_pool_t *dp)
+{
+	uint64_t children = dp->dp_root_dir->dd_phys->dd_child_dir_zapobj;
+	uint64_t mos_dir_obj;
+	int error;
+
+	/* "$MOS" is looked up as a child of the root dir; it must exist */
+	error = zap_lookup(dp->dp_meta_objset, children,
+	    MOS_DIR_NAME, sizeof (mos_dir_obj), 1, &mos_dir_obj);
+	ASSERT3U(error, ==, 0);
+	return (dsl_dir_open_obj(dp, mos_dir_obj, MOS_DIR_NAME, dp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+	blkptr_t *rootbp = spa_get_rootblkptr(spa);
+	dsl_pool_t *dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+
+	dp->dp_spa = spa;
+	dp->dp_meta_rootbp = *rootbp;
+	txg_init(dp, txg);
+
+	/* per-txg dirty lists, plus the list dsl_pool_zil_clean() drains */
+	txg_list_create(&dp->dp_dirty_datasets,
+	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_dirs,
+	    offsetof(dsl_dir_t, dd_dirty_link));
+	list_create(&dp->dp_synced_objsets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
+
+	return (dp);
+}
+
+dsl_pool_t *
+dsl_pool_open(spa_t *spa, uint64_t txg)
+{
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	int error;
+
+	dp->dp_meta_objset =
+	    &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;
+
+	/* find and hold the root dir and the $MOS dir under the config lock */
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	error = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+	    &dp->dp_root_dir_obj);
+	ASSERT3U(error, ==, 0);
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+	rw_exit(&dp->dp_config_rwlock);
+
+	return (dp);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+	/* drop the holds (tag dp) from dsl_pool_open()/dsl_pool_create() */
+	dsl_dir_close(dp->dp_mos_dir, dp);
+	dsl_dir_close(dp->dp_root_dir, dp);
+
+	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+	dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+
+	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_dirty_dirs);
+	list_destroy(&dp->dp_synced_objsets);
+
+	arc_flush();
+	txg_fini(dp);
+	kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, uint64_t txg)
+{
+	int err;
+	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
+	    NULL, DMU_OST_META, tx)->os;
+
+	/* create the pool directory, claiming DMU_POOL_DIRECTORY_OBJECT */
+	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+	ASSERT3U(err, ==, 0);
+
+	/* create and open the root dir (hold tagged with dp) */
+	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	    NULL, dp);
+
+	/* create and open the meta-objset dir ("$MOS") under the root dir */
+	err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
+	    tx);
+	ASSERT3U(err, ==, 0);
+	dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+
+	dmu_tx_commit(tx);
+
+	return (dp);
+}
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+	objset_impl_t *mosi = dp->dp_meta_objset->os;
+	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+	do {
+		dsl_dataset_t *ds;
+		dsl_dir_t *dd;
+
+		while ((ds = txg_list_remove(&dp->dp_dirty_datasets,
+		    txg)) != NULL) {
+			if (!list_link_active(&ds->ds_synced_link))
+				list_insert_tail(&dp->dp_synced_objsets, ds);
+			dsl_dataset_sync(ds, tx);
+		}
+		while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL)
+			dsl_dir_sync(dd, tx);
+		/*
+		 * We need to loop since dsl_dir_sync() could create a
+		 * new (dirty) objset.
+		 * XXX - isn't this taken care of by the spa's sync to
+		 * convergence loop?
+		 */
+	} while (!txg_list_empty(&dp->dp_dirty_datasets, txg));
+
+	/* the MOS is written only when it has dirty or freed dnodes */
+	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+		dmu_objset_sync(mosi, tx);
+		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+	}
+
+	dmu_tx_commit(tx);
+}
+
+void
+dsl_pool_zil_clean(dsl_pool_t *dp)
+{
+	dsl_dataset_t *synced;
+
+	while ((synced = list_head(&dp->dp_synced_objsets)) != NULL) {
+		list_remove(&dp->dp_synced_objsets, synced);
+		ASSERT(synced->ds_user_ptr != NULL);
+		zil_clean(((objset_impl_t *)synced->ds_user_ptr)->os_zil);
+	}
+}
+
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+	/*
+	 * Yeah, this is cheesy.  But the SPA needs some way to let the
+	 * sync threads invoke spa_open() and spa_close() while it holds
+	 * the namespace lock; better ways to tell that a thread works for
+	 * spa_sync() are welcome.  NOTE(review): the BP_IS_HOLE arm looks
+	 * like "pool has no root block yet" -- confirm with the author.
+	 */
+	return (curthread == dp->dp_tx.tx_sync_thread ||
+	    BP_IS_HOLE(&dp->dp_meta_rootbp));
+}
+
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
+{
+	/*
+	 * Reserve about 1% (1/128), or at least 16MB, for allocation
+	 * efficiency.
+	 * XXX The intent log is not accounted for, so it must fit
+	 * within this slop.
+	 *
+	 * If we're trying to assess whether it's OK to do a free,
+	 * cut the reservation in half to allow forward progress
+	 * (e.g. make it possible to rm(1) files from a full pool).
+	 */
+	uint64_t total = spa_get_space(dp->dp_spa);
+	uint64_t slop = MAX(total >> 7, SPA_MINDEVSIZE >> 2);
+
+	/* net-free callers are held back by only half of the slop */
+	if (netfree)
+		slop /= 2;
+
+	return (total - slop);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
new file mode 100644
index 000000000000..bd54263507da
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h> /* for the default checksum value */
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+static int
+dodefault(const char *propname, int intsz, int numint, void *buf)
+{
+	zfs_prop_t prop = zfs_name_to_prop(propname);
+
+	/* unknown and read-only properties have no default to hand out */
+	if (prop == ZFS_PROP_INVAL || zfs_prop_readonly(prop))
+		return (ENOENT);
+
+	if (zfs_prop_get_type(prop) != prop_type_string) {
+		/* numeric: the caller must have room for one uint64_t */
+		if (intsz != 8 || numint < 1)
+			return (EOVERFLOW);
+		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
+		return (0);
+	}
+	/* string: bytes only; numint is handed over as the length */
+	if (intsz != 1)
+		return (EOVERFLOW);
+	zfs_prop_default_string(prop, buf, numint);
+	return (0);
+}
+
+static int
+dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
+    int intsz, int numint, void *buf, char *setpoint)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	int err = 0;
+
+	if (setpoint != NULL)
+		*setpoint = '\0';
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
+
+	/* walk toward the root until a lookup finds the prop (or fails) */
+	while (ddobj != 0) {
+		dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+
+		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+		    propname, intsz, numint, buf);
+		if (err == ENOENT)
+			ddobj = dd->dd_phys->dd_parent_obj;
+		else if (setpoint != NULL)
+			dsl_dir_name(dd, setpoint);
+		dsl_dir_close(dd, FTAG);
+		if (err != ENOENT)
+			break;
+	}
+	/* no dir on the path has it set: fall back to the default */
+	if (err == ENOENT)
+		err = dodefault(propname, intsz, numint, buf);
+	return (err);
+}
+
+/*
+ * Register interest in the named property.  We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+    dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+	dsl_dir_t *dd;
+	uint64_t value;
+	dsl_prop_cb_record_t *cbr;
+	int err;
+
+	dd = ds->ds_dir;
+
+	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+
+	err = dsl_prop_get_impl(dd->dd_pool, dd->dd_object, propname,
+	    8, 1, &value, NULL);	/* one 8-byte integer */
+	if (err == ENOENT) {
+		err = 0;
+		value = DSL_PROP_VALUE_UNDEFINED;
+	}
+	if (err != 0) {
+		rw_exit(&dd->dd_pool->dp_config_rwlock);
+		return (err);
+	}
+
+	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
+	(void) strcpy((char *)cbr->cbr_propname, propname);
+	cbr->cbr_func = callback;
+	cbr->cbr_arg = cbarg;
+	mutex_enter(&dd->dd_lock);
+	list_insert_head(&dd->dd_prop_cbs, cbr);
+	mutex_exit(&dd->dd_lock);
+
+	cbr->cbr_func(cbr->cbr_arg, value);	/* initial notification */
+
+	(void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
+	rw_exit(&dd->dd_pool->dp_config_rwlock);
+	/* The dir hold above (tag cbr) lasts until dsl_prop_unregister() */
+	return (0);
+}
+
+int
+dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+	int error;
+	/* dsl_prop_get_impl() asserts that the config lock is held */
+	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	error = dsl_prop_get_impl(dp, dd->dd_object,
+	    propname, intsz, numints, buf, setpoint);
+	rw_exit(&dp->dp_config_rwlock);
+	return (error);
+}
+
+int
+dsl_prop_get(const char *ddname, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint)
+{
+	const char *tail;
+	dsl_dir_t *dd;
+	int err = ENOENT;
+
+	dd = dsl_dir_open(ddname, FTAG, &tail);
+	if (dd == NULL)
+		return (err);
+
+	/* a leftover component is tolerated only if it starts with '@' */
+	if (tail == NULL || tail[0] == '@') {
+		err = dsl_prop_get_ds(dd, propname,
+		    intsz, numints, buf, setpoint);
+	}
+
+	dsl_dir_close(dd, FTAG);
+	return (err);
+}
+
+/*
+ * Get a string property as up to valuelen 1-byte integers.  Return 0 on
+ * success, ENOENT if ddname is invalid, EOVERFLOW if valuelen is too small.
+ */
+int
+dsl_prop_get_string(const char *ddname, const char *propname,
+    char *value, int valuelen, char *setpoint)
+{
+	return (dsl_prop_get(ddname, propname, 1, valuelen, value, setpoint));
+}
+
+/*
+ * Get the current value of an integer (one uint64_t) property.  It may
+ * have changed by the time this function returns, so it is NOT safe to
+ * follow up with dsl_prop_register() and assume that the value has not
+ * changed in between.
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+    uint64_t *valuep, char *setpoint)
+{
+	return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
+
+/* As dsl_prop_get_integer(), but takes the dsl_dir itself, not its name. */
+int
+dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+    uint64_t *valuep, char *setpoint)
+{
+	return (dsl_prop_get_ds(dd, propname, 8, 1, valuep, setpoint));
+}
+
+/*
+ * Unregister a callback previously added by dsl_prop_register().  Return
+ * 0 on success, ENOMSG if no matching callback is registered on ds's dir.
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+    dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+	dsl_dir_t *dd;
+	dsl_prop_cb_record_t *cbr;
+
+	dd = ds->ds_dir;
+
+	mutex_enter(&dd->dd_lock);
+	for (cbr = list_head(&dd->dd_prop_cbs);
+	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+		if (strcmp(cbr->cbr_propname, propname) == 0 &&
+		    cbr->cbr_func == callback &&
+		    cbr->cbr_arg == cbarg)
+			break;
+	}
+
+	if (cbr == NULL) {
+		mutex_exit(&dd->dd_lock);
+		return (ENOMSG);
+	}
+
+	list_remove(&dd->dd_prop_cbs, cbr);
+	mutex_exit(&dd->dd_lock);
+
+	/* Drop dsl_prop_register's hold while the tag (cbr) is still live */
+	dsl_dir_close(dd, cbr);
+	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
+	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+	return (0);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+    const char *propname, uint64_t value, int first)
+{
+	dsl_dir_t *dd;
+	dsl_prop_cb_record_t *cbr;
+	objset_t *mos = dp->dp_meta_objset;
+	int err;
+
+	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+	dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+
+	if (!first) {
+		/*
+		 * If the prop is set here, then this change is not
+		 * being inherited here or below; stop the recursion.
+		 */
+		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
+		    8, 1, &value);
+		if (err == 0) {
+			dsl_dir_close(dd, FTAG);
+			return;
+		}
+		ASSERT3U(err, ==, ENOENT);
+	}
+
+	mutex_enter(&dd->dd_lock);	/* callbacks run with dd_lock held */
+	for (cbr = list_head(&dd->dd_prop_cbs);
+	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
+		if (strcmp(cbr->cbr_propname, propname) == 0) {
+			cbr->cbr_func(cbr->cbr_arg, value);
+		}
+	}
+	mutex_exit(&dd->dd_lock);
+
+	if (dd->dd_phys->dd_child_dir_zapobj) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		for (zap_cursor_init(&zc, mos,
+		    dd->dd_phys->dd_child_dir_zapobj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			/* XXX recursion could blow stack; esp. za! */
+			dsl_prop_changed_notify(dp, za.za_first_integer,
+			    propname, value, FALSE);
+		}
+	}
+	dsl_dir_close(dd, FTAG);
+}
+
+struct prop_set_arg {
+	const char *name;	/* property name */
+	int intsz;		/* size of each integer, as for zap_update() */
+	int numints;		/* integer count; 0 removes the prop */
+	const void *buf;	/* new value */
+};
+
+static int
+dsl_prop_set_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
+{
+	struct prop_set_arg *psa = arg;
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
+	uint64_t intval;
+	int err, isint;
+
+	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
+
+	isint = (dodefault(psa->name, 8, 1, &intval) == 0);
+
+	if (psa->numints == 0) {	/* no value: remove the setting */
+		err = zap_remove(mos, zapobj, psa->name, tx);
+		if (err == ENOENT) /* that's fine. */
+			err = 0;
+		if (err == 0 && isint) {
+			err = dsl_prop_get_impl(dd->dd_pool,
+			    dd->dd_phys->dd_parent_obj, psa->name,
+			    8, 1, &intval, NULL);
+		}
+	} else {
+		err = zap_update(mos, zapobj, psa->name,
+		    psa->intsz, psa->numints, psa->buf, tx);
+		if (isint)
+			intval = *(uint64_t *)psa->buf;
+	}
+
+	if (err == 0 && isint) {	/* notify only for integer props */
+		dsl_prop_changed_notify(dd->dd_pool,
+		    dd->dd_object, psa->name, intval, TRUE);
+	}
+	rw_exit(&dd->dd_pool->dp_config_rwlock);
+
+	return (err);
+}
+
+int
+dsl_prop_set(const char *ddname, const char *propname,
+    int intsz, int numints, const void *buf)
+{
+	struct prop_set_arg psa;
+	dsl_dir_t *dir;
+	int error;
+
+	dir = dsl_dir_open(ddname, FTAG, NULL);
+	if (dir == NULL)
+		return (ENOENT);
+
+	/* bundle the arguments for dsl_prop_set_sync() */
+	psa.name = propname;
+	psa.intsz = intsz;
+	psa.numints = numints;
+	psa.buf = buf;
+
+	error = dsl_dir_sync_task(dir, dsl_prop_set_sync, &psa, 0);
+	dsl_dir_close(dir, FTAG);
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
new file mode 100644
index 000000000000..03186d13873d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint64_t *word = buf;
+	const uint64_t *const limit = word + (size / sizeof (uint64_t));
+	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
+
+	/* Two interleaved lanes: even words feed a0/b0, odd words a1/b1. */
+	while (word < limit) {
+		b0 += (a0 += word[0]);
+		b1 += (a1 += word[1]);
+		word += 2;
+	}
+	/* Hand the sums to ZIO_SET_CHECKSUM() as (a0, a1, b0, b1). */
+	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint64_t *word = buf;
+	const uint64_t *const limit = word + (size / sizeof (uint64_t));
+	uint64_t a0 = 0, a1 = 0, b0 = 0, b1 = 0;
+
+	/* Two interleaved lanes; each word goes through BSWAP_64() first. */
+	while (word < limit) {
+		b0 += (a0 += BSWAP_64(word[0]));
+		b1 += (a1 += BSWAP_64(word[1]));
+		word += 2;
+	}
+	/* Hand the sums to ZIO_SET_CHECKSUM() as (a0, a1, b0, b1). */
+	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+}
+
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *const limit = word + (size / sizeof (uint32_t));
+	uint64_t a = 0, b = 0, c = 0, d = 0;
+
+	/* Each 32-bit word updates four cascaded 64-bit running sums. */
+	for (; word < limit; word++) {
+		a += word[0];
+		b += a;
+		c += b;
+		d += c;
+	}
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	const uint32_t *word = buf;
+	const uint32_t *const limit = word + (size / sizeof (uint32_t));
+	uint64_t a = 0, b = 0, c = 0, d = 0;
+
+	/* Each word goes through BSWAP_32() before the four cascaded sums. */
+	for (; word < limit; word++) {
+		a += BSWAP_32(word[0]);
+		b += a;
+		c += b;
+		d += c;
+	}
+	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
diff --git a/usr/src/uts/common/fs/zfs/lzjb.c b/usr/src/uts/common/fs/zfs/lzjb.c
new file mode 100644
index 000000000000..5979a55ef704
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/lzjb.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * This is stolen from common/os/compress.c and will be removed once
+ * our changes have made it into the on10 source base.
+ *
+ * In particular, we are adding the "feature" that lzjb_compress() can
+ * take a destination buffer size and return s_len (rather than a smaller
+ * compressed length) if the output would not fit within d_len.
+ */
+
+#include <sys/types.h>
+
+#define	MATCH_BITS	6	/* bits of match length in a copy item */
+#define	MATCH_MIN	3	/* shortest match that gets encoded */
+#define	MATCH_MAX	((1 << MATCH_BITS) + (MATCH_MIN - 1))	/* 66 */
+#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)	/* low 10 bits */
+#define	LEMPEL_SIZE	256	/* entries in the lempel[] hash table */
+
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *cpy, *copymap;
+	int copymask = 1 << (NBBY - 1);
+	int mlen, offset;
+	uint16_t *hp;
+	uint16_t lempel[LEMPEL_SIZE];	/* uninitialized; hits are verified */
+	/* Returns the compressed length, or s_len once dst is nearly full. */
+	while (src < (uchar_t *)s_start + s_len) {
+		if ((copymask <<= 1) == (1 << NBBY)) {	/* start new copymap */
+			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+				if (d_len != s_len)
+					return (s_len);
+				mlen = s_len;	/* same size: store input raw */
+				for (src = s_start, dst = d_start; mlen; mlen--)
+					*dst++ = *src++;
+				return (s_len);
+			}
+			copymask = 1;
+			copymap = dst;
+			*dst++ = 0;
+		}
+		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+			*dst++ = *src++;	/* within MATCH_MAX of the end */
+			continue;
+		}
+		hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
+		    (LEMPEL_SIZE - 1)];
+		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+		*hp = (uint16_t)(uintptr_t)src;
+		cpy = src - offset;
+		if (cpy >= (uchar_t *)s_start && cpy != src &&
+		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+			*copymap |= copymask;	/* flag this item as a match */
+			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+				if (src[mlen] != cpy[mlen])
+					break;
+			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+			    (offset >> NBBY);
+			*dst++ = (uchar_t)offset;
+			src += mlen;
+		} else {
+			*dst++ = *src++;	/* literal byte */
+		}
+	}
+	return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *d_end = (uchar_t *)d_start + d_len;
+	uchar_t *cpy, copymap;
+	int copymask = 1 << (NBBY - 1);
+	/* Fill exactly d_len output bytes; s_len is never consulted. */
+	while (dst < d_end) {
+		if ((copymask <<= 1) == (1 << NBBY)) {	/* next copymap byte */
+			copymask = 1;
+			copymap = *src++;
+		}
+		if (copymap & copymask) {	/* match: 2-byte length/offset */
+			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+			src += 2;
+			if ((cpy = dst - offset) < (uchar_t *)d_start)
+				return (-1);	/* offset precedes the output */
+			while (--mlen >= 0 && dst < d_end)
+				*dst++ = *cpy++;
+		} else {
+			*dst++ = *src++;	/* literal byte */
+		}
+	}
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
new file mode 100644
index 000000000000..9d682e499042
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -0,0 +1,796 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(void)
+{
+	/* Zero-filled allocation, requested with KM_SLEEP. */
+	metaslab_class_t *newmc = kmem_zalloc(sizeof (*newmc), KM_SLEEP);
+
+	/* A new class has no metaslab groups, so the rotor starts empty. */
+	newmc->mc_rotor = NULL;
+
+	return (newmc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+	metaslab_group_t *victim;
+
+	/* Unlink and destroy groups until the rotor reports none left. */
+	for (victim = mc->mc_rotor; victim != NULL; victim = mc->mc_rotor) {
+		metaslab_class_remove(mc, victim);
+		metaslab_group_destroy(victim);
+	}
+	kmem_free(mc, sizeof (*mc));
+}
+
+void
+metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+	metaslab_group_t *before = mc->mc_rotor;
+	metaslab_group_t *after;
+
+	ASSERT(mg->mg_class == NULL);
+	if (before != NULL) {
+		/* Splice mg into the ring right after the current rotor. */
+		after = before->mg_next;
+		mg->mg_next = after;
+		mg->mg_prev = before;
+		after->mg_prev = mg;
+		before->mg_next = mg;
+	} else {
+		mg->mg_next = mg->mg_prev = mg;	/* ring of one */
+	}
+	mg->mg_class = mc;
+	mc->mc_rotor = mg;
+}
+
+void
+metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
+{
+	metaslab_group_t *before, *after;
+
+	ASSERT(mg->mg_class == mc);
+
+	before = mg->mg_prev;
+	after = mg->mg_next;
+
+	if (after != mg) {
+		/* Bridge the neighbors; the rotor moves to mg's successor. */
+		after->mg_prev = before;
+		before->mg_next = after;
+		mc->mc_rotor = after;
+	} else {
+		mc->mc_rotor = NULL;	/* mg was the only group */
+	}
+	/* Leave mg detached from any ring or class. */
+	mg->mg_class = NULL;
+	mg->mg_next = mg->mg_prev = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+	const metaslab_t *lhs = x1;
+	const metaslab_t *rhs = x2;
+
+	/* Heavier metaslabs sort first: the weight order is descending. */
+	if (lhs->ms_weight != rhs->ms_weight)
+		return (lhs->ms_weight < rhs->ms_weight ? 1 : -1);
+
+	/*
+	 * Equal weights: order by ascending start offset so that two
+	 * distinct metaslabs never compare equal.
+	 */
+	if (lhs->ms_map.sm_start != rhs->ms_map.sm_start) {
+		return (lhs->ms_map.sm_start < rhs->ms_map.sm_start ?
+		    -1 : 1);
+	}
+
+	ASSERT3P(lhs, ==, rhs);
+
+	return (0);
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+{
+	metaslab_group_t *grp = kmem_zalloc(sizeof (*grp), KM_SLEEP);
+	/* Build a group for vd and link it into class mc. */
+	grp->mg_vd = vd;
+	grp->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
+	mutex_init(&grp->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+	/* The group's metaslabs live in a tree ordered by metaslab_compare. */
+	avl_create(&grp->mg_metaslab_tree, metaslab_compare,
+	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
+	metaslab_class_add(mc, grp);
+
+	return (grp);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+	avl_destroy(&mg->mg_metaslab_tree);	/* drop the metaslab tree */
+	mutex_destroy(&mg->mg_lock);
+	kmem_free(mg, sizeof (metaslab_group_t));	/* mg is invalid now */
+}
+
+void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+	mutex_enter(&mg->mg_lock);	/* held across the tree insert */
+	ASSERT(msp->ms_group == NULL);
+	msp->ms_group = mg;
+	msp->ms_weight = weight;	/* sort key; set before the insert */
+	avl_add(&mg->mg_metaslab_tree, msp);
+	mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+	mutex_enter(&mg->mg_lock);	/* held across the tree removal */
+	ASSERT(msp->ms_group == mg);
+	avl_remove(&mg->mg_metaslab_tree, msp);
+	msp->ms_group = NULL;		/* msp no longer belongs to a group */
+	mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+	mutex_enter(&mg->mg_lock);
+	ASSERT(msp->ms_group == mg);
+	avl_remove(&mg->mg_metaslab_tree, msp);	/* old position */
+	msp->ms_weight = weight;	/* changed only while out of the tree */
+	avl_add(&mg->mg_metaslab_tree, msp);	/* new position */
+	mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+void
+metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
+	uint64_t start, uint64_t size, uint64_t txg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_t *msp;
+	int fm;
+	/* Build a metaslab spanning [start, start + size) and add it to mg. */
+	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+
+	msp->ms_smo = smo;
+
+	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
+	    &msp->ms_lock);
+	/* One allocmap/freemap pair for each of the TXG_SIZE slots. */
+	for (fm = 0; fm < TXG_SIZE; fm++) {
+		space_map_create(&msp->ms_allocmap[fm], start, size,
+		    vd->vdev_ashift, &msp->ms_lock);
+		space_map_create(&msp->ms_freemap[fm], start, size,
+		    vd->vdev_ashift, &msp->ms_lock);
+	}
+
+	/*
+	 * If we're opening an existing pool (txg == 0) or creating
+	 * a new one (txg == TXG_INITIAL), all space is available now.
+	 * If we're adding space to an existing pool, the new space
+	 * does not become available until after this txg has synced.
+	 * We enforce this by assigning an initial weight of 0 to new space.
+	 *
+	 * (Transactional allocations for this txg would actually be OK;
+	 * it's intent log allocations that cause trouble.  If we wrote
+	 * a log block in this txg and lost power, the log replay would be
+	 * based on the DVA translations that had been synced in txg - 1.
+	 * Those translations would not include this metaslab's vdev.)
+	 */
+	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);
+
+	if (txg == 0) {
+		/*
+		 * We're opening the pool.  Make the metaslab's
+		 * free space available immediately.
+		 */
+		vdev_space_update(vd, size, smo->smo_alloc);
+		metaslab_sync_done(msp, 0);
+	} else {
+		/*
+		 * We're creating a pool or adding a metaslab to an open pool.
+		 * Declare all of the metaslab's space to be free.
+		 *
+		 * Note that older transaction groups cannot allocate
+		 * from this metaslab until its existence is committed,
+		 * because we set ms_last_alloc to the current txg.
+		 */
+		smo->smo_alloc = 0;
+		msp->ms_usable_space = size;
+		mutex_enter(&msp->ms_lock);
+		space_map_add(&msp->ms_map, start, size);
+		msp->ms_map_incore = 1;
+		mutex_exit(&msp->ms_lock);
+
+		/* XXX -- we'll need a call to picker_init here */
+		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
+		msp->ms_last_alloc = txg;
+		vdev_dirty(vd, VDD_ADD, txg);
+		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+	}
+
+	*mspp = msp;	/* return the new metaslab via mspp */
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+	int fm;
+	metaslab_group_t *mg = msp->ms_group;
+	/* Report negated size and allocation deltas to the vdev. */
+	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
+	    -msp->ms_smo->smo_alloc);
+
+	metaslab_group_remove(mg, msp);
+
+	/* XXX -- we'll need a call to picker_fini here */
+
+	mutex_enter(&msp->ms_lock);
+	/* Empty the main map before destroying it. */
+	space_map_vacate(&msp->ms_map, NULL, NULL);
+	msp->ms_map_incore = 0;
+	space_map_destroy(&msp->ms_map);
+
+	for (fm = 0; fm < TXG_SIZE; fm++) {
+		space_map_destroy(&msp->ms_allocmap[fm]);
+		space_map_destroy(&msp->ms_freemap[fm]);
+	}
+
+	mutex_exit(&msp->ms_lock);
+
+	kmem_free(msp, sizeof (metaslab_t));
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	objset_t *os = spa->spa_meta_objset;
+	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
+	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
+	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+	space_map_obj_t *smo = msp->ms_smo;
+	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
+	uint64_t alloc_delta;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+
+	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);
+
+	mutex_enter(&msp->ms_lock);
+	/* A metaslab flagged MSD_ADD reports its size to the vdev here. */
+	if (*dirty & MSD_ADD)
+		vdev_space_update(vd, msp->ms_map.sm_size, 0);
+
+	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
+		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+		/* No object yet: allocate one and record it in ms_array. */
+		if (smo->smo_object == 0) {
+			ASSERT(smo->smo_objsize == 0);
+			ASSERT(smo->smo_alloc == 0);
+			smo->smo_object = dmu_object_alloc(os,
+			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+			ASSERT(smo->smo_object != 0);
+			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
+			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
+			    sizeof (uint64_t), &smo->smo_object, tx);
+		}
+		/* Net change in allocated space for this txg. */
+		alloc_delta = allocmap->sm_space - freemap->sm_space;
+		vdev_space_update(vd, 0, alloc_delta);
+		smo->smo_alloc += alloc_delta;
+		/* No free space left in core: rewrite the whole map once. */
+		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
+		    (*dirty & MSD_CONDENSE) == 0) {
+			space_map_t *sm = &msp->ms_map;
+			space_map_t *tsm;
+			int i;
+
+			ASSERT(msp->ms_map_incore);
+
+			space_map_merge(freemap, freed_map);
+			space_map_vacate(allocmap, NULL, NULL);
+
+			/*
+			 * Write out the current state of the allocation
+			 * world.  The current metaslab is full, minus
+			 * stuff that's been freed this txg (freed_map),
+			 * minus allocations from txgs in the future.
+			 */
+			space_map_add(sm, sm->sm_start, sm->sm_size);
+			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
+				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
+				space_map_iterate(tsm, space_map_remove, sm);
+			}
+			space_map_iterate(freed_map, space_map_remove, sm);
+
+			space_map_write(sm, smo, os, tx);
+
+			ASSERT(sm->sm_space == 0);
+			ASSERT(freemap->sm_space == 0);
+			ASSERT(allocmap->sm_space == 0);
+
+			*dirty |= MSD_CONDENSE;
+		} else {
+			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
+			space_map_sync(freemap, freed_map, smo, SM_FREE,
+			    os, tx);
+		}
+		/* Copy the updated header into the object's bonus buffer. */
+		db = dmu_bonus_hold(os, smo->smo_object);
+		dmu_buf_will_dirty(db, tx);
+		ASSERT3U(db->db_size, ==, sizeof (*smo));
+		bcopy(smo, db->db_data, db->db_size);
+		dmu_buf_rele(db);
+
+		dmu_tx_commit(tx);
+	}
+
+	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);
+
+	mutex_exit(&msp->ms_lock);
+	/* Queue the metaslab on the vdev's list under TXG_CLEAN(txg). */
+	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+	uint64_t weight;
+	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
+	space_map_obj_t *smo = msp->ms_smo;
+
+	dprintf("%s offset %llx txg %llu\n",
+	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);
+
+	mutex_enter(&msp->ms_lock);
+
+	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);
+	/* Usable space is the map size minus what the header says is used. */
+	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
+	msp->ms_usable_end = smo->smo_objsize;
+
+	weight = msp->ms_usable_space;
+
+	if (txg != 0) {		/* metaslab_init() passes txg 0 */
+		space_map_t *freed_map =
+		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
+
+		/* XXX -- we'll need a call to picker_fini here */
+
+		/* If we're empty, don't bother sticking around */
+		if (msp->ms_usable_space == 0) {
+			space_map_vacate(&msp->ms_map, NULL, NULL);
+			msp->ms_map_incore = 0;
+			ASSERT3U(freed_map->sm_space, ==, 0);
+			weight = 0;
+		} else {
+			/* Add the freed blocks to the available space map */
+			if (msp->ms_map_incore)
+				space_map_merge(freed_map, &msp->ms_map);
+			else
+				space_map_vacate(freed_map, NULL, NULL);
+			weight += msp->ms_map.sm_size;
+		}
+
+		if (msp->ms_last_alloc == txg)
+			/* Safe to use for allocation now */
+			msp->ms_last_alloc = 0;
+
+		*dirty = 0;
+	}
+
+	mutex_exit(&msp->ms_lock);
+	/* Re-sort after dropping ms_lock; the sort takes mg_lock itself. */
+	metaslab_group_sort(msp->ms_group, msp, weight);
+}
+
+/*
+ * The first-fit block picker.  No picker_init or picker_fini,
+ * this is just an experiment to see how it feels to separate out
+ * the block selection policy from the map updates.
+ * Note: the 'cursor' argument is a form of PPD.
+ */
+static uint64_t
+metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
+{
+	avl_tree_t *tree = &sm->sm_root;
+	uint64_t align = size & -size;
+	space_seg_t *seg, probe;
+	avl_index_t where;
+	int pass;
+
+	/*
+	 * Pass 0 scans forward from *cursor; if that finds nothing,
+	 * pass 1 resets the cursor and rescans from offset 0.
+	 */
+	for (pass = 0; pass < 2; pass++) {
+		if (pass != 0)
+			*cursor = 0;
+		probe.ss_start = *cursor;
+		probe.ss_end = *cursor + size;
+
+		seg = avl_find(tree, &probe, &where);
+		if (seg == NULL)
+			seg = avl_nearest(tree, where, AVL_AFTER);
+
+		/* First fit: take the first segment with an aligned hole. */
+		for (; seg != NULL; seg = AVL_NEXT(tree, seg)) {
+			uint64_t offset = P2ROUNDUP(seg->ss_start, align);
+
+			if (offset + size <= seg->ss_end) {
+				*cursor = offset + size;
+				return (offset);
+			}
+		}
+	}
+
+	return (-1ULL);
+}
+
+static uint64_t
+metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+	space_map_t *sm = &msp->ms_map;
+	vdev_t *vd = msp->ms_group->mg_vd;
+	uint64_t offset;
+	/* Carve size bytes out of msp's map for txg; -1ULL if none fit. */
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(msp->ms_map_incore);
+	ASSERT(sm->sm_space != 0);
+	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);
+	/* The cursor is chosen by size's lowest set bit (its alignment). */
+	offset = metaslab_pick_block(sm, size,
+	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
+	if (offset != -1ULL) {	/* move the range: ms_map -> allocmap */
+		space_map_remove(sm, offset, size);
+		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	}
+	return (offset);
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+int
+metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	objset_t *os = spa->spa_meta_objset;
+	vdev_t *vd;
+	metaslab_t *msp;
+	space_map_t *sm;
+	space_map_obj_t *smo;
+	int error;
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+		return (ENXIO);	/* no such top-level vdev */
+
+	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+		return (ENXIO);	/* offset past the last metaslab */
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	sm = &msp->ms_map;
+	smo = msp->ms_smo;
+	/* For a gang DVA, use the allocated size of a gang block instead. */
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+	/* Load the free map if it is not in core yet. */
+	if (msp->ms_map_incore == 0) {
+		error = space_map_load(sm, smo, SM_FREE, os,
+		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
+		ASSERT(error == 0);
+		if (error) {
+			mutex_exit(&msp->ms_lock);
+			return (error);
+		}
+		msp->ms_map_incore = 1;
+		/* XXX -- we'll need a call to picker_init here */
+		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
+	}
+	/* Move the claimed range from the free map to this txg's allocmap. */
+	space_map_remove(sm, offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	/* First allocation in this txg: mark dirty and queue for sync. */
+	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
+		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
+		msp->ms_last_alloc = txg;
+		vdev_dirty(vd, VDD_ALLOC, txg);
+		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+	}
+
+	mutex_exit(&msp->ms_lock);
+
+	return (0);
+}
+
+static int
+metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+	/* XXX -- We should probably not assume we know what ms_weight means */
+	int heavy = (msp->ms_weight >= size);
+	int mine = (msp->ms_last_alloc == txg);
+
+	/* Enforce txg segregation: skip metaslabs in use by another txg. */
+	if (!mine && msp->ms_last_alloc != 0)
+		return (0);
+
+	/* Either way, room in ms_map plus enough weight suffices. */
+	if (msp->ms_map.sm_space >= size && heavy)
+		return (1);
+	if (mine)
+		return (0);
+	/* XXX -- the weight test should be in terms of MINFREE */
+	return (msp->ms_usable_space >= size && heavy);
+}
+
+static metaslab_t *
+metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
+{
+	avl_tree_t *tree = &mg->mg_metaslab_tree;
+	metaslab_t *cand;
+
+	/* Walk the tree in order; stop at the first usable metaslab. */
+	mutex_enter(&mg->mg_lock);
+	cand = avl_first(tree);
+	while (cand != NULL && !metaslab_usable(cand, size, txg))
+		cand = AVL_NEXT(tree, cand);
+	mutex_exit(&mg->mg_lock);
+	return (cand);
+}
+
+static metaslab_t *
+metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
+    uint64_t *offp, uint64_t txg)
+{
+	metaslab_t *msp;
+	int error;
+	/* Try usable metaslabs until one yields a block of size bytes. */
+	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
+		space_map_obj_t *smo = msp->ms_smo;
+		mutex_enter(&msp->ms_lock);
+		if (!metaslab_usable(msp, size, txg)) { /* recheck, locked */
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+		if (msp->ms_map_incore == 0) {	/* load the free map first */
+			error = space_map_load(&msp->ms_map, smo, SM_FREE,
+			    spa->spa_meta_objset, msp->ms_usable_end,
+			    msp->ms_map.sm_size - msp->ms_usable_space);
+			ASSERT(error == 0);
+			if (error) {	/* weight 0 takes msp out of play */
+				mutex_exit(&msp->ms_lock);
+				metaslab_group_sort(mg, msp, 0);
+				continue;
+			}
+			msp->ms_map_incore = 1;
+			/* XXX -- we'll need a call to picker_init here */
+			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
+		}
+		*offp = metaslab_getblock(msp, size, txg);
+		if (*offp != -1ULL) {
+			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
+				vdev_t *vd = mg->mg_vd;
+				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
+				msp->ms_last_alloc = txg;
+				vdev_dirty(vd, VDD_ALLOC, txg);
+				(void) txg_list_add(&vd->vdev_ms_list,
+				    msp, txg);
+			}
+			mutex_exit(&msp->ms_lock);
+			return (msp);
+		}
+		mutex_exit(&msp->ms_lock);	/* no fit: demote msp */
+		metaslab_group_sort(msp->ms_group, msp, size - 1);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+{
+	metaslab_t *msp;
+	metaslab_group_t *mg, *rotor;
+	metaslab_class_t *mc;
+	vdev_t *vd;
+	uint64_t offset = -1ULL;
+	uint64_t asize;
+
+	mc = spa_metaslab_class_select(spa);
+
+	/*
+	 * Start at the rotor and loop through all mgs until we find something.
+	 * Note that there's no locking on mc_rotor or mc_allocated because
+	 * nothing actually breaks if we miss a few updates -- we just won't
+	 * allocate quite as evenly.  It all balances out over time.
+	 */
+	mg = rotor = mc->mc_rotor;	/* NOTE(review): assumes non-NULL */
+	do {
+		vd = mg->mg_vd;
+		asize = vdev_psize_to_asize(vd, psize);
+		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
+		if (msp != NULL) {
+			ASSERT(offset != -1ULL);
+
+			/*
+			 * If we've just selected this metaslab group,
+			 * figure out whether the corresponding vdev is
+			 * over- or under-used relative to the pool,
+			 * and set an allocation bias to even it out.
+			 */
+			if (mc->mc_allocated == 0) {
+				vdev_stat_t *vs = &vd->vdev_stat;
+				uint64_t alloc, space;
+				int64_t vu, su;
+
+				alloc = spa_get_alloc(spa);
+				space = spa_get_space(spa);
+
+				/*
+				 * Determine percent used in units of 0..1024.
+				 * (This is just to avoid floating point.)
+				 */
+				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
+				su = (alloc << 10) / (space + 1);
+
+				/*
+				 * Bias by at most +/- 25% of the aliquot.
+				 */
+				mg->mg_bias = ((su - vu) *
+				    (int64_t)mg->mg_aliquot) / (1024 * 4);
+
+				dprintf("bias = %lld\n", mg->mg_bias);
+			}
+			/* Aliquot consumed: rotate to the next group. */
+			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+			    mg->mg_aliquot + mg->mg_bias) {
+				mc->mc_rotor = mg->mg_next;
+				mc->mc_allocated = 0;
+			}
+			/* Describe the new block in the caller's DVA. */
+			DVA_SET_VDEV(dva, vd->vdev_id);
+			DVA_SET_OFFSET(dva, offset);
+			DVA_SET_GANG(dva, 0);
+			DVA_SET_ASIZE(dva, asize);
+
+			return (0);
+		}
+		mc->mc_rotor = mg->mg_next;	/* no luck; try next group */
+		mc->mc_allocated = 0;
+	} while ((mg = mg->mg_next) != rotor);
+
+	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
+	/* Every group failed: zero the DVA fields and report ENOSPC. */
+	DVA_SET_VDEV(dva, 0);
+	DVA_SET_OFFSET(dva, 0);
+	DVA_SET_GANG(dva, 0);
+
+	return (ENOSPC);
+}
+
+/*
+ * Free the block represented by DVA in the context of the specified
+ * transaction group.
+ */
+void
+metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
+{
+	uint64_t vdev = DVA_GET_VDEV(dva);
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t size = DVA_GET_ASIZE(dva);
+	vdev_t *vd;
+	metaslab_t *msp;
+
+	if (txg > spa_freeze_txg(spa))
+		return;	/* txg is beyond spa_freeze_txg(): drop the free */
+
+	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
+		    (u_longlong_t)vdev);
+		ASSERT(0);
+		return;
+	}
+
+	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
+		    (u_longlong_t)offset);
+		ASSERT(0);
+		return;
+	}
+
+	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	/* For a gang DVA, use the allocated size of a gang block instead. */
+	if (DVA_GET_GANG(dva))
+		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+	mutex_enter(&msp->ms_lock);
+	/* First free in this txg: mark dirty and queue for sync. */
+	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
+		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
+		vdev_dirty(vd, VDD_FREE, txg);
+		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
+	}
+	/* Record the range in this txg's freemap. */
+	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+
+	mutex_exit(&msp->ms_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c
new file mode 100644
index 000000000000..411ed46e13d7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/refcount.c
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#if defined(DEBUG) || !defined(_KERNEL)
+
+#ifdef _KERNEL
+int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+#else
+int reference_tracking_enable = TRUE; /* on by default outside the kernel */
+#endif
+int reference_history = 4; /* tunable: bounds the rc_removed list length */
+
+static kmem_cache_t *reference_cache;		/* reference_t records */
+static kmem_cache_t *reference_history_cache;	/* ref_removed buffers */
+
+void
+refcount_init(void)
+{
+	reference_cache = kmem_cache_create("reference_cache",
+	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	/* Buffers from this cache are sizeof (uint64_t) bytes each. */
+	reference_history_cache = kmem_cache_create("reference_history_cache",
+	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+refcount_fini(void)
+{
+	kmem_cache_destroy(reference_cache);	/* made in refcount_init() */
+	kmem_cache_destroy(reference_history_cache);
+}
+
+void
+refcount_create(refcount_t *rc)
+{
+	list_create(&rc->rc_list, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));	/* tracked holds */
+	list_create(&rc->rc_removed, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));	/* released-hold history */
+	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+refcount_destroy_many(refcount_t *rc, uint64_t number)
+{
+	reference_t *ref;
+	/* Tear down rc, which must hold exactly "number" references. */
+	ASSERT(rc->rc_count == number);
+	while ((ref = list_head(&rc->rc_list)) != NULL) {
+		list_remove(&rc->rc_list, ref);
+		kmem_cache_free(reference_cache, ref);
+	}
+	list_destroy(&rc->rc_list);
+	/* Entries on rc_removed also carry a ref_removed buffer to free. */
+	while ((ref = list_head(&rc->rc_removed)) != NULL) {
+		list_remove(&rc->rc_removed, ref);
+		kmem_cache_free(reference_history_cache, ref->ref_removed);
+		kmem_cache_free(reference_cache, ref);
+	}
+	list_destroy(&rc->rc_removed);
+	mutex_destroy(&rc->rc_mtx);
+}
+
+void
+refcount_destroy(refcount_t *rc)
+{
+	refcount_destroy_many(rc, 0);	/* asserts that rc_count is 0 */
+}
+
+int
+refcount_is_zero(refcount_t *rc)
+{
+	ASSERT(rc->rc_count >= 0);
+	return (rc->rc_count == 0);	/* read without taking rc_mtx */
+}
+
+int64_t
+refcount_count(refcount_t *rc)
+{
+	ASSERT(rc->rc_count >= 0);
+	return (rc->rc_count);		/* read without taking rc_mtx */
+}
+
+int64_t
+refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
+{
+	reference_t *ref = NULL;
+	int64_t count;
+	/* Add "number" holds on behalf of holder; returns the new count. */
+	if (reference_tracking_enable) {
+		ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+		ref->ref_holder = holder;
+		ref->ref_number = number;
+	}
+	mutex_enter(&rc->rc_mtx);
+	ASSERT(rc->rc_count >= 0);
+	if (ref != NULL)	/* tracking was on when sampled above */
+		list_insert_head(&rc->rc_list, ref);
+	rc->rc_count += number;
+	count = rc->rc_count;
+	mutex_exit(&rc->rc_mtx);
+
+	return (count);
+}
+
+int64_t
+refcount_add(refcount_t *rc, void *holder)
+{
+	return (refcount_add_many(rc, 1, holder));	/* a single hold */
+}
+
+int64_t
+refcount_remove_many(refcount_t *rc, uint64_t number, void *holder)
+{
+	reference_t *ref;
+	int64_t count;
+	/* Drop "number" holds of holder; panic if tracking finds no match. */
+	mutex_enter(&rc->rc_mtx);
+	ASSERT(rc->rc_count >= number);
+	/* Untracked mode: just adjust the count. */
+	if (!reference_tracking_enable) {
+		rc->rc_count -= number;
+		count = rc->rc_count;
+		mutex_exit(&rc->rc_mtx);
+		return (count);
+	}
+	/* Tracked mode: find the reference with this holder and number. */
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == holder && ref->ref_number == number) {
+			list_remove(&rc->rc_list, ref);
+			if (reference_history > 0) {	/* keep history */
+				ref->ref_removed =
+				    kmem_cache_alloc(reference_history_cache,
+				    KM_SLEEP);
+				list_insert_head(&rc->rc_removed, ref);
+				rc->rc_removed_count++;
+				if (rc->rc_removed_count >= reference_history) {
+					ref = list_tail(&rc->rc_removed);
+					list_remove(&rc->rc_removed, ref);
+					kmem_cache_free(reference_history_cache,
+					    ref->ref_removed);
+					kmem_cache_free(reference_cache, ref);
+					rc->rc_removed_count--;
+				}
+			} else {
+				kmem_cache_free(reference_cache, ref);
+			}
+			rc->rc_count -= number;
+			count = rc->rc_count;
+			mutex_exit(&rc->rc_mtx);
+			return (count);
+		}
+	}
+	panic("No such hold %p on refcount %llx", holder,
+	    (u_longlong_t)(uintptr_t)rc);
+	return (-1);
+}
+
+int64_t
+refcount_remove(refcount_t *rc, void *holder)
+{
+	return (refcount_remove_many(rc, 1, holder));	/* a single hold */
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c
new file mode 100644
index 000000000000..ce5c26131af5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sha256.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ * 	Ch(x, y, z)     (((x) & (y)) ^ ((~(x)) & (z)))
+ * 	Maj(x, y, z)    (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define	Ch(x, y, z)	((z) ^ ((x) & ((y) ^ (z))))
+#define	Maj(x, y, z)	(((x) & (y)) ^ ((z) & ((x) ^ (y))))
+/*
+ * Rotate x right by s bits.  Both arguments are fully parenthesized so that
+ * an expression such as "a + b" may be passed as the shift count.  The
+ * arithmetic assumes x is an unsigned 32-bit value and 0 < s < 32.
+ */
+#define	Rot32(x, s)	(((x) >> (s)) | ((x) << (32 - (s))))
+#define	SIGMA0(x)	(Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define	SIGMA1(x)	(Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define	sigma0(x)	(Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define	sigma1(x)	(Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+/*
+ * The sixty-four SHA-256 round constants from FIPS 180-2, indexed by round
+ * number in SHA256Transform().
+ */
+static const uint32_t SHA256_K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Fold one 64-byte message block into the running hash state.
+ *
+ *	H	the eight 32-bit hash words; updated in place
+ *	cp	the 64 bytes of the block, read as sixteen big-endian
+ *		32-bit words
+ */
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+	uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+	/*
+	 * Widen each byte to uint32_t before shifting.  A bare uint8_t is
+	 * promoted to signed int, and shifting a byte >= 0x80 left by 24
+	 * bits would overflow that int, which is undefined behavior.
+	 */
+	for (t = 0; t < 16; t++, cp += 4)
+		W[t] = ((uint32_t)cp[0] << 24) | ((uint32_t)cp[1] << 16) |
+		    ((uint32_t)cp[2] << 8) | (uint32_t)cp[3];
+
+	/* Expand the sixteen input words into the 64-entry schedule. */
+	for (t = 16; t < 64; t++)
+		W[t] = sigma1(W[t - 2]) + W[t - 7] +
+		    sigma0(W[t - 15]) + W[t - 16];
+
+	a = H[0]; b = H[1]; c = H[2]; d = H[3];
+	e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+	for (t = 0; t < 64; t++) {
+		T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+		T2 = SIGMA0(a) + Maj(a, b, c);
+		h = g; g = f; f = e; e = d + T1;
+		d = c; c = b; b = a; a = T1 + T2;
+	}
+
+	H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+	H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+/*
+ * Compute the SHA-256 digest of the 'size' bytes at 'buf' and store it in
+ * 'zcp' as four 64-bit words, each built from two consecutive 32-bit hash
+ * words (most significant first).
+ *
+ * Whole 64-byte blocks are hashed straight from 'buf'.  The remaining
+ * (size % 64) trailing bytes are copied into 'pad', followed by the 0x80
+ * marker, zero fill, and the 64-bit big-endian bit length, giving one or two
+ * final blocks.
+ *
+ * NOTE: the trailing bytes are taken from the end of 'buf'.  Earlier code
+ * copied them from the start of 'buf', so digests for sizes that are not a
+ * multiple of 64 differ from what that code produced; sizes that are a
+ * multiple of 64 are unaffected.
+ */
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+	    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+	const uint8_t *data = buf;
+	uint64_t full = size & ~63ULL;
+	uint64_t off;
+	uint8_t pad[128];
+	int padsize = 0;
+	int i;
+
+	/* 64-bit offset: an int index could overflow for large sizes. */
+	for (off = 0; off < full; off += 64)
+		SHA256Transform(H, data + off);
+
+	/* Carry on from 'off' so that the tail of the buffer is hashed. */
+	for (; off < size; off++)
+		pad[padsize++] = data[off];
+
+	for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+		pad[padsize] = 0;
+
+	/* Append the message length in bits, most significant byte first. */
+	for (i = 0; i < 8; i++)
+		pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+	for (i = 0; i < padsize; i += 64)
+		SHA256Transform(H, pad + i);
+
+	ZIO_SET_CHECKSUM(zcp,
+	    (uint64_t)H[0] << 32 | H[1],
+	    (uint64_t)H[2] << 32 | H[3],
+	    (uint64_t)H[4] << 32 | H[5],
+	    (uint64_t)H[6] << 32 | H[7]);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
new file mode 100644
index 000000000000..43112d931960
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -0,0 +1,1784 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dmu_traverse.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/callb.h>
+
+static uint32_t spa_active_count;
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+/*
+ * Activate an uninitialized pool: mark it ACTIVE and create the in-core
+ * resources it needs -- the normal metaslab class, the vdev retry taskq, an
+ * issue and an interrupt taskq for every zio type, the traverse lock, the
+ * dirty vdev list, and the vdev txg list.
+ */
+static void
+spa_activate(spa_t *spa)
+{
+	int type;
+
+	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+	spa->spa_state = POOL_STATE_ACTIVE;
+	spa->spa_normal_class = metaslab_class_create();
+	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", 4,
+	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
+
+	/* One issue queue and one interrupt queue per zio type. */
+	type = 0;
+	while (type < ZIO_TYPES) {
+		spa->spa_zio_issue_taskq[type] =
+		    taskq_create("spa_zio_issue", 8, maxclsyspri, 50,
+		    INT_MAX, TASKQ_PREPOPULATE);
+		spa->spa_zio_intr_taskq[type] =
+		    taskq_create("spa_zio_intr", 8, maxclsyspri, 50,
+		    INT_MAX, TASKQ_PREPOPULATE);
+		type++;
+	}
+
+	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
+	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
+	    offsetof(vdev_t, vdev_dirty_node));
+	txg_list_create(&spa->spa_vdev_txg_list,
+	    offsetof(struct vdev, vdev_txg_node));
+}
+
+/*
+ * Opposite of spa_activate().
+ *
+ * The caller must already have stopped syncing and torn down the DSL pool
+ * and the root vdev (all three are asserted), and the pool must not be in
+ * the UNINITIALIZED state.  Resources are released in the reverse of the
+ * order in which spa_activate() created them: the vdev txg list, the dirty
+ * list, the traverse lock, the per-zio-type issue and interrupt taskqs, the
+ * vdev retry taskq, and finally the normal metaslab class.  Each taskq and
+ * class pointer is cleared after it is destroyed, and the pool is left in
+ * POOL_STATE_UNINITIALIZED.
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+	int t;
+
+	ASSERT(spa->spa_sync_on == B_FALSE);
+	ASSERT(spa->spa_dsl_pool == NULL);
+	ASSERT(spa->spa_root_vdev == NULL);
+
+	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+	txg_list_destroy(&spa->spa_vdev_txg_list);
+
+	list_destroy(&spa->spa_dirty_list);
+
+	rw_destroy(&spa->spa_traverse_lock);
+
+	for (t = 0; t < ZIO_TYPES; t++) {
+		taskq_destroy(spa->spa_zio_issue_taskq[t]);
+		taskq_destroy(spa->spa_zio_intr_taskq[t]);
+		spa->spa_zio_issue_taskq[t] = NULL;
+		spa->spa_zio_intr_taskq[t] = NULL;
+	}
+
+	taskq_destroy(spa->spa_vdev_retry_taskq);
+	spa->spa_vdev_retry_taskq = NULL;
+
+	metaslab_class_destroy(spa->spa_normal_class);
+	spa->spa_normal_class = NULL;
+
+	spa->spa_state = POOL_STATE_UNINITIALIZED;
+}
+
+/*
+ * Verify a pool configuration and build the matching vdev tree.  A vdev is
+ * allocated for 'nv' under 'parent' with child index 'id'; unless it is a
+ * leaf, every entry of its ZPOOL_CONFIG_CHILDREN array is parsed recursively
+ * beneath it.  All vdev validation is done by vdev_alloc().  This preps the
+ * pool before open/creation/import.
+ *
+ * Returns the new vdev, or NULL if allocation fails, a non-leaf has no
+ * children array, or any child fails to parse; in the latter two cases the
+ * vdev allocated here is freed with vdev_free() first.
+ */
+static vdev_t *
+spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
+{
+	nvlist_t **kids;
+	uint_t nkids, i;
+	vdev_t *vd;
+
+	vd = vdev_alloc(spa, nv, parent, id, atype);
+	if (vd == NULL)
+		return (NULL);
+
+	/* A leaf has no children to descend into. */
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (vd);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		goto fail;
+
+	for (i = 0; i < nkids; i++) {
+		if (spa_config_parse(spa, kids[i], vd, i, atype) == NULL)
+			goto fail;
+	}
+
+	return (vd);
+
+fail:
+	vdev_free(vd);
+	return (NULL);
+}
+
+/*
+ * Opposite of spa_load(): stop the sync thread if it is running, drain
+ * readers of the config lock, close the DSL pool, and free the vdev tree.
+ * Each step is skipped when the corresponding state was never set up, so
+ * this can be called after a partially failed load.
+ */
+static void
+spa_unload(spa_t *spa)
+{
+	vdev_t *rvd;
+
+	/* Stop syncing. */
+	if (spa->spa_sync_on) {
+		txg_sync_stop(spa->spa_dsl_pool);
+		spa->spa_sync_on = B_FALSE;
+	}
+
+	/*
+	 * Taking and dropping the config lock as writer waits for any
+	 * outstanding prefetch I/O to complete.
+	 */
+	spa_config_enter(spa, RW_WRITER);
+	spa_config_exit(spa);
+
+	/* Close the dsl pool. */
+	if (spa->spa_dsl_pool != NULL) {
+		dsl_pool_close(spa->spa_dsl_pool);
+		spa->spa_dsl_pool = NULL;
+	}
+
+	/* Close all vdevs. */
+	rvd = spa->spa_root_vdev;
+	if (rvd != NULL) {
+		vdev_free(rvd);
+		spa->spa_root_vdev = NULL;
+	}
+}
+
+/*
+ * Load an existing storage pool, using the pool's builtin spa_config as a
+ * source of configuration information.  The 'readonly' flag will prevent us
+ * from writing any updated state to disk, and can be used when testing a pool
+ * for import.
+ *
+ * 'import' makes the load fail with EEXIST when spa_guid_exists() reports the
+ * pool guid, and is passed through to vdev_load().  When 'mosconfig' is
+ * false, the supplied config is only used to get as far as reading the config
+ * object stored in the MOS; the pool is then unloaded and reactivated, and
+ * spa_load() calls itself with that stored config and mosconfig set to
+ * B_TRUE.
+ *
+ * Returns 0 on success.  Returns EINVAL if the config lacks a vdev tree or
+ * pool guid or cannot be parsed into a vdev tree; ENXIO if the vdevs cannot
+ * be opened, no valid uberblock is found, the guid sum does not match, the
+ * stored config cannot be read or unpacked, or the root vdev ends up
+ * unopenable; otherwise whatever vdev_load() reports.  Nothing is torn down
+ * here on failure; the callers in this file follow a failed load with
+ * spa_unload() and spa_deactivate().
+ */
+static int
+spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+{
+	int error = 0;
+	nvlist_t *nvroot = NULL;
+	vdev_t *rvd;
+	uberblock_t *ub = &spa->spa_uberblock;
+	uint64_t pool_guid;
+	zio_t *zio;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
+		return (EINVAL);
+
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &spa->spa_config_txg);
+
+	if (import && spa_guid_exists(pool_guid, 0))
+		return (EEXIST);
+
+	/*
+	 * Parse the configuration into a vdev tree.
+	 */
+	spa_config_enter(spa, RW_WRITER);
+	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
+	spa_config_exit(spa);
+
+	if (rvd == NULL)
+		return (EINVAL);
+
+	spa->spa_root_vdev = rvd;
+	ASSERT(spa_guid(spa) == pool_guid);
+
+	/*
+	 * Try to open all vdevs, loading each label in the process.
+	 */
+	if (vdev_open(rvd) != 0)
+		return (ENXIO);
+
+	/*
+	 * Find the best uberblock.
+	 */
+	bzero(ub, sizeof (uberblock_t));
+
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	vdev_uberblock_load(zio, rvd, ub);
+	error = zio_wait(zio);
+
+	/*
+	 * If we weren't able to find a single valid uberblock, return failure.
+	 * NOTE(review): the zio_wait() result stored in 'error' above is not
+	 * examined; only ub_txg decides whether the search succeeded.
+	 */
+	if (ub->ub_txg == 0) {
+		dprintf("ub_txg is zero\n");
+		return (ENXIO);
+	}
+
+	/*
+	 * If the vdev guid sum doesn't match the uberblock, we have an
+	 * incomplete configuration.  This is only enforced when the config
+	 * came from the MOS (mosconfig is set).
+	 */
+	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
+		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
+		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
+		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
+		    rvd->vdev_guid_sum, ub->ub_guid_sum);
+		return (ENXIO);
+	}
+
+	/*
+	 * Initialize internal SPA structures.
+	 */
+	spa->spa_state = POOL_STATE_ACTIVE;
+	spa->spa_ubsync = spa->spa_uberblock;
+	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+	VERIFY(zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+
+	if (!mosconfig) {
+		dmu_buf_t *db;
+		char *packed = NULL;
+		size_t nvsize = 0;
+		nvlist_t *newconfig = NULL;
+
+		db = dmu_bonus_hold(spa->spa_meta_objset,
+		    spa->spa_config_object);
+		dmu_buf_read(db);
+		nvsize = *(uint64_t *)db->db_data;
+		dmu_buf_rele(db);
+
+		packed = kmem_alloc(nvsize, KM_SLEEP);
+		error = dmu_read_canfail(spa->spa_meta_objset,
+		    spa->spa_config_object, 0, nvsize, packed);
+		if (error == 0)
+			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
+		kmem_free(packed, nvsize);
+
+		if (error)
+			return (ENXIO);
+
+		spa_config_set(spa, newconfig);
+
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_activate(spa);
+
+		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+	}
+
+	VERIFY(zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+
+	/*
+	 * Load the vdev state for all top level vdevs.
+	 */
+	if ((error = vdev_load(rvd, import)) != 0)
+		return (error);
+
+	/*
+	 * Propagate the leaf DTLs we just loaded all the way up the tree.
+	 */
+	spa_config_enter(spa, RW_WRITER);
+	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+	spa_config_exit(spa);
+
+	/*
+	 * Check the state of the root vdev.  If it can't be opened, it
+	 * indicates one or more toplevel vdevs are faulted.
+	 */
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+		return (ENXIO);
+
+	/*
+	 * Claim log blocks that haven't been committed yet, and update all
+	 * top-level vdevs to sync any config changes found in vdev_load().
+	 * This must all happen in a single txg.  Skipped entirely when the
+	 * SPA is not open for writing or the caller asked for 'readonly'.
+	 */
+	if ((spa_mode & FWRITE) && !readonly) {
+		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
+		    spa_first_txg(spa));
+		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
+		vdev_config_dirty(rvd);
+		dmu_tx_commit(tx);
+
+		spa->spa_sync_on = B_TRUE;
+		txg_sync_start(spa->spa_dsl_pool);
+
+		/*
+		 * Wait for all claims to sync.
+		 */
+		txg_wait_synced(spa->spa_dsl_pool, 0);
+	}
+
+	return (0);
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is
+ * sent down from userland, instead of grabbed from the configuration cache.
+ * For the case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * Looks up 'pool', loading it first if it is still uninitialized, takes a
+ * reference on it for 'tag' and hands it back through 'spapp'.  When
+ * 'config' is non-NULL it receives a freshly generated config, which lets a
+ * caller gather vdev state at the same time as opening the pool; on a failed
+ * load that config is still generated if a root vdev exists.
+ *
+ * Returns 0 on success, ENOENT if the pool is unknown or its load returned
+ * EBADF (in which case it is also removed from the namespace), or the error
+ * from spa_load().  '*spapp' is NULL whenever an error is returned.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+{
+	spa_t *spa;
+	int error;
+	int just_loaded = B_FALSE;
+	int took_lock = B_FALSE;
+
+	*spapp = NULL;
+
+	/*
+	 * As disgusting as this is, we need to support recursive calls to this
+	 * function because dsl_dir_open() is called during spa_load(), and ends
+	 * up calling spa_open() again.  The real fix is to figure out how to
+	 * avoid dsl_dir_open() calling this in the first place.
+	 */
+	if (mutex_owner(&spa_namespace_lock) != curthread) {
+		mutex_enter(&spa_namespace_lock);
+		took_lock = B_TRUE;
+	}
+
+	spa = spa_lookup(pool);
+	if (spa == NULL) {
+		if (took_lock)
+			mutex_exit(&spa_namespace_lock);
+		return (ENOENT);
+	}
+
+	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+		spa_activate(spa);
+
+		error = spa_load(spa, spa->spa_config,
+		    B_FALSE, B_FALSE, B_FALSE);
+
+		if (error != 0) {
+			if (error == EBADF) {
+				/*
+				 * If vdev_load() returns EBADF, it indicates
+				 * that one of the vdevs indicates that the
+				 * pool has been exported or destroyed.  If
+				 * this is the case, the config cache is out
+				 * of sync and we should remove the pool from
+				 * the namespace.
+				 */
+				spa_unload(spa);
+				spa_deactivate(spa);
+				spa_remove(spa);
+				spa_config_sync();
+				error = ENOENT;
+			} else {
+				/*
+				 * We can't open the pool, but we still have
+				 * useful information: the state of each vdev
+				 * after the attempted vdev_open().  Return
+				 * this to the user.
+				 */
+				if (config != NULL &&
+				    spa->spa_root_vdev != NULL)
+					*config = spa_config_generate(spa,
+					    NULL, -1ULL, B_TRUE);
+				spa_unload(spa);
+				spa_deactivate(spa);
+				*spapp = NULL;
+			}
+			if (took_lock)
+				mutex_exit(&spa_namespace_lock);
+			return (error);
+		}
+
+		just_loaded = B_TRUE;
+	}
+
+	spa_open_ref(spa, tag);
+	if (took_lock)
+		mutex_exit(&spa_namespace_lock);
+
+	*spapp = spa;
+
+	if (config != NULL) {
+		spa_config_enter(spa, RW_READER);
+		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+		spa_config_exit(spa);
+	}
+
+	/*
+	 * If we just loaded the pool, resilver anything that's out of date.
+	 */
+	if (just_loaded && (spa_mode & FWRITE))
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+	return (0);
+}
+
+/*
+ * Open the pool called 'name' and return it through 'spapp', holding a
+ * reference for 'tag'.  Same as spa_open_common() with no config requested.
+ */
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+	int error;
+
+	error = spa_open_common(name, spapp, tag, NULL);
+
+	return (error);
+}
+
+/*
+ * Fetch the current config of the pool called 'name' into '*config'.  The
+ * pool is opened through spa_open_common() only long enough to generate the
+ * config and is closed again before returning.  '*config' starts out NULL
+ * and may still be filled in when the open fails; the open's error code is
+ * returned either way.
+ */
+int
+spa_get_stats(const char *name, nvlist_t **config)
+{
+	spa_t *spa;
+	int error;
+
+	*config = NULL;
+
+	error = spa_open_common(name, &spa, FTAG, config);
+	if (spa == NULL)
+		return (error);
+
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+/*
+ * Pool Creation
+ *
+ * Create a new pool named 'pool' from the vdev description in 'nvroot'.
+ * Returns EEXIST if the name is already in the namespace, or the error from
+ * spa_vdev_add() if the vdevs cannot be added; in the latter case the
+ * partially built spa_t is unloaded, deactivated and removed again.
+ * Otherwise the DSL pool, the pool config object and the deferred-free
+ * bplist object are created in txg TXG_INITIAL, syncing is started, and we
+ * wait for that txg to sync before syncing the config cache and returning 0.
+ * If 'altroot' is non-NULL a copy is stored in spa_root and
+ * spa_active_count is incremented.  spa_namespace_lock is held throughout.
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
+{
+	spa_t *spa;
+	dsl_pool_t *dp;
+	dmu_tx_t *tx;
+	int error;
+	uint64_t txg = TXG_INITIAL;
+
+	/*
+	 * If this pool already exists, return failure.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if (spa_lookup(pool) != NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (EEXIST);
+	}
+	spa = spa_add(pool);
+
+	/*
+	 * Activate the spa_t that spa_add() just returned, and seed its
+	 * uberblock so that the last synced txg is the one before 'txg'.
+	 */
+	spa_activate(spa);
+
+	spa->spa_uberblock.ub_txg = txg - 1;
+	spa->spa_ubsync = spa->spa_uberblock;
+
+	error = spa_vdev_add(spa, nvroot);
+
+	if (error) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_remove(spa);
+		mutex_exit(&spa_namespace_lock);
+		return (error);
+	}
+
+	if (altroot != NULL) {
+		spa->spa_root = spa_strdup(altroot);
+		atomic_add_32(&spa_active_count, 1);
+	}
+
+	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
+	spa->spa_meta_objset = dp->dp_meta_objset;
+
+	tx = dmu_tx_create_assigned(dp, txg);
+
+	/*
+	 * Create the pool config object.
+	 */
+	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+	    DMU_OT_PACKED_NVLIST, 1 << 14,
+	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+	VERIFY(zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+
+	/*
+	 * Create the deferred-free bplist object.  Turn off compression
+	 * because sync-to-convergence takes longer if the blocksize
+	 * keeps changing.
+	 */
+	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+	    1 << 14, tx);
+	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
+	    ZIO_COMPRESS_OFF, tx);
+
+	VERIFY(zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
+	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+
+	dmu_tx_commit(tx);
+
+	spa->spa_sync_on = B_TRUE;
+	txg_sync_start(spa->spa_dsl_pool);
+
+	/*
+	 * We explicitly wait for the first transaction to complete so that our
+	 * bean counters are appropriately updated.
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, txg);
+
+	spa_config_sync();
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+/*
+ * Import the given pool into the system.  We set up the necessary spa_t and
+ * then call spa_load() to do the dirty work.
+ *
+ * Returns EROFS if the SPA is not open for writing, EEXIST if a pool named
+ * 'pool' is already in the namespace, or the error from spa_load() (after
+ * undoing the add).  On success the alternate root, if any, is recorded,
+ * the pool's config is regenerated from the in-core state, the config cache
+ * is synced, and a resilver is kicked off after spa_namespace_lock has been
+ * dropped.  Note that the 'config' argument is reused to hold the generated
+ * config once the load has succeeded.
+ */
+int
+spa_import(const char *pool, nvlist_t *config, char *altroot)
+{
+	spa_t *spa;
+	int error;
+
+	if (!(spa_mode & FWRITE))
+		return (EROFS);
+
+	/*
+	 * If a pool with this name exists, return failure.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if (spa_lookup(pool) != NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (EEXIST);
+	}
+
+	/*
+	 * Create and initialize the spa structure
+	 */
+	spa = spa_add(pool);
+	spa_activate(spa);
+
+	/*
+	 * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
+	 * so that we don't try to open the pool if the config is damaged.
+	 */
+	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+
+	if (error) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+		spa_remove(spa);
+		mutex_exit(&spa_namespace_lock);
+		return (error);
+	}
+
+	/*
+	 * Set the alternate root, if there is one.
+	 */
+	if (altroot != NULL) {
+		atomic_add_32(&spa_active_count, 1);
+		spa->spa_root = spa_strdup(altroot);
+	}
+
+	/*
+	 * Initialize the config based on the in-core state.
+	 */
+	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);
+
+	spa_config_set(spa, config);
+
+	/*
+	 * Sync the configuration cache.
+	 */
+	spa_config_sync();
+
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Resilver anything that's out of date.  The FWRITE test repeats the
+	 * check made on entry to this function.
+	 */
+	if (spa_mode & FWRITE)
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+	return (0);
+}
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define	TRYIMPORT_NAME	"$import"
+
+/*
+ * Trial-load the pool described by 'tryconfig' under TRYIMPORT_NAME and
+ * report what was found.  The load is done with the readonly flag set and
+ * its result is ignored; the temporary spa_t is always unloaded, deactivated
+ * and removed before returning.
+ *
+ * Returns a generated config carrying the pool name and state taken from
+ * 'tryconfig', or NULL if 'tryconfig' has no name or state or was not
+ * parsable enough to yield a root vdev.
+ */
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+	nvlist_t *result = NULL;
+	uint64_t state;
+	char *poolname;
+	spa_t *spa;
+
+	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME,
+	    &poolname) != 0 ||
+	    nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE,
+	    &state) != 0)
+		return (NULL);
+
+	mutex_enter(&spa_namespace_lock);
+
+	/* Set up a scratch spa_t under the reserved name. */
+	spa = spa_add(TRYIMPORT_NAME);
+	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+	spa_activate(spa);
+
+	/*
+	 * Let spa_load() do the heavy lifting.  mosconfig is TRUE so that we
+	 * don't try to open the pool if the config is damaged.
+	 */
+	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+
+	/* A root vdev means 'tryconfig' was at least parsable. */
+	if (spa->spa_root_vdev != NULL) {
+		result = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+		VERIFY(nvlist_add_string(result, ZPOOL_CONFIG_POOL_NAME,
+		    poolname) == 0);
+		VERIFY(nvlist_add_uint64(result, ZPOOL_CONFIG_POOL_STATE,
+		    state) == 0);
+	}
+
+	spa_unload(spa);
+	spa_deactivate(spa);
+	spa_remove(spa);
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (result);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple.  We make sure there
+ * is no more pending I/O and any references to the pool are gone.  Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards.
+ *
+ * 'new_state' is the pool state to record before the pool is unloaded.
+ * Returns EROFS if the SPA is not open for writing, ENOENT if 'pool' is not
+ * in the namespace, EBUSY if the pool is still referenced after a forced
+ * sync, and 0 once the pool has been removed from the namespace.
+ */
+static int
+spa_export_common(char *pool, int new_state)
+{
+	spa_t *spa;
+
+	if (!(spa_mode & FWRITE))
+		return (EROFS);
+
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(pool)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		return (ENOENT);
+	}
+
+	/*
+	 * The pool will be in core if it's openable,
+	 * in which case we can modify its state.
+	 */
+	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+		/*
+		 * Objsets may be open only because they're dirty, so we
+		 * have to force it to sync before checking spa_refcnt.
+		 */
+		spa_scrub_suspend(spa);
+		txg_wait_synced(spa->spa_dsl_pool, 0);
+
+		if (!spa_refcount_zero(spa)) {
+			spa_scrub_resume(spa);
+			mutex_exit(&spa_namespace_lock);
+			return (EBUSY);
+		}
+
+		/*
+		 * Update the pool state.
+		 */
+		spa->spa_state = new_state;
+
+		spa_scrub_resume(spa);
+		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+
+		if (spa->spa_root != NULL)
+			atomic_add_32(&spa_active_count, -1);
+
+		/*
+		 * We want this to be reflected on every label,
+		 * so mark them all dirty.  spa_unload() will do the
+		 * final sync that pushes these changes out.
+		 */
+		vdev_config_dirty(spa->spa_root_vdev);
+	}
+
+	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+		spa_unload(spa);
+		spa_deactivate(spa);
+	}
+
+	spa_remove(spa);
+	spa_config_sync();
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+/*
+ * Destroy a storage pool.  Runs spa_export_common() with
+ * POOL_STATE_DESTROYED as the state to record and returns its result.
+ */
+int
+spa_destroy(char *pool)
+{
+	return (spa_export_common(pool, POOL_STATE_DESTROYED));
+}
+
+/*
+ * Export a storage pool.  Runs spa_export_common() with
+ * POOL_STATE_EXPORTED as the state to record and returns its result.
+ */
+int
+spa_export(char *pool)
+{
+	return (spa_export_common(pool, POOL_STATE_EXPORTED));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * Add capacity to a storage pool.
+ *
+ * 'nvroot' describes a root vdev whose children are the new top-level vdevs.
+ * It is parsed into a temporary tree and created with vdev_create(); each
+ * child is then initialized and marked config-dirty, being moved under the
+ * pool's real root first unless the temporary root has itself become the
+ * pool's root (the spa_create() case, when the pool has no root vdev yet).
+ * The work is bracketed by spa_vdev_enter() and spa_vdev_exit(), and the
+ * value returned is whatever spa_vdev_exit() returns for the error passed
+ * to it: EINVAL if 'nvroot' cannot be parsed, the vdev_create() error, or 0.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+	uint64_t txg;
+	int c, error;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd;
+
+	txg = spa_vdev_enter(spa);
+
+	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+	if (vd == NULL)
+		return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+	if (rvd == NULL)			/* spa_create() */
+		spa->spa_root_vdev = rvd = vd;
+
+	if ((error = vdev_create(vd, txg)) != 0)
+		return (spa_vdev_exit(spa, vd, txg, error));
+
+	/*
+	 * Transfer each top-level vdev from the temporary root
+	 * to the spa's root and initialize its metaslabs.
+	 */
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *tvd = vd->vdev_child[c];
+		if (vd != rvd) {
+			vdev_remove_child(vd, tvd);
+			tvd->vdev_id = rvd->vdev_children;
+			vdev_add_child(rvd, tvd);
+		}
+		vdev_init(tvd, txg);
+		vdev_config_dirty(tvd);
+	}
+
+	/*
+	 * Update the config based on the new in-core state.
+	 */
+	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+	return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * Attach a device to a mirror.  The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device.  If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * Every failure is reported through spa_vdev_exit(), which is handed one of
+ * these errors: ENODEV if 'path' matches no vdev, ENOTSUP if the existing
+ * device's parent is of a type that cannot take the attachment, EINVAL if
+ * 'nvroot' does not describe exactly one leaf device, the vdev_create()
+ * error, EOVERFLOW if the new device is smaller than the existing one, or
+ * EDOM if the ashift values differ (and the existing one is non-zero).
+ * On success a resilver is started and 0 is returned.
+ */
+int
+spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+{
+	uint64_t txg, open_txg;
+	int error;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;
+
+	txg = spa_vdev_enter(spa);
+
+	oldvd = vdev_lookup_by_path(rvd, path);
+
+	if (oldvd == NULL)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+	pvd = oldvd->vdev_parent;
+
+	/*
+	 * The parent must be a mirror or the root, unless we're replacing;
+	 * in that case, the parent can be anything but another replacing vdev.
+	 */
+	if (pvd->vdev_ops != &vdev_mirror_ops &&
+	    pvd->vdev_ops != &vdev_root_ops &&
+	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+	if (newrootvd == NULL || newrootvd->vdev_children != 1)
+		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+	newvd = newrootvd->vdev_child[0];
+
+	if (!newvd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+	if ((error = vdev_create(newrootvd, txg)) != 0)
+		return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+	if (newvd->vdev_psize < oldvd->vdev_psize)
+		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
+		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+
+	/*
+	 * If this is an in-place replacement, update oldvd's path and devid
+	 * to make it distinguishable from newvd, and unopenable from now on.
+	 * The new path is newvd's path with "/old" appended; the "+ 5" in
+	 * the allocation covers those four characters plus the terminator.
+	 */
+	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+		spa_strfree(oldvd->vdev_path);
+		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+		    KM_SLEEP);
+		(void) sprintf(oldvd->vdev_path, "%s/%s",
+		    newvd->vdev_path, "old");
+		if (oldvd->vdev_devid != NULL) {
+			spa_strfree(oldvd->vdev_devid);
+			oldvd->vdev_devid = NULL;
+		}
+	}
+
+	/*
+	 * If the parent is not a mirror, or if we're replacing,
+	 * insert the new mirror/replacing vdev above oldvd.
+	 */
+	if (pvd->vdev_ops != pvops)
+		pvd = vdev_add_parent(oldvd, pvops);
+
+	ASSERT(pvd->vdev_top->vdev_parent == rvd);
+	ASSERT(pvd->vdev_ops == pvops);
+	ASSERT(oldvd->vdev_parent == pvd);
+
+	/*
+	 * Extract the new device from its root and add it to pvd.
+	 */
+	vdev_remove_child(newrootvd, newvd);
+	newvd->vdev_id = pvd->vdev_children;
+	vdev_add_child(pvd, newvd);
+
+	tvd = newvd->vdev_top;
+	ASSERT(pvd->vdev_top == tvd);
+	ASSERT(tvd->vdev_parent == rvd);
+
+	/*
+	 * Update the config based on the new in-core state.
+	 */
+	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+	vdev_config_dirty(tvd);
+
+	/*
+	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
+	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+	 */
+	open_txg = txg + TXG_CONCURRENT_STATES - 1;
+
+	mutex_enter(&newvd->vdev_dtl_lock);
+	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
+	    open_txg - TXG_INITIAL + 1);
+	mutex_exit(&newvd->vdev_dtl_lock);
+
+	/*
+	 * Mark newvd's DTL dirty in this txg.
+	 */
+	vdev_dirty(tvd, VDD_DTL, txg);
+	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
+
+	dprintf("attached %s, replacing=%d\n", path, replacing);
+
+	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+
+	/*
+	 * Kick off a resilver to update newvd.
+	 */
+	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+	return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ *
+ * The device is looked up by 'path'; if 'guid' is non-zero it must also
+ * match the guid of the vdev found at that path.  All of the work happens
+ * between spa_vdev_enter() and spa_vdev_exit(), and every return path
+ * goes through spa_vdev_exit().
+ *
+ * The return value is whatever spa_vdev_exit() returns; the error code
+ * handed to it is 0 on success, or one of:
+ *
+ *	ENODEV	no vdev at 'path', or its guid does not match 'guid'
+ *	ENOTSUP	the parent is neither a mirror nor a replacing vdev, or
+ *		'replace_done' is set and vd is not child 0 of a
+ *		replacing vdev
+ *	EBUSY	vd is the only child, or no live sibling has empty DTLs
+ */
+int
+spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+{
+	uint64_t txg;
+	int c, t, error;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd, *pvd, *cvd, *tvd;
+
+	txg = spa_vdev_enter(spa);
+
+	vd = vdev_lookup_by_path(rvd, path);
+
+	if (vd == NULL)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+	if (guid != 0 && vd->vdev_guid != guid)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+	pvd = vd->vdev_parent;
+
+	/*
+	 * If replace_done is specified, only remove this device if it's
+	 * the first child of a replacing vdev.
+	 */
+	if (replace_done &&
+	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+	/*
+	 * Only mirror and replacing vdevs support detach.
+	 */
+	if (pvd->vdev_ops != &vdev_replacing_ops &&
+	    pvd->vdev_ops != &vdev_mirror_ops)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+	/*
+	 * If there's only one replica, you can't detach it.
+	 */
+	if (pvd->vdev_children <= 1)
+		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+	/*
+	 * If all siblings have non-empty DTLs, this device may have the only
+	 * valid copy of the data, which means we cannot safely detach it.
+	 *
+	 * XXX -- as in the vdev_offline() case, we really want a more
+	 * precise DTL check.
+	 *
+	 * The loop skips vd itself and any dead sibling, and breaks at the
+	 * first sibling whose vdev_dtl_map and vdev_dtl_scrub are both
+	 * empty.  Running off the end of the child array therefore means
+	 * that no such sibling exists.
+	 */
+	for (c = 0; c < pvd->vdev_children; c++) {
+		uint64_t dirty;
+
+		cvd = pvd->vdev_child[c];
+		if (cvd == vd)
+			continue;
+		if (vdev_is_dead(cvd))
+			continue;
+		mutex_enter(&cvd->vdev_dtl_lock);
+		dirty = cvd->vdev_dtl_map.sm_space |
+		    cvd->vdev_dtl_scrub.sm_space;
+		mutex_exit(&cvd->vdev_dtl_lock);
+		if (!dirty)
+			break;
+	}
+	if (c == pvd->vdev_children)
+		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+	/*
+	 * Erase the disk labels so the disk can be used for other things.
+	 * This must be done after all other error cases are handled,
+	 * but before we disembowel vd (so we can still do I/O to it).
+	 * But if we can't do it, don't treat the error as fatal --
+	 * it may be that the unwritability of the disk is the reason
+	 * it's being detached!
+	 */
+	error = vdev_label_init(vd, 0);
+	if (error)
+		dprintf("unable to erase labels on %s\n", vdev_description(vd));
+
+	/*
+	 * Remove vd from its parent and compact the parent's children.
+	 */
+	vdev_remove_child(pvd, vd);
+	vdev_compact_children(pvd);
+
+	/*
+	 * Remember one of the remaining children so we can get tvd below.
+	 */
+	cvd = pvd->vdev_child[0];
+
+	/*
+	 * If the parent mirror/replacing vdev only has one child,
+	 * the parent is no longer needed.  Remove it from the tree.
+	 */
+	if (pvd->vdev_children == 1)
+		vdev_remove_parent(cvd);
+
+	/*
+	 * We don't set tvd until now because the parent we just removed
+	 * may have been the previous top-level vdev.
+	 */
+	tvd = cvd->vdev_top;
+	ASSERT(tvd->vdev_parent == rvd);
+
+	/*
+	 * Reopen this top-level vdev to reassess health after detach.
+	 */
+	vdev_reopen(tvd, NULL);
+
+	/*
+	 * If the device we just detached was smaller than the others,
+	 * it may be possible to add metaslabs (i.e. grow the pool).
+	 */
+	vdev_metaslab_init(tvd, txg);
+
+	/*
+	 * Update the config based on the new in-core state.
+	 */
+	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
+
+	vdev_config_dirty(tvd);
+
+	/*
+	 * Mark vd's DTL as dirty in this txg.
+	 * vdev_dtl_sync() will see that vd->vdev_detached is set
+	 * and free vd's DTL object in syncing context.
+	 * But first make sure we're not on any *other* txg's DTL list,
+	 * to prevent vd from being accessed after it's freed.
+	 */
+	vdev_dirty(tvd, VDD_DTL, txg);
+	vd->vdev_detached = B_TRUE;
+	for (t = 0; t < TXG_SIZE; t++)
+		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+
+	dprintf("detached %s\n", path);
+
+	/* vd itself is handed to spa_vdev_exit() on the success path only. */
+	return (spa_vdev_exit(spa, vd, txg, 0));
+}
+
+/*
+ * If there are any replacing vdevs that have finished replacing, detach them.
+ * We can't hold the config lock across detaches, so we lock the config,
+ * build a list of candidates, unlock the config, and try each candidate.
+ *
+ * Each candidate is described by one vdev_detach_link_t.
+ */
+typedef struct vdev_detach_link {
+	char		*vdl_path;	/* private copy of the vdev's path */
+	uint64_t	vdl_guid;	/* guid of the vdev to detach */
+	list_node_t	vdl_node;	/* linkage on the candidate list */
+} vdev_detach_link_t;
+
+/*
+ * Recursively walk the vdev tree rooted at 'vd' and append to 'l' one
+ * vdev_detach_link_t for every replacing vdev that has exactly two
+ * children and whose second child has empty DTLs (both vdev_dtl_map and
+ * vdev_dtl_scrub).  The link records the path and guid of the first
+ * child.  Children are visited before their parent.
+ *
+ * Each link, and the path copy it points to, is allocated here; the
+ * consumer of the list is responsible for freeing both.
+ */
+static void
+spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+{
+	int c;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+
+	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+		vdev_t *cvd0 = vd->vdev_child[0];
+		vdev_t *cvd1 = vd->vdev_child[1];
+		vdev_detach_link_t *vdl;
+		uint64_t dirty1;
+
+		/*
+		 * Keep the OR of the two sm_space values in a uint64_t, as
+		 * spa_vdev_detach() does.  Storing it in an int could
+		 * truncate it and make a dirty DTL look clean.
+		 */
+		mutex_enter(&cvd1->vdev_dtl_lock);
+		dirty1 = cvd1->vdev_dtl_map.sm_space |
+		    cvd1->vdev_dtl_scrub.sm_space;
+		mutex_exit(&cvd1->vdev_dtl_lock);
+
+		if (dirty1 == 0) {
+			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
+			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
+			vdl->vdl_guid = cvd0->vdev_guid;
+			list_insert_tail(l, vdl);
+		}
+	}
+}
+
+/*
+ * Detach every vdev whose replacement has finished.  The candidate list
+ * is gathered with the config lock held as reader; the lock is dropped
+ * before any detach is attempted.  The result of each detach is ignored.
+ */
+void
+spa_vdev_replace_done(spa_t *spa)
+{
+	list_t candidates;
+	vdev_detach_link_t *entry;
+
+	list_create(&candidates, sizeof (vdev_detach_link_t),
+	    offsetof(vdev_detach_link_t, vdl_node));
+
+	spa_config_enter(spa, RW_READER);
+	spa_vdev_replace_done_make_list(&candidates, spa->spa_root_vdev);
+	spa_config_exit(spa);
+
+	/*
+	 * Drain the list from the head, releasing each entry (and its
+	 * path string) once its detach has been tried.
+	 */
+	for (;;) {
+		entry = list_head(&candidates);
+		if (entry == NULL)
+			break;
+
+		list_remove(&candidates, entry);
+		(void) spa_vdev_detach(spa, entry->vdl_path, entry->vdl_guid,
+		    B_TRUE);
+		spa_strfree(entry->vdl_path);
+		kmem_free(entry, sizeof (*entry));
+	}
+
+	list_destroy(&candidates);
+}
+
+/*
+ * ==========================================================================
+ * SPA Scrubbing
+ * ==========================================================================
+ */
+
+static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+
+/*
+ * Completion callback for the reads issued by spa_scrub_io_start().
+ * Frees the data buffer, counts a failed I/O in both the pool-wide and
+ * the per-vdev scrub error totals, and wakes waiters on spa_scrub_io_cv
+ * when the in-flight count drops to zero.
+ */
+static void
+spa_scrub_io_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	vdev_t *vd;
+
+	zio_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	if (zio->io_error != 0)
+		spa->spa_scrub_errors++;
+	spa->spa_scrub_inflight--;
+	if (spa->spa_scrub_inflight == 0)
+		cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
+
+	if (zio->io_error == 0)
+		return;
+
+	/* The per-vdev count is updated under the vdev's own stat lock. */
+	vd = zio->io_vd;
+	mutex_enter(&vd->vdev_stat_lock);
+	vd->vdev_stat.vs_scrub_errors++;
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Issue an asynchronous read of the block described by 'bp'.  A buffer
+ * of the block's logical size is allocated here and released by
+ * spa_scrub_io_done(), which is installed as the completion callback.
+ * spa_scrub_inflight is incremented before the read is dispatched.
+ */
+static void
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+{
+	size_t lsize = BP_GET_LSIZE(bp);
+	void *buf = zio_buf_alloc(lsize);
+	zio_t *rio;
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight++;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	rio = zio_read(NULL, spa, bp, buf, lsize, spa_scrub_io_done, NULL,
+	    priority, flags);
+	zio_nowait(rio);
+}
+
+/*
+ * Traversal callback installed by spa_scrub_locked().
+ *
+ * A block that carries a traversal error, or whose top-level vdev cannot
+ * be found, is counted as a scrub error and ERESTART is returned.
+ * Otherwise the block's allocated size is added to the vdev's examined
+ * total and a read is issued: always for a scrub, and for a resilver only
+ * when the DTL consulted contains the block's birth txg.  0 is returned
+ * in that case.
+ */
+/* ARGSUSED */
+static int
+spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
+{
+	blkptr_t *bp = &bc->bc_blkptr;
+	vdev_t *tvd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+
+	if (bc->bc_errno != 0 || tvd == NULL) {
+		/*
+		 * This block cannot be scrubbed.  Record the error and
+		 * let the rest of the pool be scrubbed.
+		 */
+		mutex_enter(&spa->spa_scrub_lock);
+		spa->spa_scrub_errors++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		if (tvd != NULL) {
+			mutex_enter(&tvd->vdev_stat_lock);
+			tvd->vdev_stat.vs_scrub_errors++;
+			mutex_exit(&tvd->vdev_stat_lock);
+		}
+
+		return (ERESTART);
+	}
+
+	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
+
+	/*
+	 * Account for the data examined so that zpool(1M) status
+	 * can report progress.
+	 */
+	mutex_enter(&tvd->vdev_stat_lock);
+	tvd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
+	mutex_exit(&tvd->vdev_stat_lock);
+
+	/* Anything other than a resilver reads every block. */
+	if (spa->spa_scrub_type != POOL_SCRUB_RESILVER) {
+		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+		return (0);
+	}
+
+	/*
+	 * Gang members may be spread across multiple vdevs, so for a
+	 * gang block the best available check is the pool-wide DTL.
+	 * XXX -- it would be better to change the allocation policy
+	 * to ensure that this can't happen.
+	 */
+	if (DVA_GET_GANG(&bp->blk_dva[0]))
+		tvd = spa->spa_root_vdev;
+
+	if (vdev_dtl_contains(&tvd->vdev_dtl_map, bp->blk_birth, 1)) {
+		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+		    ZIO_FLAG_RESILVER);
+	}
+
+	return (0);
+}
+
+/*
+ * Body of the scrub/resilver thread created by spa_scrub_locked().
+ *
+ * The thread drives the traverse handle in spa->spa_scrub_th until a stop
+ * or restart is requested or traverse_more() returns something other than
+ * EAGAIN.  It then waits for in-flight scrub I/O, reassesses the DTLs,
+ * detaches any finished replacements, updates the scrub statistics and,
+ * if a restart was requested, starts a new scrub of the same type before
+ * exiting.
+ */
+static void
+spa_scrub_thread(spa_t *spa)
+{
+	callb_cpr_t cpr;
+	traverse_handle_t *handle = spa->spa_scrub_th;
+	vdev_t *root = spa->spa_root_vdev;
+	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
+	boolean_t succeeded;
+	int err = 0;
+
+	CALLB_CPR_INIT(&cpr, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
+
+	/*
+	 * Before scanning: reopen the root vdev (purging all vdev caches),
+	 * dirty its config (so all disk labels are rewritten), and record
+	 * the scrub type in the vdev stats.
+	 */
+	spa_config_enter(spa, RW_WRITER);
+	vdev_reopen(root, NULL);
+	vdev_config_dirty(root);
+	vdev_scrub_stat_update(root, scrub_type, B_FALSE);
+	spa_config_exit(spa);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_errors = 0;
+	spa->spa_scrub_active = 1;
+
+	while (spa->spa_scrub_stop == 0) {
+		/* Wait out any suspension inside the CPR-safe section. */
+		CALLB_CPR_SAFE_BEGIN(&cpr);
+		while (spa->spa_scrub_suspend != 0) {
+			spa->spa_scrub_active = 0;
+			cv_broadcast(&spa->spa_scrub_cv);
+			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+			spa->spa_scrub_active = 1;
+		}
+		CALLB_CPR_SAFE_END(&cpr, &spa->spa_scrub_lock);
+
+		if (spa->spa_scrub_restart_txg != 0)
+			break;
+
+		/* The traverse itself runs without spa_scrub_lock. */
+		mutex_exit(&spa->spa_scrub_lock);
+		err = traverse_more(handle);
+		mutex_enter(&spa->spa_scrub_lock);
+		if (err != EAGAIN)
+			break;
+	}
+
+	/* Let every outstanding scrub I/O complete. */
+	while (spa->spa_scrub_inflight != 0)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+	if (spa->spa_scrub_restart_txg != 0)
+		err = ERESTART;
+
+	spa->spa_scrub_active = 0;
+	cv_broadcast(&spa->spa_scrub_cv);
+
+	/*
+	 * The scrub only counts as successful if the traverse ran to
+	 * completion and no error was recorded along the way.
+	 */
+	succeeded = (err == 0 && spa->spa_scrub_errors == 0);
+
+	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+	    spa->spa_scrub_maxtxg, succeeded ? "done" : "FAILED",
+	    err, spa->spa_scrub_errors, spa->spa_scrub_stop);
+
+	mutex_exit(&spa->spa_scrub_lock);
+
+	/*
+	 * On success the DTLs are reassessed up to spa_scrub_maxtxg; on
+	 * failure 0 is passed instead.  Either way the temporary scrub
+	 * DTLs are vacated.
+	 */
+	spa_config_enter(spa, RW_WRITER);
+	vdev_dtl_reassess(root, spa_last_synced_txg(spa) + 1,
+	    succeeded ? spa->spa_scrub_maxtxg : 0, B_TRUE);
+	spa_config_exit(spa);
+
+	spa_vdev_replace_done(spa);
+
+	spa_config_enter(spa, RW_READER);
+	vdev_scrub_stat_update(root, POOL_SCRUB_NONE, succeeded);
+	spa_config_exit(spa);
+
+	mutex_enter(&spa->spa_scrub_lock);
+
+	spa->spa_scrub_type = POOL_SCRUB_NONE;
+	spa->spa_scrub_active = 0;
+	spa->spa_scrub_thread = NULL;
+
+	cv_broadcast(&spa->spa_scrub_cv);
+
+	/* A restart request is honoured as the thread's final act. */
+	if (err == ERESTART)
+		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+
+	CALLB_CPR_EXIT(&cpr);	/* drops &spa->spa_scrub_lock */
+	thread_exit();
+}
+
+/*
+ * Request that scrubbing pause, and wait until it has: first until
+ * spa_scrub_active drops to zero, then until no scrub I/O remains in
+ * flight.  The request is a counter, so suspensions nest; it is
+ * decremented again by spa_scrub_resume().
+ */
+void
+spa_scrub_suspend(spa_t *spa)
+{
+	mutex_enter(&spa->spa_scrub_lock);
+
+	spa->spa_scrub_suspend++;
+
+	/* Keep signalling spa_scrub_cv until the scrub is inactive. */
+	while (spa->spa_scrub_active != 0) {
+		cv_broadcast(&spa->spa_scrub_cv);
+		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+	}
+
+	/* Then let the I/O that was already issued finish. */
+	while (spa->spa_scrub_inflight != 0)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Undo one spa_scrub_suspend().  When the suspend count reaches zero,
+ * waiters on spa_scrub_cv are woken.
+ */
+void
+spa_scrub_resume(spa_t *spa)
+{
+	mutex_enter(&spa->spa_scrub_lock);
+
+	ASSERT(spa->spa_scrub_suspend != 0);
+	spa->spa_scrub_suspend--;
+	if (spa->spa_scrub_suspend == 0)
+		cv_broadcast(&spa->spa_scrub_cv);
+
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+void
+spa_scrub_restart(spa_t *spa, uint64_t txg)
+{
+	/*
+	 * Something happened (e.g. snapshot create/delete) that means
+	 * we must restart any in-progress scrubs.  The itinerary will
+	 * fix this properly.
+	 *
+	 * Only the request is recorded here, under spa_scrub_lock.
+	 * spa_scrub_thread() checks spa_scrub_restart_txg and treats any
+	 * non-zero value as a request to restart, so a txg of 0 requests
+	 * nothing.
+	 */
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_restart_txg = txg;
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Start, replace, or stop a scrub.  The caller is expected to hold
+ * spa_scrub_lock; it is the mutex passed to cv_wait() below.
+ *
+ * Any scrub thread already running is asked to stop and waited for, and
+ * the previous traverse handle is torn down.  Unless 'type' is
+ * POOL_SCRUB_NONE or the txg range works out to be empty, a new traverse
+ * and a new scrub thread are then created.
+ *
+ * Returns ENOTSUP for an out-of-range 'type', EBUSY if a resilver is in
+ * progress and 'force' is not set, and 0 otherwise.
+ */
+static int
+spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	space_seg_t *seg;
+	uint64_t mintxg, maxtxg;
+	int adv = 0;
+
+	if ((uint_t)type >= POOL_SCRUB_TYPES)
+		return (ENOTSUP);
+
+	/*
+	 * Stop whatever scrub or resilver is currently running.
+	 * A resilver is only interrupted when 'force' is set.
+	 */
+	while (spa->spa_scrub_thread != NULL) {
+		if (!force && spa->spa_scrub_type == POOL_SCRUB_RESILVER)
+			return (EBUSY);
+
+		spa->spa_scrub_stop = 1;
+		cv_broadcast(&spa->spa_scrub_cv);
+		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
+	}
+
+	/* Discard the traverse handle left by the previous scrub. */
+	if (spa->spa_scrub_th != NULL) {
+		traverse_fini(spa->spa_scrub_th);
+		spa->spa_scrub_th = NULL;
+	}
+
+	spa->spa_scrub_stop = 0;
+	spa->spa_scrub_type = type;
+	spa->spa_scrub_restart_txg = 0;
+
+	/* Default traversal bounds; a resilver narrows them below. */
+	mintxg = TXG_INITIAL - 1;
+	maxtxg = spa_last_synced_txg(spa) + 1;
+
+	switch (type) {
+
+	case POOL_SCRUB_NONE:
+		break;
+
+	case POOL_SCRUB_RESILVER:
+		/*
+		 * Take the resilvering boundaries from the first and last
+		 * segments of the root vdev's DTL.
+		 *
+		 * Note: (mintxg, maxtxg) is an open interval,
+		 * i.e. mintxg and maxtxg themselves are not included.
+		 *
+		 * Note: maxtxg is capped at spa_last_synced_txg(spa) + 1
+		 * so that a txg that's still changing is never claimed
+		 * to have been resilvered.
+		 */
+		mutex_enter(&root->vdev_dtl_lock);
+		seg = avl_first(&root->vdev_dtl_map.sm_root);
+		mintxg = (seg != NULL) ? seg->ss_start - 1 : 0;
+		seg = avl_last(&root->vdev_dtl_map.sm_root);
+		maxtxg = (seg != NULL) ? seg->ss_end : 0;
+		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
+		mutex_exit(&root->vdev_dtl_lock);
+
+		adv = ADVANCE_PRE | ADVANCE_PRUNE;
+		break;
+
+	case POOL_SCRUB_EVERYTHING:
+		/*
+		 * A scrub is like a resilver, but not pruned by DTL.
+		 */
+		adv = ADVANCE_PRE;
+		break;
+	}
+
+	/* Nothing to launch for a stop request or an empty range. */
+	if (type == POOL_SCRUB_NONE || mintxg == 0 || maxtxg == 0)
+		return (0);
+
+	spa->spa_scrub_maxtxg = maxtxg;
+	spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
+	    adv, ZIO_FLAG_CANFAIL);
+	traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
+	spa->spa_scrub_thread = thread_create(NULL, 0,
+	    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+
+	return (0);
+}
+
+/*
+ * Public entry point for starting or stopping a scrub: runs
+ * spa_scrub_locked() under spa_scrub_lock and returns its result.
+ *
+ * If no traverse handle exists afterwards and the caller asked for
+ * something other than POOL_SCRUB_NONE, spa_vdev_replace_done() is
+ * called right away, after the lock has been dropped.
+ */
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+{
+	traverse_handle_t *handle;
+	int rc;
+
+	mutex_enter(&spa->spa_scrub_lock);
+	rc = spa_scrub_locked(spa, type, force);
+	handle = spa->spa_scrub_th;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	if (type != POOL_SCRUB_NONE && handle == NULL)
+		spa_vdev_replace_done(spa);
+
+	return (rc);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+/*
+ * Issue a free in 'txg' for every block pointer on spa_sync_bplist, wait
+ * for all of them to complete, and then vacate the list.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+{
+	bplist_t *deferred = &spa->spa_sync_bplist;
+	uint64_t cursor = 0;
+	uint8_t one = 1;
+	blkptr_t bp;
+	dmu_tx_t *tx;
+	zio_t *pio;
+	int error;
+
+	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
+
+	/* Hang one free off the root zio per entry on the list. */
+	for (;;) {
+		if (bplist_iterate(deferred, &cursor, &bp) != 0)
+			break;
+		zio_nowait(zio_free(pio, spa, txg, &bp, NULL, NULL));
+	}
+
+	error = zio_wait(pio);
+	ASSERT3U(error, ==, 0);
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+	bplist_vacate(deferred, tx);
+
+	/*
+	 * Write one byte at offset 0 of the bplist object so that its
+	 * first block is already dirty; this lets the sync converge
+	 * faster.  (Usually only the first block is needed.)
+	 */
+	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1,
+	    &one, tx);
+	dmu_tx_commit(tx);
+}
+
+/*
+ * If spa_dirty_list is non-empty, regenerate the pool config for the
+ * txg of 'tx', install it as the cached config, and write an XDR-packed
+ * copy at offset 0 of the config object.  The packed size is stored in
+ * that object's bonus buffer.  Does nothing if the list is empty.
+ */
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+	nvlist_t *config;
+	dmu_buf_t *bonus;
+	char *packed = NULL;
+	size_t nvsize = 0;
+
+	if (list_is_empty(&spa->spa_dirty_list))
+		return;
+
+	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
+	spa_config_set(spa, config);
+
+	/* Size the buffer first, then pack into it. */
+	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);
+	packed = kmem_alloc(nvsize, KM_SLEEP);
+	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+
+	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
+	    packed, tx);
+	kmem_free(packed, nvsize);
+
+	/* Record how many bytes of the object hold the packed nvlist. */
+	bonus = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+	dmu_buf_will_dirty(bonus, tx);
+	*(uint64_t *)bonus->db_data = nvsize;
+	dmu_buf_rele(bonus);
+}
+
+/*
+ * Sync the specified transaction group.  New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ *
+ * The config lock is held as reader for the whole sync.  In order:
+ *
+ *   1. Deferred frees are pushed, but only if this txg has dirty
+ *	datasets or dirty dirs.
+ *   2. The config object, the DSL pool, the dirty vdevs and the
+ *	deferred-free bplist are synced in passes until a pass finds no
+ *	dirty vdev.
+ *   3. The labels are rewritten, retrying until that succeeds.
+ *   4. spa_ubsync is updated from spa_uberblock with scrubbing suspended
+ *	and spa_traverse_lock held as writer.
+ *   5. ZIL records are cleaned and vdev_sync_done() is called for every
+ *	vdev on the TXG_CLEAN(txg) list.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	objset_t *mos = spa->spa_meta_objset;
+	bplist_t *bpl = &spa->spa_sync_bplist;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd;
+	dmu_tx_t *tx;
+	int dirty_vdevs;
+
+	/*
+	 * Lock out configuration changes.
+	 */
+	spa_config_enter(spa, RW_READER);
+
+	spa->spa_syncing_txg = txg;
+	spa->spa_sync_pass = 0;
+
+	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+
+	/*
+	 * If anything has changed in this txg, push the deferred frees
+	 * from the previous txg.  If not, leave them alone so that we
+	 * don't generate work on an otherwise idle system.
+	 */
+	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
+	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
+		spa_sync_deferred_frees(spa, txg);
+
+	/*
+	 * Iterate to convergence.
+	 *
+	 * Each pass counts the vdevs it pulled off spa_vdev_txg_list for
+	 * this txg; the loop repeats for as long as a pass synced at
+	 * least one.
+	 */
+	do {
+		spa->spa_sync_pass++;
+
+		tx = dmu_tx_create_assigned(dp, txg);
+		spa_sync_config_object(spa, tx);
+		dmu_tx_commit(tx);
+
+		dsl_pool_sync(dp, txg);
+
+		dirty_vdevs = 0;
+		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
+			vdev_sync(vd, txg);
+			dirty_vdevs++;
+		}
+
+		tx = dmu_tx_create_assigned(dp, txg);
+		bplist_sync(bpl, tx);
+		dmu_tx_commit(tx);
+
+	} while (dirty_vdevs);
+
+	bplist_close(bpl);
+
+	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+
+	/*
+	 * Rewrite the vdev configuration (which includes the uberblock)
+	 * to commit the transaction group.
+	 *
+	 * A non-zero return from spa_sync_labels() is retried without
+	 * limit: wait hz ticks, reopen the root vdev, and try again.
+	 */
+	while (spa_sync_labels(spa, txg)) {
+		dprintf("waiting for devices to heal\n");
+		delay(hz);
+		vdev_reopen(rvd, NULL);
+	}
+
+	/*
+	 * Make a stable copy of the fully synced uberblock.
+	 * We use this as the root for pool traversals.
+	 */
+	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
+
+	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
+
+	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
+	spa->spa_traverse_wanted = 0;
+	spa->spa_ubsync = spa->spa_uberblock;
+	rw_exit(&spa->spa_traverse_lock);
+
+	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
+
+	/*
+	 * Clean up the ZIL records for the synced txg.
+	 */
+	dsl_pool_zil_clean(dp);
+
+	/*
+	 * Update usable space statistics.
+	 */
+	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+		vdev_sync_done(vd, txg);
+
+	/*
+	 * It had better be the case that we didn't dirty anything
+	 * since spa_sync_labels().
+	 */
+	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+	ASSERT(bpl->bpl_queue == NULL);
+
+	spa_config_exit(spa);
+}
+
+/*
+ * Wait for every pool in POOL_STATE_ACTIVE to sync.  The namespace lock
+ * protects the walk, but it is not held while waiting: each pool is
+ * pinned with a reference, the lock is dropped for the wait, and the
+ * lock is retaken before the reference is released.
+ */
+void
+spa_sync_allpools(void)
+{
+	spa_t *spa;
+
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
+		if (spa_state(spa) == POOL_STATE_ACTIVE) {
+			spa_open_ref(spa, FTAG);
+			mutex_exit(&spa_namespace_lock);
+			txg_wait_synced(spa_get_dsl(spa), 0);
+			mutex_enter(&spa_namespace_lock);
+			spa_close(spa, FTAG);
+		}
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Return 1 if spa_active_count is non-zero, 0 otherwise.
+ */
+int
+spa_busy(void)
+{
+	if (spa_active_count == 0)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Remove all pools in the system.
+ *
+ * Each pass of the loop takes the first spa returned by spa_next(NULL),
+ * stops its scrub, unloads and deactivates it unless its state is
+ * POOL_STATE_UNINITIALIZED, and removes it from the namespace.  The loop
+ * ends once spa_next(NULL) returns NULL.  spa_namespace_lock is held
+ * throughout, except around the spa_scrub() call.
+ */
+void
+spa_evict_all(void)
+{
+	spa_t *spa;
+
+	/*
+	 * Remove all cached state.  All pools should be closed now,
+	 * so every spa in the AVL tree should be unreferenced.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	while ((spa = spa_next(NULL)) != NULL) {
+		/*
+		 * Stop all scrub and resilver activity.  spa_scrub() needs to
+		 * wait for the scrub thread, which may do a detach and sync the
+		 * configs, which needs spa_namespace_lock.  Drop the lock while
+		 * maintaining a hold on the spa_t.
+		 *
+		 * The hold is released again only after the lock has been
+		 * retaken.
+		 */
+		spa_open_ref(spa, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
+		mutex_enter(&spa_namespace_lock);
+		spa_close(spa, FTAG);
+
+		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+			spa_unload(spa);
+			spa_deactivate(spa);
+		}
+		spa_remove(spa);
+	}
+	mutex_exit(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
new file mode 100644
index 000000000000..abcd67ddb96b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -0,0 +1,308 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+
+/*
+ * Pool configuration repository.
+ *
+ * The configuration for all pools, in addition to being stored on disk, is
+ * stored in /kernel/drv/zpool.cache as a packed nvlist.  The kernel maintains
+ * this list as pools are created, destroyed, or modified.
+ *
+ * We have a single nvlist which holds all the configuration information.  When
+ * the module loads, we read this information from the cache and populate the
+ * SPA namespace.  This namespace is maintained independently in spa.c.
+ * Whenever the namespace is modified, or the configuration of a pool is
+ * changed, we call spa_config_sync(), which walks through all the active pools
+ * and writes the configuration to disk.
+ */
+
+static uint64_t spa_config_generation = 1;	/* bumped on each sync */
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.  It is the directory in which the cache
+ * file and its temporary copy are looked up.
+ */
+const char *spa_config_dir = ZPOOL_CACHE_DIR;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace.  It does not actually open or load the pools; it
+ * only populates the namespace.
+ *
+ * The cache file is opened relative to rootdir, read in full and unpacked
+ * into an nvlist.  Every nvlist-typed pair whose name is not already in
+ * the namespace gets a new spa_t holding a duplicate of that config.
+ * Nothing is reported to the caller: if the file cannot be opened, sized,
+ * read completely or unpacked, the namespace is simply left as it was.
+ */
+void
+spa_config_load(void)
+{
+	vnode_t *vp;
+	void *buf = NULL;
+	vattr_t vattr;
+	ssize_t resid;
+	nvlist_t *nvlist, *child;
+	nvpair_t *nvpair;
+	spa_t *spa;
+	char pathname[128];
+
+	/*
+	 * Open the configuration file.
+	 */
+	(void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir,
+	    ZPOOL_CACHE_FILE);
+	if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0,
+	    rootdir) != 0)
+		return;
+
+	/*
+	 * Read the nvlist from the file.  Only the file size is needed,
+	 * so request exactly that in va_mask instead of handing
+	 * VOP_GETATTR() an uninitialized request mask.
+	 */
+	vattr.va_mask = AT_SIZE;
+	if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+		goto out;
+
+	buf = kmem_alloc(vattr.va_size, KM_SLEEP);
+
+	if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid) != 0)
+		goto out;
+
+	/* A short read means the file cannot be trusted. */
+	if (resid != 0)
+		goto out;
+
+	/*
+	 * Unpack the nvlist.
+	 */
+	if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+		goto out;
+
+	/*
+	 * Iterate over all elements in the nvlist, creating a new spa_t for
+	 * each one with the specified configuration.  Pairs that are not
+	 * nvlists, and names already present in the namespace, are skipped.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	nvpair = NULL;
+	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+
+		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+			continue;
+
+		VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+
+		if (spa_lookup(nvpair_name(nvpair)) != NULL)
+			continue;
+		spa = spa_add(nvpair_name(nvpair));
+
+		/*
+		 * We blindly duplicate the configuration here.  If it's
+		 * invalid, we will catch it when the pool is first opened.
+		 */
+		VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0);
+	}
+	mutex_exit(&spa_namespace_lock);
+
+	nvlist_free(nvlist);
+
+out:
+	/* buf is still NULL if the size could not be obtained. */
+	if (buf != NULL)
+		kmem_free(buf, vattr.va_size);
+
+	(void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
+	VN_RELE(vp);
+}
+
+/*
+ * Synchronize all pools to disk.  This must be called with the namespace lock
+ * held.
+ *
+ * The cached config of every pool that has one, has a name, and has no
+ * alternate root is added to a single nvlist keyed by pool name.  That
+ * nvlist is packed with XDR encoding, written to ZPOOL_CACHE_TMP, synced,
+ * and renamed to ZPOOL_CACHE_FILE.  A failure to open, write, sync or
+ * rename is not reported to the caller.  On every path a removal of the
+ * temporary file is attempted and spa_config_generation is incremented.
+ */
+void
+spa_config_sync(void)
+{
+	spa_t *spa = NULL;
+	nvlist_t *config;
+	size_t buflen;
+	char *buf;
+	vnode_t *vp;
+	int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
+	char pathname[128];
+	char pathname2[128];
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+	/*
+	 * Add all known pools to the configuration list, ignoring those with
+	 * alternate root paths.
+	 */
+	spa = NULL;
+	while ((spa = spa_next(spa)) != NULL) {
+		mutex_enter(&spa->spa_config_cache_lock);
+		if (spa->spa_config && spa->spa_name && spa->spa_root == NULL)
+			VERIFY(nvlist_add_nvlist(config, spa->spa_name,
+			    spa->spa_config) == 0);
+		mutex_exit(&spa->spa_config_cache_lock);
+	}
+
+	/*
+	 * Pack the configuration into a buffer.
+	 */
+	VERIFY(nvlist_size(config, &buflen, NV_ENCODE_XDR) == 0);
+
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
+	VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+
+	/*
+	 * Write the configuration to disk.  We need to do the traditional
+	 * 'write to temporary file, sync, move over original' to make sure we
+	 * always have a consistent view of the data.
+	 *
+	 * The rename is attempted only if both the write and the fsync
+	 * succeeded.
+	 */
+	(void) snprintf(pathname, sizeof (pathname), "%s/%s", spa_config_dir,
+	    ZPOOL_CACHE_TMP);
+
+	if (vn_open(pathname, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) != 0)
+		goto out;
+
+	if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, NULL) == 0 &&
+	    VOP_FSYNC(vp, FSYNC, kcred) == 0) {
+		(void) snprintf(pathname2, sizeof (pathname2), "%s/%s",
+		    spa_config_dir, ZPOOL_CACHE_FILE);
+		(void) vn_rename(pathname, pathname2, UIO_SYSSPACE);
+	}
+
+	(void) VOP_CLOSE(vp, oflags, 1, 0, kcred);
+	VN_RELE(vp);
+
+out:
+	(void) vn_remove(pathname, UIO_SYSSPACE, RMFILE);
+	spa_config_generation++;
+
+	kmem_free(buf, buflen);
+	nvlist_free(config);
+}
+
+/*
+ * Sigh.  Inside a local zone, we don't have access to /kernel/drv/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ *
+ * 'generation' is both an input and an output.  If *generation already
+ * equals the current spa_config_generation, NULL is returned and nothing
+ * is allocated.  Otherwise a new nvlist is returned that maps each visible
+ * pool's name to its cached config, and *generation is updated to the
+ * current value.  In the global zone every pool is visible; elsewhere
+ * zone_dataset_visible() decides.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+	nvlist_t *pools;
+	spa_t *spa;
+
+	if (*generation == spa_config_generation)
+		return (NULL);
+
+	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+
+	spa = NULL;
+	mutex_enter(&spa_namespace_lock);
+	while ((spa = spa_next(spa)) != NULL) {
+		if (INGLOBALZONE(curproc) ||
+		    zone_dataset_visible(spa_name(spa), NULL)) {
+			/*
+			 * Skip a pool that has no cached config, as
+			 * spa_config_sync() does; adding a NULL nvlist
+			 * would fail and trip the VERIFY.
+			 */
+			mutex_enter(&spa->spa_config_cache_lock);
+			if (spa->spa_config != NULL)
+				VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
+				    spa->spa_config) == 0);
+			mutex_exit(&spa->spa_config_cache_lock);
+		}
+	}
+	mutex_exit(&spa_namespace_lock);
+
+	*generation = spa_config_generation;
+
+	return (pools);
+}
+
+/*
+ * Install 'config' as the pool's cached configuration and free whatever
+ * was cached before.  Both steps happen under spa_config_cache_lock.
+ * The pointer is stored as given; no copy is made.
+ */
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t *old;
+
+	mutex_enter(&spa->spa_config_cache_lock);
+	old = spa->spa_config;
+	spa->spa_config = config;
+	if (old != NULL)
+		nvlist_free(old);
+	mutex_exit(&spa->spa_config_cache_lock);
+}
+
+/*
+ * Build and return a new nvlist describing the pool, based on the
+ * current in-core state.
+ *
+ * 'vd' selects the scope.  NULL or the root vdev yields a config whose
+ * vdev tree is the whole pool.  Any other vdev yields a config that also
+ * carries that vdev's guid and its top-level vdev's guid, and whose vdev
+ * tree is that top-level vdev.
+ *
+ * 'txg' is the value stored as the pool txg.  -1 means "use the current
+ * spa_config_txg".  Any other non-zero value is also saved into
+ * spa_config_txg, but only when the scope is the root vdev.
+ *
+ * 'getstats' is passed through to vdev_config_generate().
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	nvlist_t *config;
+	nvlist_t *nvroot;
+
+	/* A NULL vdev is shorthand for the root vdev. */
+	if (vd == NULL)
+		vd = root;
+
+	if (txg == -1ULL) {
+		txg = spa->spa_config_txg;
+	} else if (vd == root && txg != 0) {
+		spa->spa_config_txg = txg;
+	}
+
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+
+	/* Pool-wide properties, present in every config. */
+	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+	    UBERBLOCK_VERSION) == 0);
+	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    spa_name(spa)) == 0);
+	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    spa_state(spa)) == 0);
+	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    txg) == 0);
+	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    spa_guid(spa)) == 0);
+
+	/* Identify the vdev, then widen the scope to its top-level vdev. */
+	if (vd != root) {
+		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+		    vd->vdev_top->vdev_guid) == 0);
+		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+		    vd->vdev_guid) == 0);
+		vd = vd->vdev_top;		/* label contains top config */
+	}
+
+	/* The local vdev tree is freed once it has been added. */
+	nvroot = vdev_config_generate(vd, getstats);
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+	nvlist_free(nvroot);
+
+	return (config);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
new file mode 100644
index 000000000000..c1b60175093e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * SPA locking
+ *
+ * There are four basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * 	This lock must be acquired to do any of the following:
+ *
+ * 		- Lookup a spa_t by name
+ * 		- Add or remove a spa_t from the namespace
+ * 		- Increase spa_refcount from non-zero
+ * 		- Check if spa_refcount is zero
+ * 		- Rename a spa_t
+ * 		- Held for the duration of create/destroy/import/export
+ *
+ * 	It does not need to handle recursion.  A create or destroy may
+ * 	reference objects (files or zvols) in other pools, but by
+ * 	definition they must have an existing reference, and will never need
+ * 	to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa refcount_t protected by mutex)
+ *
+ * 	This reference count keeps track of any active users of the spa_t.  The
+ * 	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
+ * 	the refcount is never really 'zero' - opening a pool implicitly keeps
+ * 	some references in the DMU.  Internally we check against SPA_MINREF, but
+ * 	present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock (per-spa crazy rwlock)
+ *
+ * 	This SPA special is a recursive rwlock, capable of being acquired from
+ * 	asynchronous threads.  It protects the spa_t from config changes,
+ * 	and must be held in the following circumstances:
+ *
+ * 		- RW_READER to perform I/O to the spa
+ * 		- RW_WRITER to change the vdev config
+ *
+ * spa_config_cache_lock (per-spa mutex)
+ *
+ * 	This mutex prevents the spa_config nvlist from being updated.  No
+ *      other locks are required to obtain this lock, although implicitly you
+ *      must have the namespace lock or non-zero refcount to have any kind
+ *      of spa_t pointer at all.
+ *
+ * spa_vdev_lock (global mutex)
+ *
+ * 	This special lock is a global mutex used to serialize attempts to
+ * 	access devices through ZFS.  It makes sure that we do not try to add
+ * 	a single vdev to multiple pools at the same time.  It must be held
+ * 	when adding or removing a device from the pool.
+ *
+ *
+ * The locking order is fairly straightforward:
+ *
+ * 		spa_namespace_lock	->	spa_refcount
+ *
+ * 	The namespace lock must be acquired to increase the refcount from 0
+ * 	or to check if it is zero.
+ *
+ * 		spa_refcount 		->	spa_config_lock
+ *
+ * 	There must be at least one valid reference on the spa_t to acquire
+ * 	the config lock.
+ *
+ * 		spa_vdev_lock		->	spa_config_lock
+ *
+ * 	There are no locks required for spa_vdev_lock, but it must be
+ * 	acquired before spa_config_lock.
+ *
+ *
+ * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
+ * are globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which
+ * require the spa_namespace_lock to be held.
+ *
+ * 	spa_lookup()		Lookup a spa_t by name.
+ *
+ * 	spa_add()		Create a new spa_t in the namespace.
+ *
+ * 	spa_remove()		Remove a spa_t from the namespace.  This also
+ * 				frees up any memory associated with the spa_t.
+ *
+ * 	spa_next()		Returns the next spa_t in the system, or the
+ * 				first if NULL is passed.
+ *
+ * 	spa_evict_all()		Shutdown and remove all spa_t structures in
+ * 				the system.
+ *
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * 	spa_open_ref()		Adds a reference to the given spa_t.  Must be
+ * 				called with spa_namespace_lock held if the
+ * 				refcount is currently zero.
+ *
+ * 	spa_close()		Remove a reference from the spa_t.  This will
+ * 				not free the spa_t or remove it from the
+ * 				namespace.  No locking is required.
+ *
+ * 	spa_refcount_zero()	Returns true if the refcount is currently
+ * 				zero.  Must be called with spa_namespace_lock
+ * 				held.
+ *
+ * The spa_config_lock is manipulated using the following functions:
+ *
+ * 	spa_config_enter()	Acquire the config lock as RW_READER or
+ * 				RW_WRITER.  At least one reference on the spa_t
+ * 				must exist.
+ *
+ * 	spa_config_exit()	Release the config lock.
+ *
+ * 	spa_config_held()	Returns true if the config lock is currently
+ * 				held in the given state.
+ *
+ * The spa_vdev_lock, while acquired directly, is hidden by the following
+ * functions, which imply additional semantics that must be followed:
+ *
+ * 	spa_vdev_enter()	Acquire the vdev lock and the config lock for
+ * 				writing.
+ *
+ * 	spa_vdev_exit()		Release the config lock, wait for all I/O
+ * 				to complete, release the vdev lock, and sync
+ * 				the updated configs to the cache.
+ *
+ * The spa_name() function also requires either the spa_namespace_lock
+ * or the spa_config_lock, as both are needed to do a rename.  spa_rename() is
+ * also implemented within this file since it requires manipulation of the
+ * namespace.
+ */
+
+static avl_tree_t spa_namespace_avl;	/* every spa_t, ordered by spa_name */
+kmutex_t spa_namespace_lock;		/* guards spa_namespace_avl */
+static kcondvar_t spa_namespace_cv;	/* broadcast by spa_remove() */
+
+kmem_cache_t *spa_buffer_pool;
+int spa_mode;				/* the 'mode' given to spa_init() */
+
+#ifdef ZFS_DEBUG
+int zfs_flags = ~0;			/* every flag bit set */
+#else
+int zfs_flags = 0;			/* every flag bit clear */
+#endif
+
+static kmutex_t spa_vdev_lock;		/* taken by spa_vdev_enter() */
+
+#define	SPA_MINREF	5	/* spa_refcount for an open-but-idle pool */
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+	avl_index_t where;
+	spa_t key;		/* only spa_name is read by the comparator */
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	/* The cast only drops const; spa_name_compare() just reads the name. */
+	key.spa_name = (char *)name;
+
+	return (avl_find(&spa_namespace_avl, &key, &where));
+}
+
+/*
+ * Create an uninitialized spa_t with the given name.  Requires
+ * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name)
+{
+	spa_t *nspa;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	/* Zero-filled, so every field not set below starts out 0/NULL. */
+	nspa = kmem_zalloc(sizeof (*nspa), KM_SLEEP);
+	refcount_create(&nspa->spa_refcount);
+	nspa->spa_freeze_txg = UINT64_MAX;
+	nspa->spa_state = POOL_STATE_UNINITIALIZED;
+	nspa->spa_name = spa_strdup(name);
+
+	/* The tree is keyed by spa_name, so insert only once it is set. */
+	avl_add(&spa_namespace_avl, nspa);
+
+	return (nspa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used.  Requires
+ * spa_namespace_lock.  This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+	ASSERT(spa->spa_scrub_thread == NULL);
+
+	avl_remove(&spa_namespace_avl, spa);
+	cv_broadcast(&spa_namespace_cv);	/* wake namespace waiters */
+
+	if (spa->spa_root)
+		spa_strfree(spa->spa_root);
+
+	if (spa->spa_name)
+		spa_strfree(spa->spa_name);
+
+	spa_config_set(spa, NULL);	/* frees any cached config nvlist */
+
+	refcount_destroy(&spa->spa_refcount);
+
+	kmem_free(spa, sizeof (spa_t));	/* spa is invalid from here on */
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none.  If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+	avl_tree_t *tree = &spa_namespace_avl;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	/* A NULL 'prev' starts the walk at the first pool in the tree. */
+	return (prev == NULL ? avl_first(tree) : AVL_NEXT(tree, prev));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t.  Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+	ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+	    MUTEX_HELD(&spa_namespace_lock));
+	/* 'tag' identifies this hold; presumably spa_close() passes it back. */
+	(void) refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t.  Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+	ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+	    MUTEX_HELD(&spa_namespace_lock));
+	/* Only the count drops here; the spa_t itself is never freed. */
+	(void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero.  Must be called with
+ * spa_namespace_lock held.  We really compare against SPA_MINREF, which is the
+ * number of references acquired when opening a pool
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	/* "Zero" means exactly SPA_MINREF references are outstanding. */
+	return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+}
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+
+/*
+ * Acquire the config lock.  The config lock is a special rwlock that allows for
+ * recursive enters.  Because these enters come from the same thread as well as
+ * asynchronous threads working on behalf of the owner, we must unilaterally
+ * allow all read access as long as at least one reader is held (even if a write
+ * is requested).  This has the side effect of write starvation, but write locks
+ * are extremely rare, and a solution to this problem would be significantly
+ * more complex (if even possible).
+ *
+ * We would like to assert that the namespace lock isn't held, but this is a
+ * valid use during create.
+ */
+void
+spa_config_enter(spa_t *spa, krw_t rw)
+{
+	spa_config_lock_t *scl = &spa->spa_config_lock;
+
+	mutex_enter(&scl->scl_lock);
+
+	if (scl->scl_writer != curthread) {	/* the writer never blocks */
+		if (rw == RW_READER) {
+			while (scl->scl_writer != NULL)
+				cv_wait(&scl->scl_cv, &scl->scl_lock);
+		} else {
+			while (scl->scl_writer != NULL || scl->scl_count > 0)
+				cv_wait(&scl->scl_cv, &scl->scl_lock);
+			scl->scl_writer = curthread;
+		}
+	}
+
+	scl->scl_count++;	/* one count per hold, reader or writer */
+
+	mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Release the spa config lock, notifying any waiters in the process.
+ */
+void
+spa_config_exit(spa_t *spa)
+{
+	spa_config_lock_t *scl = &spa->spa_config_lock;
+
+	mutex_enter(&scl->scl_lock);
+
+	ASSERT(scl->scl_count > 0);
+	if (--scl->scl_count == 0) {	/* last hold released */
+		cv_broadcast(&scl->scl_cv);
+		scl->scl_writer = NULL;  /* OK in either case */
+	}
+
+	mutex_exit(&scl->scl_lock);
+}
+
+/*
+ * Returns true if the config lock is held in the given manner.
+ */
+boolean_t
+spa_config_held(spa_t *spa, krw_t rw)
+{
+	spa_config_lock_t *lock = &spa->spa_config_lock;
+	boolean_t result;
+
+	/* RW_WRITER: this thread must be the writer. */
+	/* Otherwise: any outstanding hold, by any thread, counts. */
+	mutex_enter(&lock->scl_lock);
+	result = (rw == RW_WRITER) ? (lock->scl_writer == curthread) :
+	    (lock->scl_count != 0);
+	mutex_exit(&lock->scl_lock);
+
+	return (result);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.  This
+ * grabs the global spa_vdev_lock as well as the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+	mutex_enter(&spa_vdev_lock);
+	/* Lock order: spa_vdev_lock first, then the config lock. */
+	spa_config_enter(spa, RW_WRITER);
+
+	return (spa_last_synced_txg(spa) + 1);	/* first txg not yet synced */
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+	spa_config_exit(spa);
+
+	if (vd == spa->spa_root_vdev) {		/* spa_create() */
+		mutex_exit(&spa_vdev_lock);
+		return (error);
+	}
+
+	/*
+	 * Note: this txg_wait_synced() is important because it ensures
+	 * that there won't be more than one config change per txg.
+	 * This allows us to use the txg as the generation number.
+	 */
+	if (error == 0)
+		txg_wait_synced(spa->spa_dsl_pool, txg);
+
+	mutex_exit(&spa_vdev_lock);
+
+	if (vd != NULL) {
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+		vdev_free(vd);	/* a non-NULL, non-root vd is freed here */
+	}
+
+	/*
+	 * If we're in the middle of export or destroy, don't sync the
+	 * config -- it will do that anyway, and we deadlock if we try.
+	 */
+	if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_sync();
+		mutex_exit(&spa_namespace_lock);
+	}
+
+	return (error);		/* the caller's error, passed through */
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+/*
+ * Rename a spa_t.
+ */
+int
+spa_rename(const char *name, const char *newname)
+{
+	spa_t *spa;
+	int err;
+
+	/*
+	 * Lookup the spa_t and grab the config lock for writing.  We need to
+	 * actually open the pool so that we can sync out the necessary labels.
+	 * It's OK to call spa_open() with the namespace lock held because we
+	 * allow recursive calls for other reasons.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if ((err = spa_open(name, &spa, FTAG)) != 0) {
+		mutex_exit(&spa_namespace_lock);
+		return (err);
+	}
+
+	spa_config_enter(spa, RW_WRITER);
+	/* NOTE(review): newname is not checked for a collision here. */
+	avl_remove(&spa_namespace_avl, spa);
+	spa_strfree(spa->spa_name);
+	spa->spa_name = spa_strdup(newname);
+	avl_add(&spa_namespace_avl, spa);	/* re-insert under the new key */
+
+	/*
+	 * Sync all labels to disk with the new names by marking the root vdev
+	 * dirty and waiting for it to sync.  It will pick up the new pool name
+	 * during the sync.
+	 */
+	vdev_config_dirty(spa->spa_root_vdev);
+
+	spa_config_exit(spa);
+
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+
+	/*
+	 * Sync the updated config cache.
+	 */
+	spa_config_set(spa,
+	    spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0));
+	spa_config_sync();
+
+	spa_close(spa, FTAG);
+
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+
+/*
+ * Determine whether a pool with given pool_guid exists.  If device_guid is
+ * non-zero, determine whether the pool exists *and* contains a device with the
+ * specified device_guid.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+	avl_tree_t *tree = &spa_namespace_avl;
+	boolean_t took_lock = B_FALSE;
+	spa_t *spa;
+
+	/* Take the namespace lock only if this thread doesn't own it yet. */
+	if (mutex_owner(&spa_namespace_lock) != curthread) {
+		mutex_enter(&spa_namespace_lock);
+		took_lock = B_TRUE;
+	}
+
+	for (spa = avl_first(tree); spa != NULL; spa = AVL_NEXT(tree, spa)) {
+		if (spa->spa_state == POOL_STATE_UNINITIALIZED ||
+		    spa->spa_root_vdev == NULL || spa_guid(spa) != pool_guid)
+			continue;
+		if (device_guid == 0 ||
+		    vdev_lookup_by_guid(spa->spa_root_vdev, device_guid))
+			break;
+	}
+
+	if (took_lock)
+		mutex_exit(&spa_namespace_lock);
+
+	return (spa != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+	size_t size = strlen(s) + 1;	/* include the terminating NUL */
+	char *copy;
+
+	copy = kmem_alloc(size, KM_SLEEP);
+	bcopy(s, copy, size - 1);
+	copy[size - 1] = '\0';
+
+	/* Release with spa_strfree(), which recomputes this same size. */
+	return (copy);
+}
+
+void
+spa_strfree(char *s)
+{
+	kmem_free(s, strlen(s) + 1);	/* same size spa_strdup() allocated */
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+	uint64_t bits;
+
+	ASSERT(range != 0);		/* would otherwise divide by zero */
+
+	(void) random_get_pseudo_bytes((void *)&bits, sizeof (bits));
+
+	return (bits % range);		/* always in [0, range) */
+}
+
+void
+sprintf_blkptr(char *buf, blkptr_t *bp)
+{
+	dva_t *dva;
+
+	/* Format bp into buf; buf is unbounded, so the caller must size it. */
+	if (bp == NULL) {
+		(void) sprintf(buf, "<NULL>");
+		return;
+	}
+
+	if (BP_IS_HOLE(bp)) {
+		(void) sprintf(buf, "<hole>");
+		return;
+	}
+
+	/* XXBP - Need to see if we want all DVAs or not */
+	dva = BP_IDENTITY(bp);	/* taken only once bp is known non-NULL */
+
+	(void) sprintf(buf, "[L%llu %s] vdev=%llu offset=%llx "
+	    "size=%llxL/%llxP/%llxA %s %s %s %s",
+	    (u_longlong_t)BP_GET_LEVEL(bp),
+	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
+	    (u_longlong_t)DVA_GET_VDEV(dva), (u_longlong_t)DVA_GET_OFFSET(dva),
+	    (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp),
+	    (u_longlong_t)DVA_GET_ASIZE(dva),
+	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
+	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
+	    DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang");
+
+	(void) sprintf(buf + strlen(buf), " birth=%llu fill=%llu"
+	    " cksum=%llx:%llx:%llx:%llx",
+	    (u_longlong_t)bp->blk_birth, (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_cksum.zc_word[0],
+	    (u_longlong_t)bp->blk_cksum.zc_word[1],
+	    (u_longlong_t)bp->blk_cksum.zc_word[2],
+	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+void
+spa_freeze(spa_t *spa)
+{
+	uint64_t freeze_txg = 0;	/* stays 0 if already frozen */
+
+	spa_config_enter(spa, RW_WRITER);
+	if (spa->spa_freeze_txg == UINT64_MAX) {	/* not yet frozen */
+		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+		spa->spa_freeze_txg = freeze_txg;
+	}
+	spa_config_exit(spa);
+	if (freeze_txg != 0)	/* wait only if this call set the txg */
+		txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+krwlock_t *
+spa_traverse_rwlock(spa_t *spa)
+{
+	return (&spa->spa_traverse_lock);	/* not acquired here */
+}
+
+int
+spa_traverse_wanted(spa_t *spa)
+{
+	return (spa->spa_traverse_wanted);	/* read without any lock */
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+	return (spa->spa_dsl_pool);	/* zero-filled (NULL) by spa_add() */
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+	return (&spa->spa_ubsync.ub_rootbp);	/* the spa_ubsync copy */
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+	spa->spa_uberblock.ub_rootbp = *bp;	/* by value; not spa_ubsync */
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+	if (spa->spa_root == NULL)
+		buf[0] = '\0';		/* no alternate root: report "" */
+	else	/* strlcpy() NUL-terminates (buflen > 0); strncpy() did not */
+		(void) strlcpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+	return (spa->spa_sync_pass);	/* read without any lock */
+}
+
+char *
+spa_name(spa_t *spa)
+{
+	/*
+	 * Accessing the name requires holding either the namespace lock or the
+	 * config lock, both of which are required to do a rename.
+	 */
+	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+	    spa_config_held(spa, RW_READER) || spa_config_held(spa, RW_WRITER));
+
+	return (spa->spa_name);		/* the live string, not a copy */
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+	return (spa->spa_root_vdev->vdev_guid);	/* root vdev must exist */
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+	return (spa->spa_ubsync.ub_txg);	/* txg of the spa_ubsync copy */
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+	return (spa->spa_first_txg);	/* read without any lock */
+}
+
+int
+spa_state(spa_t *spa)
+{
+	return (spa->spa_state);	/* a POOL_STATE_* value */
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+	return (spa->spa_freeze_txg);	/* UINT64_MAX: never frozen */
+}
+
+/*
+ * In the future, this may select among different metaslab classes
+ * depending on the zdp.  For now, there's no such distinction.
+ */
+metaslab_class_t *
+spa_metaslab_class_select(spa_t *spa)
+{
+	return (spa->spa_normal_class);	/* the only choice made today */
+}
+
+/*
+ * Return pool-wide allocated space (the root vdev's vs_alloc statistic).
+ */
+uint64_t
+spa_get_alloc(spa_t *spa)
+{
+	return (spa->spa_root_vdev->vdev_stat.vs_alloc);
+}
+
+/*
+ * Return pool-wide total space (the root vdev's vs_space statistic).
+ */
+uint64_t
+spa_get_space(spa_t *spa)
+{
+	return (spa->spa_root_vdev->vdev_stat.vs_space);
+}
+
+/* ARGSUSED */
+uint64_t
+spa_get_asize(spa_t *spa, uint64_t lsize)
+{
+	/*
+	 * For now, the worst case is 512-byte RAID-Z blocks, in which
+	 * case the space requirement is exactly 2x; so just assume that.
+	 */
+	return (lsize << 1);		/* i.e. 2 * lsize; spa is unused */
+}
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+	const char *n1 = ((const spa_t *)a1)->spa_name;
+	const char *n2 = ((const spa_t *)a2)->spa_name;
+	int order;
+
+	order = strcmp(n1, n2);
+	/*
+	 * strcmp() may return any positive or negative value; fold it to
+	 * exactly -1, 0 or 1 for use as the namespace AVL comparator.
+	 */
+	return ((order > 0) - (order < 0));
+}
+
+void
+spa_init(int mode)
+{
+	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+	/* NOTE(review): spa_vdev_lock gets no mutex_init() here -- confirm. */
+	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+	    offsetof(spa_t, spa_avl));
+
+	spa_mode = mode;
+
+	refcount_init();
+	unique_init();
+	zio_init();
+	dmu_init();
+	zil_init();
+	spa_config_load();	/* runs last, after the *_init() calls above */
+}
+
+void
+spa_fini(void)
+{
+	spa_evict_all();	/* empty the namespace before teardown */
+	/* Reverse of spa_init(); note there is no unique_fini() call. */
+	zil_fini();
+	dmu_fini();
+	zio_fini();
+	refcount_fini();
+
+	avl_destroy(&spa_namespace_avl);
+
+	cv_destroy(&spa_namespace_cv);
+	mutex_destroy(&spa_namespace_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
new file mode 100644
index 000000000000..25f66bf94b6a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+
+/*
+ * Space map routines.
+ * NOTE: caller is responsible for all locking.
+ */
+static int
+space_map_seg_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *a = x1;
+	const space_seg_t *b = x2;
+
+	/*
+	 * Segments that overlap (or start at the same offset) compare equal,
+	 * so an avl_find() of a range returns a segment intersecting it.
+	 * Otherwise order by start offset.
+	 */
+	if (a->ss_start == b->ss_start)
+		return (0);
+
+	if (a->ss_start < b->ss_start)
+		return (a->ss_end > b->ss_start ? 0 : -1);
+	return (a->ss_start < b->ss_end ? 0 : 1);
+}
+
+void
+space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint64_t shift,
+	kmutex_t *lp)
+{
+	sm->sm_start = start;
+	sm->sm_size = size;
+	sm->sm_end = start + size;	/* exclusive upper bound */
+	sm->sm_shift = shift;
+	sm->sm_space = 0;		/* no segments yet */
+	sm->sm_lock = lp;	/* caller's lock; map ops assert it held */
+	avl_create(&sm->sm_root, space_map_seg_compare,
+	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
+}
+
+void
+space_map_destroy(space_map_t *sm)
+{
+	VERIFY3U(sm->sm_space, ==, 0);	/* map must already be empty */
+	avl_destroy(&sm->sm_root);
+}
+
+void
+space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	avl_index_t where;
+	space_seg_t ssearch, *ss_before, *ss_after, *ss;
+	uint64_t end = start + size;	/* exclusive end of the new range */
+	int merge_before, merge_after;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(size != 0);
+	VERIFY3U(start, >=, sm->sm_start);
+	VERIFY3U(end, <=, sm->sm_end);
+	VERIFY(sm->sm_space + size <= sm->sm_size);
+	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+	ssearch.ss_start = start;
+	ssearch.ss_end = end;
+	ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+	/* The comparator treats overlap as equality, so any hit is an overlap */
+	VERIFY(ss == NULL);
+
+	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
+	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
+
+	merge_before = (ss_before != NULL && ss_before->ss_end == start);
+	merge_after = (ss_after != NULL && ss_after->ss_start == end);
+
+	if (merge_before && merge_after) {	/* bridges two segments */
+		avl_remove(&sm->sm_root, ss_before);
+		ss_after->ss_start = ss_before->ss_start;
+		kmem_free(ss_before, sizeof (*ss_before));
+	} else if (merge_before) {		/* grow predecessor upward */
+		ss_before->ss_end = end;
+	} else if (merge_after) {		/* grow successor downward */
+		ss_after->ss_start = start;
+	} else {				/* abuts nothing: new segment */
+		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+		ss->ss_start = start;
+		ss->ss_end = end;
+		avl_insert(&sm->sm_root, ss, where);
+	}
+
+	sm->sm_space += size;
+}
+
+void
+space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	avl_index_t where;
+	space_seg_t ssearch, *ss, *newseg;
+	uint64_t end = start + size;	/* exclusive end of the range */
+	int left_over, right_over;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(size != 0);
+	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+	ssearch.ss_start = start;
+	ssearch.ss_end = end;
+	ss = avl_find(&sm->sm_root, &ssearch, &where);
+
+	/* Make sure we completely overlap with someone */
+	VERIFY(ss != NULL);
+	VERIFY3U(ss->ss_start, <=, start);
+	VERIFY3U(ss->ss_end, >=, end);
+	VERIFY(sm->sm_space - size <= sm->sm_size);	/* catches underflow */
+
+	left_over = (ss->ss_start != start);
+	right_over = (ss->ss_end != end);
+
+	if (left_over && right_over) {		/* hole in the middle: split */
+		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+		newseg->ss_start = end;
+		newseg->ss_end = ss->ss_end;
+		ss->ss_end = start;
+		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+	} else if (left_over) {			/* trim the tail */
+		ss->ss_end = start;
+	} else if (right_over) {		/* trim the head */
+		ss->ss_start = end;
+	} else {				/* exact match: drop it */
+		avl_remove(&sm->sm_root, ss);
+		kmem_free(ss, sizeof (*ss));
+	}
+
+	sm->sm_space -= size;
+}
+
+int
+space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	space_seg_t key, *hit;
+	avl_index_t where;
+	uint64_t end = start + size;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY(size != 0);
+	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
+	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
+
+	key.ss_start = start;
+	key.ss_end = end;
+	hit = avl_find(&sm->sm_root, &key, &where);	/* any overlap */
+
+	return (hit != NULL && hit->ss_start <= start && hit->ss_end >= end);
+}
+
+void
+space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+	space_seg_t *ss;
+	void *cookie = NULL;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	/* Tear the tree down node by node, offering each range to 'func'. */
+	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+		if (func != NULL)	/* the callback is optional */
+			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+		kmem_free(ss, sizeof (*ss));
+	}
+	sm->sm_space = 0;	/* every segment is gone */
+}
+
+void
+space_map_iterate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
+{
+	space_seg_t *ss;
+	/* NOTE(review): unlike its siblings, no ASSERT that sm_lock is held. */
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
+}
+
+void
+space_map_merge(space_map_t *src, space_map_t *dest)
+{
+	space_map_vacate(src, space_map_add, dest);	/* src ends up empty */
+}
+
+void
+space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	avl_index_t where;
+	space_seg_t *ss, search;
+	uint64_t end = start + size;
+	uint64_t rm_start, rm_end;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	search.ss_start = start;
+	search.ss_end = start;	/* empty probe positioned at 'start' */
+
+	for (;;) {	/* re-search: each removal reshapes the tree */
+		ss = avl_find(t, &search, &where);
+		/* No segment covers 'start': take the next one above it. */
+		if (ss == NULL)
+			ss = avl_nearest(t, where, AVL_AFTER);
+
+		if (ss == NULL || ss->ss_start >= end)
+			break;
+		/* Clip to [start, end) and remove only that part. */
+		rm_start = MAX(ss->ss_start, start);
+		rm_end = MIN(ss->ss_end, end);
+
+		space_map_remove(sm, rm_start, rm_end - rm_start);
+	}
+}
+
+/*
+ * Replace smd with the union of smd and sms.
+ */
+void
+space_map_union(space_map_t *smd, space_map_t *sms)
+{
+	avl_tree_t *t = &sms->sm_root;
+	space_seg_t *ss;
+
+	ASSERT(MUTEX_HELD(smd->sm_lock));
+	/* NOTE(review): sms is walked without asserting its own lock. */
+	/*
+	 * For each source segment, remove any intersections with the
+	 * destination, then add the source segment to the destination.
+	 */
+	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
+		space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+		space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
+	}
+}
+
+int
+space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
+	objset_t *os, uint64_t end, uint64_t space)
+{
+	uint64_t *entry, *entry_map, *entry_map_end;
+	uint64_t bufsize, size, offset;
+	uint64_t mapstart = sm->sm_start;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	VERIFY3U(sm->sm_space, ==, 0);
+
+	bufsize = MIN(end, SPACE_MAP_CHUNKSIZE);
+	entry_map = kmem_alloc(bufsize, KM_SLEEP);
+
+	if (maptype == SM_FREE) {	/* begin with the whole range */
+		space_map_add(sm, sm->sm_start, sm->sm_size);
+		space = sm->sm_size - space;	/* arg was the complement */
+	}
+
+	for (offset = 0; offset < end; offset += bufsize) {
+		size = MIN(end - offset, bufsize);
+		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
+		VERIFY(size != 0);
+
+		dprintf("object=%llu  offset=%llx  size=%llx\n",
+		    smo->smo_object, offset, size);
+		dmu_read(os, smo->smo_object, offset, size, entry_map);
+
+		entry_map_end = entry_map + (size / sizeof (uint64_t));
+		for (entry = entry_map; entry < entry_map_end; entry++) {
+			uint64_t e = *entry;
+
+			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
+				continue;
+			/* Same type: add the run; other type: remove it. */
+			(SM_TYPE_DECODE(e) == maptype ?
+			    space_map_add : space_map_remove)(sm,
+			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
+			    SM_RUN_DECODE(e) << sm->sm_shift);
+		}
+	}
+	VERIFY3U(sm->sm_space, ==, space);
+
+	kmem_free(entry_map, bufsize);
+
+	return (0);	/* no failure path */
+}
+
+void
+space_map_sync(space_map_t *sm, space_map_t *dest, space_map_obj_t *smo,
+    uint8_t maptype, objset_t *os, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_objset_spa(os);
+	void *cookie = NULL;
+	space_seg_t *ss;
+	uint64_t bufsize, start, size, run_len;
+	uint64_t *entry, *entry_map, *entry_map_end;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	if (sm->sm_space == 0)		/* nothing to write out */
+		return;
+
+	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
+	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
+	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
+	    sm->sm_space);
+
+	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
+	bufsize = MIN(bufsize, SPACE_MAP_CHUNKSIZE);
+	entry_map = kmem_alloc(bufsize, KM_SLEEP);
+	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+	entry = entry_map;
+	/* Lead with one debug entry recording action, sync pass and txg. */
+	*entry++ = SM_DEBUG_ENCODE(1) |
+	    SM_DEBUG_ACTION_ENCODE(maptype) |
+	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
+	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
+		size = ss->ss_end - ss->ss_start;
+		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+
+		if (dest)	/* optional: mirror the range into dest */
+			space_map_add(dest, ss->ss_start, size);
+
+		sm->sm_space -= size;
+		size >>= sm->sm_shift;	/* bytes -> sm_shift units */
+
+		while (size) {
+			run_len = MIN(size, SM_RUN_MAX);
+
+			if (entry == entry_map_end) {	/* buffer full */
+				dmu_write(os, smo->smo_object, smo->smo_objsize,
+				    bufsize, entry_map, tx);
+				smo->smo_objsize += bufsize;
+				entry = entry_map;
+			}
+
+			*entry++ = SM_OFFSET_ENCODE(start) |
+			    SM_TYPE_ENCODE(maptype) |
+			    SM_RUN_ENCODE(run_len);
+
+			start += run_len;
+			size -= run_len;
+		}
+		kmem_free(ss, sizeof (*ss));
+	}
+
+	if (entry != entry_map) {	/* flush the partial buffer */
+		size = (entry - entry_map) * sizeof (uint64_t);
+		dmu_write(os, smo->smo_object, smo->smo_objsize,
+		    size, entry_map, tx);
+		smo->smo_objsize += size;
+	}
+
+	kmem_free(entry_map, bufsize);
+
+	VERIFY3U(sm->sm_space, ==, 0);
+}
+
+void
+space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
+    dmu_tx_t *tx)
+{
+	uint64_t oldsize = smo->smo_objsize;	/* only for the dprintf below */
+	/* The old contents are freed; sm is then logged as pure SM_ALLOC. */
+	dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx);
+
+	smo->smo_objsize = 0;	/* space_map_sync() appends from here */
+
+	VERIFY3U(sm->sm_space, ==, smo->smo_alloc);
+	space_map_sync(sm, NULL, smo, SM_ALLOC, os, tx);
+
+	dprintf("write sm object %llu from %llu to %llu bytes in txg %llu\n",
+	    smo->smo_object, oldsize, smo->smo_objsize, dmu_tx_get_txg(tx));
+}
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
new file mode 100644
index 000000000000..b11cd42b6dc7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ARC_H
+#define	_SYS_ARC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/zio.h>
+
+typedef struct arc_buf_hdr arc_buf_hdr_t;
+typedef struct arc_buf arc_buf_t;
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
+typedef void arc_byteswap_func_t(void *buf, size_t size);
+
+/* generic arc_done_func_t's which you can use */
+arc_done_func_t arc_bcopy_func;
+arc_done_func_t arc_getbuf_func;
+
+struct arc_buf {
+	arc_buf_hdr_t		*b_hdr;
+	arc_buf_t		*b_next;
+	void			*b_data;
+};
+
+/*
+ * These are the flags we pass into calls to the arc
+ */
+#define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */
+#define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */
+#define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
+
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
+void arc_buf_free(arc_buf_t *buf, void *tag);
+int arc_buf_size(arc_buf_t *buf);
+void arc_release(arc_buf_t *buf, void *tag);
+int arc_released(arc_buf_t *buf);
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags);
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t arc_flags);
+int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private, uint32_t arc_flags);
+int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+
+void arc_flush(void);
+void arc_tempreserve_clear(uint64_t tempreserve);
+int arc_tempreserve_space(uint64_t tempreserve);
+
+void arc_init(void);
+void arc_fini(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h
new file mode 100644
index 000000000000..0933cb977b60
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_BPLIST_H
+#define	_SYS_BPLIST_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_phys {
+	/*
+	 * This is the bonus buffer for the dead lists.  The object's
+	 * contents are an array of bpl_entries blkptr_t's, representing
+	 * a total of bpl_bytes physical space.
+	 */
+	uint64_t	bpl_entries;
+	uint64_t	bpl_bytes;
+} bplist_phys_t;
+
+typedef struct bplist_q {
+	blkptr_t	bpq_blk;
+	void		*bpq_next;
+} bplist_q_t;
+
+typedef struct bplist {
+	kmutex_t	bpl_lock;
+	objset_t	*bpl_mos;
+	uint64_t	bpl_object;
+	int		bpl_blockshift;
+	int		bpl_bpshift;
+	bplist_q_t	*bpl_queue;
+	bplist_phys_t	*bpl_phys;
+	dmu_buf_t	*bpl_dbuf;
+	dmu_buf_t	*bpl_cached_dbuf;
+} bplist_t;
+
+extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
+extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
+extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern void bplist_close(bplist_t *bpl);
+extern boolean_t bplist_empty(bplist_t *bpl);
+extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
+extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
new file mode 100644
index 000000000000..3cf45f5985af
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -0,0 +1,302 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DBUF_H
+#define	_SYS_DBUF_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	DB_BONUS_BLKID (-1ULL)
+#define	IN_DMU_SYNC ((blkptr_t *)-1)
+
+/*
+ * define flags for dbuf_read and friends
+ */
+
+#define	DB_RF_MUST_SUCCEED	0
+#define	DB_RF_CANFAIL		(1 << 1)
+#define	DB_RF_HAVESTRUCT	(1 << 2)
+#define	DB_RF_NOPREFETCH	(1 << 3)
+
+/*
+ * The state transition diagram for dbufs looks like:
+ *
+ *		+----> READ ----+
+ *		|		|
+ *		|		V
+ *   (alloc)-->UNCACHED	     CACHED-->(free)
+ *		|		^
+ *		|		|
+ *		+----> FILL ----+
+ */
+typedef enum dbuf_states {
+	DB_UNCACHED,
+	DB_FILL,
+	DB_READ,
+	DB_CACHED
+} dbuf_states_t;
+
+struct objset_impl;
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+#define	LIST_LINK_INACTIVE(link) \
+	((link)->list_next == NULL && (link)->list_prev == NULL)
+
+typedef struct dmu_buf_impl {
+	/*
+	 * The following members are immutable, with the exception of
+	 * db.db_data, which is protected by db_mtx.
+	 */
+
+	/* the publicly visible structure */
+	dmu_buf_t db;
+
+	/* the objset we belong to */
+	struct objset_impl *db_objset;
+
+	/*
+	 * the dnode we belong to (NULL when evicted)
+	 */
+	struct dnode *db_dnode;
+
+	/*
+	 * our parent buffer; if the dnode points to us directly,
+	 * db_parent == db_dnode->dn_dbuf
+	 * only accessed by sync thread ???
+	 * (NULL when evicted)
+	 */
+	struct dmu_buf_impl *db_parent;
+
+	/*
+	 * link for hash table of all dmu_buf_impl_t's
+	 */
+	struct dmu_buf_impl *db_hash_next;
+
+	/* our block number */
+	uint64_t db_blkid;
+
+	/*
+	 * Pointer to the blkptr_t which points to us. May be NULL if we
+	 * don't have one yet. (NULL when evicted)
+	 */
+	blkptr_t *db_blkptr;
+
+	/*
+	 * Our indirection level.  Data buffers have db_level==0.
+	 * Indirect buffers which point to data buffers have
+	 * db_level==1. etc.  Buffers which contain dnodes have
+	 * db_level==0, since the dnodes are stored in a file.
+	 */
+	uint8_t db_level;
+
+	/* db_mtx protects the members below */
+	kmutex_t db_mtx;
+
+	/*
+	 * Current state of the buffer
+	 */
+	dbuf_states_t db_state;
+
+	/*
+	 * Refcount accessed by dmu_buf_{hold,rele}.
+	 * If nonzero, the buffer can't be destroyed.
+	 * Protected by db_mtx.
+	 */
+	refcount_t db_holds;
+
+	/* buffer holding our data */
+	arc_buf_t *db_buf;
+
+	kcondvar_t db_changed;
+	arc_buf_t *db_data_pending;
+
+	/*
+	 * Last time (transaction group) this buffer was dirtied.
+	 */
+	uint64_t db_dirtied;
+
+	/*
+	 * If db_dnode != NULL, our link on the owner dnode's dn_dbufs list.
+	 * Protected by its dn_mtx.
+	 */
+	list_node_t db_link;
+
+	/* Our link on dn_dirty_dbufs[txg] */
+	list_node_t db_dirty_node[TXG_SIZE];
+	uint8_t db_dirtycnt;
+
+	/*
+	 * Data which is unique to data (leaf) blocks:
+	 */
+	struct {
+		/* stuff we store for the user (see dmu_buf_set_user) */
+		void *db_user_ptr;
+		void **db_user_data_ptr_ptr;
+		dmu_buf_evict_func_t *db_evict_func;
+		uint8_t db_immediate_evict;
+		uint8_t db_freed_in_flight;
+
+		/*
+		 * db_data_old[txg&TXG_MASK] is set when we
+		 * dirty the buffer, so that we can retain the
+		 * pointer even if it gets COW'd in a subsequent
+		 * transaction group.
+		 *
+		 * If the buffer is dirty in any txg, it can't
+		 * be destroyed.
+		 */
+		/*
+		 * XXX Protected by db_mtx and dn_dirty_mtx.
+		 * db_mtx must be held to read db_dirty[], and
+		 * both db_mtx and dn_dirty_mtx must be held to
+		 * modify (dirty or clean). db_mtx must be held
+		 * before dn_dirty_mtx.
+		 */
+		arc_buf_t *db_data_old[TXG_SIZE];
+		blkptr_t *db_overridden_by[TXG_SIZE];
+	} db_d;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define	DBUF_MUTEXES 256
+#define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+	uint64_t hash_table_mask;
+	dmu_buf_impl_t **hash_table;
+	kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+    void *tag);
+dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+    void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db);
+
+dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+
+void dbuf_read(dmu_buf_impl_t *db);
+int dbuf_read_canfail(dmu_buf_impl_t *db);
+void dbuf_read_havestruct(dmu_buf_impl_t *db);
+void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx);
+void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
+
+void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
+    struct dmu_tx *);
+
+void dbuf_downgrade(dmu_buf_impl_t *db, int evicting);
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc doesn't
+ * support that preprocessor token.
+ */
+#define	dprintf_dbuf(dbuf, fmt, ...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char __db_buf[32]; \
+	uint64_t __db_obj = (dbuf)->db.db_object; \
+	if (__db_obj == DMU_META_DNODE_OBJECT) \
+		(void) strcpy(__db_buf, "mdn"); \
+	else \
+		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+		    (u_longlong_t)__db_obj); \
+	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+	    "obj=%s lvl=%u blkid=%lld " fmt, \
+	    __db_buf, (dbuf)->db_level, \
+	    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+
+#define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char __blkbuf[200];					\
+	sprintf_blkptr(__blkbuf, bp);				\
+	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
+	} \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define	dprintf_dbuf(db, fmt, ...)
+#define	dprintf_dbuf_bp(db, bp, fmt, ...)
+
+#endif
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
new file mode 100644
index 000000000000..f51ab89a90ac
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -0,0 +1,635 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DMU_H
+#define	_SYS_DMU_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA.  That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct uio;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+typedef enum dmu_object_type {
+	DMU_OT_NONE,
+	/* general: */
+	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
+	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
+	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
+	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
+	DMU_OT_BPLIST,			/* UINT64 */
+	DMU_OT_BPLIST_HDR,		/* UINT64 */
+	/* spa: */
+	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
+	DMU_OT_SPACE_MAP,		/* UINT64 */
+	/* zil: */
+	DMU_OT_INTENT_LOG,		/* UINT64 */
+	/* dmu: */
+	DMU_OT_DNODE,			/* DNODE */
+	DMU_OT_OBJSET,			/* OBJSET */
+	/* dsl: */
+	DMU_OT_DSL_DATASET,		/* UINT64 */
+	DMU_OT_DSL_DATASET_CHILD_MAP,	/* ZAP */
+	DMU_OT_DSL_OBJSET_SNAP_MAP,	/* ZAP */
+	DMU_OT_DSL_PROPS,		/* ZAP */
+	DMU_OT_DSL_OBJSET,		/* UINT64 */
+	/* zpl: */
+	DMU_OT_ZNODE,			/* ZNODE */
+	DMU_OT_ACL,			/* ACL */
+	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
+	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
+	DMU_OT_MASTER_NODE,		/* ZAP */
+	DMU_OT_DELETE_QUEUE,		/* ZAP */
+	/* zvol: */
+	DMU_OT_ZVOL,			/* UINT8 */
+	DMU_OT_ZVOL_PROP,		/* ZAP */
+	/* other; for testing only! */
+	DMU_OT_PLAIN_OTHER,		/* UINT8 */
+	DMU_OT_UINT64_OTHER,		/* UINT64 */
+	DMU_OT_ZAP_OTHER,		/* ZAP */
+
+	DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+	DMU_OST_NONE,
+	DMU_OST_META,
+	DMU_OST_ZFS,
+	DMU_OST_ZVOL,
+	DMU_OST_OTHER,			/* For testing only! */
+	DMU_OST_ANY,			/* Be careful! */
+	DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define	DS_MODE_NONE		0	/* invalid, to aid debugging */
+#define	DS_MODE_STANDARD	1	/* normal access, no special needs */
+#define	DS_MODE_PRIMARY		2	/* the "main" access, e.g. a mount */
+#define	DS_MODE_EXCLUSIVE	3	/* exclusive access, e.g. to destroy */
+#define	DS_MODE_LEVELS		4
+#define	DS_MODE_LEVEL(x)	((x) & (DS_MODE_LEVELS - 1))
+#define	DS_MODE_READONLY	0x8
+#define	DS_MODE_IS_READONLY(x)	((x) & DS_MODE_READONLY)
+#define	DS_MODE_RESTORE		0x10
+#define	DS_MODE_IS_RESTORE(x)	((x) & DS_MODE_RESTORE)
+
+#define	DS_FIND_SNAPSHOTS	0x01
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define	DMU_MAX_ACCESS (10<<20) /* 10MB */
+
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+    objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+    objset_t *clone_parent,
+    void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+int dmu_objset_rename(const char *name, const char *newname);
+void dmu_objset_set_quota(objset_t *os, uint64_t quota);
+uint64_t dmu_objset_get_quota(objset_t *os);
+int dmu_objset_request_reservation(objset_t *os, uint64_t reservation);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+    int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+typedef struct dmu_buf {
+	uint64_t db_object;		/* object that this buffer is part of */
+	uint64_t db_offset;		/* byte offset in this object */
+	uint64_t db_size;		/* size of buffer in bytes */
+	void *db_data;			/* data in buffer */
+} dmu_buf_t;
+
+typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
+
+/*
+ * Callback function to perform byte swapping on a block.
+ */
+typedef void dmu_byteswap_func_t(void *buf, size_t size);
+
+#define	DMU_POOL_DIRECTORY_OBJECT	1
+#define	DMU_POOL_CONFIG			"config"
+#define	DMU_POOL_ROOT_DATASET		"root_dataset"
+#define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
+
+/*
+ * Allocate an object from this objset.  The range of object numbers
+ * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg.  The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns its number.
+ *
+ * dmu_object_claim() allocates a specific object number.  If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * The routines returning int return 0 on success, or ENOSPC or EEXIST.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out.  It will be updated to be the next
+ * object which is allocated.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first.  If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size.  If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+    int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode.  The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+    dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode.  The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+    dmu_tx_t *tx);
+
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data.  As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
+ * buffer as well.  You must release your hold with dmu_buf_rele().
+ */
+dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object);
+dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag);
+int dmu_bonus_max(void);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory.  You must release the hold with
+ * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
+ * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data.  The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset);
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_remove_ref(dmu_buf_t *db, void* tag);
+void dmu_buf_rele(dmu_buf_t *db);
+void dmu_buf_rele_tag(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object.  A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array.  The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length, int *numbufs);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs);
+
+/*
+ * Returns NULL on success, or the existing user ptr if it's already
+ * been set.
+ *
+ * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ *
+ * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
+ * will be set to db->db_data when you are allowed to access it.  Note
+ * that db->db_data (the pointer) can change when you do dmu_buf_read(),
+ * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
+ * *user_data_ptr_ptr will be set to the new value when it changes.
+ *
+ * If non-NULL, pageout func will be called when this buffer is being
+ * excised from the cache, so that you can clean up the data structure
+ * pointed to by user_ptr.
+ *
+ * dmu_evict_user() will call the pageout func for all buffers in an
+ * objset with a given pageout func.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *pageout_func);
+/*
+ * set_user_ie is the same as set_user, but request immediate eviction
+ * when hold count goes to zero.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
+    void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
+    void *user_ptr, void *user_data_ptr_ptr,
+    dmu_buf_evict_func_t *pageout_func);
+void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+
+void dmu_buf_hold_data(dmu_buf_t *db);
+void dmu_buf_rele_data(dmu_buf_t *db);
+
+/*
+ * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to read the buffer's data (db_data).
+ *
+ * This routine will read the data from disk if necessary.
+ *
+ * The canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_buf_read(dmu_buf_t *db);
+int dmu_buf_read_canfail(dmu_buf_t *db);
+void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs);
+int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()).  The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * Indicate that you are going to modify the entire contents of the
+ * buffer's data ("fill" it).
+ *
+ * This routine is the same as dmu_buf_will_dirty, except that it won't
+ * read the contents off the disk, so the contents may be uninitialized
+ * and you must overwrite it.
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()).  The buffer's object must be held in the tx (ie.
+ * you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction.  Then you must assign
+ * the transaction to a transaction group.  Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction.  You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned.  You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define	DMU_NEW_OBJECT	(-1ULL)
+#define	DMU_OBJECT_END	(-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+    uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+
+/*
+ * Free up the data blocks for a defined range of a file.  If size is
+ * zero, the range from offset to end-of-file is freed.
+ */
+void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+	uint64_t size, dmu_tx_t *tx);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	void *buf);
+int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset,
+	uint64_t size, void *buf);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	const void *buf, dmu_tx_t *tx);
+int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    struct uio *uio, dmu_tx_t *tx);
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t len);
+
+typedef struct dmu_object_info {
+	/* All sizes are in bytes. */
+	uint32_t doi_data_block_size;
+	uint32_t doi_metadata_block_size;
+	uint64_t doi_bonus_size;
+	dmu_object_type_t doi_type;
+	dmu_object_type_t doi_bonus_type;
+	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
+	uint8_t doi_checksum;
+	uint8_t doi_compress;
+	uint8_t doi_pad[5];
+	/* Values below are number of 512-byte blocks. */
+	uint64_t doi_physical_blks;		/* data + metadata */
+	uint64_t doi_max_block_offset;
+} dmu_object_info_t;
+
+typedef struct dmu_object_type_info {
+	dmu_byteswap_func_t	*ot_byteswap;
+	boolean_t		ot_metadata;
+	char			*ot_name;
+} dmu_object_type_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+    u_longlong_t *nblk512);
+
+/*
+ * Get the maximum nonzero offset in the object (ie. this offset and all
+ * offsets following are zero).
+ *
+ * XXX Perhaps integrate this with dmu_object_info(), although that
+ * would then have to bring in the indirect blocks.
+ */
+uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object);
+
+typedef struct dmu_objset_stats {
+	dmu_objset_type_t dds_type;
+	uint8_t dds_is_snapshot;
+	uint8_t dds_is_placeholder;
+	uint8_t dds_pad[2];
+
+	uint64_t dds_creation_time;
+	uint64_t dds_creation_txg;
+
+	char dds_clone_of[MAXNAMELEN];
+
+	/* How much data is there in this objset? */
+
+	/*
+	 * Space referenced, taking into account pending writes and
+	 * frees.  Only relevant to filesystems and snapshots (not
+	 * collections).
+	 */
+	uint64_t dds_space_refd;
+
+	/*
+	 * Space "used", taking into account pending writes and frees, and
+	 * children's reservations (in bytes).  This is the amount of
+	 * space that will be freed if this and all dependent items are
+	 * destroyed (eg. child datasets, objsets, and snapshots).  So
+	 * for snapshots, this is the amount of space unique to this
+	 * snapshot.
+	 */
+	uint64_t dds_space_used;
+
+	/*
+	 * Compressed and uncompressed bytes consumed.  Does not take
+	 * into account reservations.  Used for computing compression
+	 * ratio.
+	 */
+	uint64_t dds_compressed_bytes;
+	uint64_t dds_uncompressed_bytes;
+
+	/*
+	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
+	 * collisions.  The ds_guid is a 64-bit ID that will never
+	 * change, so there is a small probability that it will collide.
+	 */
+	uint64_t dds_fsid_guid;
+	uint64_t dds_guid;
+
+	uint64_t dds_objects_used;	/* number of objects used */
+	uint64_t dds_objects_avail;	/* number of objects available */
+
+	uint64_t dds_num_clones; /* number of clones of this */
+
+	/* The dataset's administratively-set quota, in bytes. */
+	uint64_t dds_quota;
+
+	/* The dataset's administratively-set reservation, in bytes */
+	uint64_t dds_reserved;
+
+	/*
+	 * The amount of additional space that this dataset can consume.
+	 * Takes into account quotas & reservations.
+	 * (Assuming that no other datasets consume it first.)
+	 */
+	uint64_t dds_available;
+
+	/*
+	 * Various properties.
+	 */
+	uint64_t dds_compression;
+	uint64_t dds_checksum;
+	uint64_t dds_zoned;
+	char dds_compression_setpoint[MAXNAMELEN];
+	char dds_checksum_setpoint[MAXNAMELEN];
+	char dds_zoned_setpoint[MAXNAMELEN];
+	char dds_altroot[MAXPATHLEN];
+
+	/* The following are for debugging purposes only */
+	uint64_t dds_last_txg;
+	uint64_t dds_dir_obj;
+	uint64_t dds_objset_obj;
+	uint64_t dds_clone_of_obj;
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *id, uint64_t *offp);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* XXX */
+
+/*
+ * Synchronous write.
+ * On success returns 0 and fills in the blk pointed at by bp.
+ * Note that while the data covered by this function will be on stable
+ * storage when the function returns this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
+    struct blkptr *bp, uint64_t txg);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+    uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+    uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+    dmu_traverse_cb_t cb, void *arg);
+
+int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
+int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+    struct vnode *vp, uint64_t voffset);
+
+/* CRC64 table */
+#define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DMU_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
new file mode 100644
index 000000000000..b6e8b62ec25b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DMU_IMPL_H
+#define	_SYS_DMU_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/txg_impl.h>
+#include <sys/zio.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the locking strategy for the DMU.  Numbers in parentheses are
+ * cases that use that lock order, referenced below:
+ *
+ * ARC is self-contained
+ * bplist is self-contained
+ * refcount is self-contained
+ * txg is self-contained (hopefully!)
+ * zst_lock
+ * zf_rwlock
+ *
+ * XXX try to improve evicting path?
+ *
+ * dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
+ * 	dn_dbufs_mtx > hash_mutexes > db_mtx > leafs
+ *
+ * dp_config_rwlock
+ *    must be held before: everything
+ *    protects dd namespace changes
+ *    protects property changes globally
+ *    held from:
+ *    	dsl_dir_open/r:
+ *    	dsl_dir_create_sync/w:
+ *    	dsl_dir_sync_destroy/w:
+ *    	dsl_dir_rename_sync/w:
+ *    	dsl_prop_changed_notify/r:
+ *
+ * os_obj_lock
+ *   must be held before:
+ *   	everything except dp_config_rwlock
+ *   protects os_obj_next
+ *   held from:
+ *   	dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ *
+ * dn_struct_rwlock
+ *   must be held before:
+ *   	everything except dp_config_rwlock and os_obj_lock
+ *   protects structure of dnode (eg. nlevels)
+ *   	db_blkptr can change when syncing out change to nlevels
+ *   	dn_maxblkid
+ *   	dn_nlevels
+ *   	dn_*blksz*
+ *   	phys nlevels, maxblkid, physical blkptr_t's (?)
+ *   held from:
+ *   	callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ *   	dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ *   	dmu_tx_count_free:
+ *   	dbuf_read_impl: db_mtx, dmu_zfetch()
+ *   	dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ *   	dbuf_new_size: db_mtx
+ *   	dbuf_dirty: db_mtx
+ *	dbuf_findbp: (callers, phys? - the real need)
+ *	dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
+ *	dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
+ *	dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp()
+ *	dnode_sync/w (increase_indirection): db_mtx (phys)
+ *	dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*)
+ *	dnode_new_blkid/w: (dn_maxblkid)
+ *	dnode_free_range/w: dn_dirty_mtx (dn_maxblkid)
+ *	dnode_next_offset: (phys)
+ *
+ * dn_dbufs_mtx
+ *    must be held before:
+ *    	db_mtx, hash_mutexes
+ *    protects:
+ *    	dn_dbufs
+ *    	dn_evicted
+ *    held from:
+ *    	dmu_evict_user: db_mtx (dn_dbufs)
+ *    	dbuf_free_range: db_mtx (dn_dbufs)
+ *    	dbuf_remove_ref: db_mtx, callees:
+ *    		dbuf_hash_remove: hash_mutexes, db_mtx
+ *    	dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ *    	dnode_set_blksz: (dn_dbufs)
+ *
+ * hash_mutexes (global)
+ *   must be held before:
+ *   	db_mtx
+ *   protects dbuf_hash_table (global) and db_hash_next
+ *   held from:
+ *   	dbuf_find: db_mtx
+ *   	dbuf_hash_insert: db_mtx
+ *   	dbuf_hash_remove: db_mtx
+ *
+ * db_mtx (meta-leaf)
+ *   must be held before:
+ *   	dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ *   protects:
+ *   	db_state
+ * 	db_holds
+ * 	db_buf
+ * 	db_changed
+ * 	db_data_pending
+ * 	db_dirtied
+ * 	db_link
+ * 	db_dirty_node (??)
+ * 	db_dirtycnt
+ * 	db_d.*
+ * 	db.*
+ *   held from:
+ * 	dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * 	dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * 	dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * 	dbuf_undirty: dn_dirty_mtx (db_d)
+ * 	dbuf_write_done: dn_dirty_mtx (db_state)
+ * 	dbuf_*
+ * 	dmu_buf_update_user: none (db_d)
+ * 	dmu_evict_user: none (db_d) (maybe can eliminate)
+ *   	dbuf_find: none (db_holds)
+ *   	dbuf_hash_insert: none (db_holds)
+ *   	dmu_buf_read_array_impl: none (db_state, db_changed)
+ *   	dmu_sync: none (db_dirty_node, db_d)
+ *   	dnode_reallocate: none (db)
+ *
+ * dn_mtx (leaf)
+ *   protects:
+ *   	dn_dirty_dbufs
+ *   	dn_ranges
+ *   	phys accounting
+ * 	dn_allocated_txg
+ * 	dn_free_txg
+ * 	dn_assigned_txg
+ * 	dn_assigned_tx
+ * 	dn_notxholds
+ * 	dn_dirtyctx
+ * 	dn_dirtyctx_firstset
+ * 	(dn_phys copy fields?)
+ * 	(dn_phys contents?)
+ *   held from:
+ *   	dnode_*
+ *   	dbuf_dirty: none
+ *   	dbuf_sync: none (phys accounting)
+ *   	dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ *   	dbuf_write_done: none (phys accounting)
+ *   	dmu_object_info_from_dnode: none (accounting)
+ *   	dmu_tx_commit: none
+ *   	dmu_tx_hold_object_impl: none
+ *   	dmu_tx_try_assign: dn_notxholds(cv)
+ *   	dmu_tx_unassign: none
+ *
+ * dd_lock (leaf)
+ *    protects:
+ *    	dd_prop_cbs
+ *    	dd_sync_*
+ *    	dd_used_bytes
+ *    	dd_tempreserved
+ *    	dd_space_towrite
+ *    	dd_myname
+ *    	dd_phys accounting?
+ *    held from:
+ *    	dsl_dir_*
+ *    	dsl_prop_changed_notify: none (dd_prop_cbs)
+ *    	dsl_prop_register: none (dd_prop_cbs)
+ *    	dsl_prop_unregister: none (dd_prop_cbs)
+ *    	dsl_dataset_block_freeable: none (dd_sync_*)
+ *
+ * os_lock (leaf)
+ *   protects:
+ *   	os_dirty_dnodes
+ *   	os_free_dnodes
+ *   	os_dnodes
+ *   	os_downgraded_dbufs
+ *   	dn_dirtyblksz
+ *   	dn_dirty_link
+ *   held from:
+ *   	dnode_create: none (os_dnodes)
+ *   	dnode_destroy: none (os_dnodes)
+ *   	dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ *   	dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ *
+ * ds_lock (leaf)
+ *    protects:
+ *    	ds_user_ptr
+ *    	ds_user_evict_func
+ *    	ds_open_refcount
+ *    	ds_snapname
+ *    	ds_phys accounting
+ *    held from:
+ *    	dsl_dataset_*
+ *
+ */
+
+struct objset;
+struct dmu_pool;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DMU_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
new file mode 100644
index 000000000000..d0a77fcfb9e6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DMU_OBJSET_H
+#define	_SYS_DMU_OBJSET_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dmu_tx;
+struct objset_impl;
+
+typedef struct objset_phys {	/* laid out to total 1024 bytes via os_pad */
+	dnode_phys_t os_meta_dnode;	/* dnode_phys_t of the meta dnode */
+	zil_header_t os_zil_header;	/* embedded ZIL header */
+	uint64_t os_type;	/* presumably a dmu_objset_type_t -- confirm */
+	char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+	    sizeof (uint64_t)];	/* 1024 minus the three fields above */
+} objset_phys_t;
+
+struct objset {
+	struct objset_impl *os;
+	int os_mode;
+};
+
+typedef struct objset_impl {
+	/* Immutable: */
+	struct dsl_dataset *os_dsl_dataset;
+	spa_t *os_spa;
+	objset_phys_t *os_phys;
+	dnode_t *os_meta_dnode;
+	zilog_t *os_zil;
+	objset_t os;
+	uint8_t os_checksum;	/* can change, under dsl_dir's locks */
+	uint8_t os_compress;	/* can change, under dsl_dir's locks */
+	uint8_t os_md_checksum;
+	uint8_t os_md_compress;
+
+	/* no lock needed: */
+	struct dmu_tx *os_synctx; /* XXX sketchy */
+	blkptr_t os_rootbp;
+
+	/* Protected by os_obj_lock */
+	kmutex_t os_obj_lock;
+	uint64_t os_obj_next;
+
+	/* Protected by os_lock */
+	kmutex_t os_lock;
+	list_t os_dirty_dnodes[TXG_SIZE];
+	list_t os_free_dnodes[TXG_SIZE];
+	list_t os_dnodes;
+	list_t os_downgraded_dbufs;
+} objset_impl_t;
+
+#define	DMU_PRIVATE_OBJECT		(1ULL << 63)
+
+#define	DMU_META_DNODE_OBJECT		(1ULL << 63)
+
+/* XXX rename this to DMU_IS_DNODE_OBJECT? */
+#define	IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT)
+
+/* called from zpl */
+int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
+    objset_t **osp);
+void dmu_objset_close(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type,
+    objset_t *clone_parent,
+    void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
+int dmu_objset_destroy(const char *name);
+int dmu_objset_rollback(const char *name);
+void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
+void dmu_objset_find(char *name, void func(char *, void *), void *arg,
+    int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+
+/* called from dsl */
+void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+    dmu_objset_type_t type, dmu_tx_t *tx);
+objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds,
+    blkptr_t *bp);
+void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_OBJSET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
new file mode 100644
index 000000000000..7087912e0081
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DMU_TRAVERSE_H
+#define	_SYS_DMU_TRAVERSE_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/arc.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ADVANCE_POST	0		/* post-order traversal */
+#define	ADVANCE_PRE	0x01		/* pre-order traversal */
+#define	ADVANCE_PRUNE	0x02		/* prune by prev snapshot birth time */
+#define	ADVANCE_DATA	0x04		/* read user data blocks */
+#define	ADVANCE_HOLES	0x08		/* visit holes */
+#define	ADVANCE_NOLOCK	0x10		/* Don't grab SPA sync lock */
+
+#define	ZB_NO_LEVEL	-2		/* NOTE(review): bare negative, no parens */
+#define	ZB_MAXLEVEL	32		/* Next power of 2 >= DN_MAX_LEVELS */
+#define	ZB_MAXBLKID	(1ULL << 62)	/* presumably a zb_blkid bound -- confirm */
+#define	ZB_MAXOBJSET	(1ULL << 62)	/* presumably a zb_objset bound -- confirm */
+#define	ZB_MAXOBJECT	(1ULL << 62)	/* presumably a zb_object bound -- confirm */
+
+#define	ZB_MOS_CACHE	0	/* presumably a th_cache[] row index -- confirm */
+#define	ZB_MDN_CACHE	1	/* presumably a th_cache[] row index -- confirm */
+#define	ZB_DN_CACHE	2	/* presumably a th_cache[] row index -- confirm */
+#define	ZB_DEPTH	3	/* first dimension of th_cache[][] */
+
+typedef struct zbookmark {
+	uint64_t	zb_objset;
+	uint64_t	zb_object;
+	int		zb_level;
+	uint64_t	zb_blkid;
+} zbookmark_t;
+
+typedef struct zseg {
+	uint64_t	seg_mintxg;
+	uint64_t	seg_maxtxg;
+	zbookmark_t	seg_start;
+	zbookmark_t	seg_end;
+	list_node_t	seg_node;
+} zseg_t;
+
+typedef struct traverse_blk_cache {
+	zbookmark_t	bc_bookmark;
+	blkptr_t	bc_blkptr;
+	void		*bc_data;
+	dnode_phys_t	*bc_dnode;
+	int		bc_errno;
+	int		bc_pad1;
+	uint64_t	bc_pad2;
+} traverse_blk_cache_t;
+
+typedef int (blkptr_cb_t)(traverse_blk_cache_t *bc, spa_t *spa, void *arg);
+
+struct traverse_handle {	/* state behind traverse_init()/traverse_fini() */
+	spa_t		*th_spa;	/* spa given to traverse_init(), presumably */
+	blkptr_cb_t	*th_func;	/* callback of type blkptr_cb_t */
+	void		*th_arg;	/* presumably handed to th_func as arg */
+	int		th_advance;	/* presumably ADVANCE_* flags -- confirm */
+	int		th_zio_flags;	/* presumably flags for zio calls -- confirm */
+	list_t		th_seglist;	/* presumably zseg_t's via seg_node -- confirm */
+	traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+	uint64_t	th_hits;	/* counter; exact meaning not shown here */
+	uint64_t	th_arc_hits;	/* counter; exact meaning not shown here */
+	uint64_t	th_reads;	/* counter; exact meaning not shown here */
+	uint64_t	th_callbacks;	/* counter; exact meaning not shown here */
+	uint64_t	th_syncs;	/* counter; exact meaning not shown here */
+	uint64_t	th_restarts;	/* counter; exact meaning not shown here */
+	zbookmark_t	th_noread;	/* bookmark; use not visible in this header */
+	zbookmark_t	th_lastcb;	/* presumably bookmark of last callback */
+};
+
+int traverse_dsl_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+    int advance, blkptr_cb_t func, void *arg);
+
+traverse_handle_t *traverse_init(spa_t *spa, blkptr_cb_t *func, void *arg,
+    int advance, int zio_flags);
+void traverse_fini(traverse_handle_t *th);
+
+void traverse_add_dnode(traverse_handle_t *th,
+    uint64_t mintxg, uint64_t maxtxg, uint64_t objset, uint64_t object);
+void traverse_add_objset(traverse_handle_t *th,
+    uint64_t mintxg, uint64_t maxtxg, uint64_t objset);
+void traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg);
+
+int traverse_more(traverse_handle_t *th);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
new file mode 100644
index 000000000000..5d2f1127ce75
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DMU_TX_H
+#define	_SYS_DMU_TX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf_impl;
+struct dnode_link;
+struct dsl_pool;
+struct dnode;
+struct dsl_dir;
+
+struct dmu_tx {
+	/*
+	 * No synchronization is needed because a tx can only be handled
+	 * by one thread.
+	 */
+	list_t tx_holds; /* list of dmu_tx_hold_t */
+	objset_t *tx_objset;
+	struct dsl_dir *tx_dir;
+	struct dsl_pool *tx_pool;
+	uint64_t tx_txg;
+	txg_handle_t tx_txgh;
+	uint64_t tx_space_towrite;
+	refcount_t tx_space_written;
+	uint64_t tx_space_tofree;
+	refcount_t tx_space_freed;
+	uint64_t tx_space_tooverwrite;
+	void *tx_tempreserve_cookie;
+	uint8_t tx_anyobj;
+	uint8_t tx_privateobj;
+#ifdef ZFS_DEBUG
+	char *tx_debug_buf;
+	int tx_debug_len;
+#endif
+};
+
+enum dmu_tx_hold_type {
+	THT_NEWOBJECT,
+	THT_WRITE,
+	THT_BONUS,
+	THT_FREE,
+	THT_ZAP,
+	THT_SPACE,
+	THT_NUMTYPES
+};
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+    uint64_t arg1, uint64_t arg2);
+
+
+typedef struct dmu_tx_hold {
+	list_node_t dth_node;
+	struct dnode *dth_dnode;
+	enum dmu_tx_hold_type dth_type;
+	dmu_tx_hold_func_t dth_func;
+	uint64_t dth_arg1;
+	uint64_t dth_arg2;
+	/* XXX track what the actual estimates were for this hold */
+} dmu_tx_hold_t;
+
+
+/*
+ * These routines are defined in dmu.h, and are called by the user.
+ */
+dmu_tx_t *dmu_tx_create(objset_t *dd);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_abort(dmu_tx_t *tx);
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * These routines are defined in dmu_spa.h, and are called by the SPA.
+ */
+extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * These routines are only called by the DMU.
+ */
+dmu_tx_t *dmu_tx_create_ds(dsl_dir_t *dd);
+int dmu_tx_is_syncing(dmu_tx_t *tx);
+int dmu_tx_private_ok(dmu_tx_t *tx);
+void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object);
+void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta);
+void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db);
+int dmu_tx_holds(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space);
+
+#ifdef ZFS_DEBUG
+
+extern int dmu_use_tx_debug_bufs;
+
+/* Format into the tx's 4K debug buffer (never past its end), else dprintf(). */
+#define	dprintf_tx(tx, fmt, ...) \
+	if (dmu_use_tx_debug_bufs) \
+	do { \
+	char *__bufp; \
+	int __len, __n; \
+	if ((tx)->tx_debug_buf == NULL) { \
+		(tx)->tx_debug_buf = kmem_zalloc(4096, KM_SLEEP); \
+		(tx)->tx_debug_len = 4096; \
+	} \
+	__len = (tx)->tx_debug_len; \
+	__bufp = &(tx)->tx_debug_buf[4096 - __len]; \
+	__n = snprintf(__bufp, __len, fmt, __VA_ARGS__); /* wanted length */ \
+	(tx)->tx_debug_len -= (__n < 0) ? 0 : (__n < __len) ? __n : __len - 1; \
+_NOTE(CONSTCOND) } while (0); \
+	else dprintf(fmt, __VA_ARGS__)
+
+#else
+
+#define	dprintf_tx(tx, fmt, ...)
+
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DMU_TX_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
new file mode 100644
index 000000000000..35466d6874d5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_DFETCH_H
+#define	_DFETCH_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern uint64_t	zfetch_array_rd_sz;
+
+struct dnode;				/* so we can reference dnode */
+
+typedef enum zfetch_dirn {
+	ZFETCH_FORWARD = 1,		/* prefetch increasing block numbers */
+	ZFETCH_BACKWARD	= -1		/* prefetch decreasing block numbers */
+} zfetch_dirn_t;
+
+typedef struct zstream {	/* one prefetch stream; see zfetch_t.zf_stream */
+	uint64_t	zst_offset;	/* offset of starting block in range */
+	uint64_t	zst_len;	/* length of range, in blocks */
+	zfetch_dirn_t	zst_direction;	/* direction of prefetch */
+	uint64_t	zst_stride;	/* length of stride, in blocks */
+	uint64_t	zst_ph_offset;	/* prefetch offset, in blocks */
+	uint64_t	zst_cap;	/* prefetch limit (cap), in blocks */
+	kmutex_t	zst_lock;	/* protects stream */
+	clock_t		zst_last;	/* lbolt of last prefetch */
+	avl_node_t	zst_node;	/* avl node, though zf_stream is a list_t (XXX) */
+} zstream_t;
+
+typedef struct zfetch {	/* per-dnode prefetch state (dnode_t.dn_zfetch) */
+	krwlock_t	zf_rwlock;	/* protects zfetch structure */
+	list_t		zf_stream;	/* list of zstream_t's (not an AVL tree) */
+	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
+	uint32_t	zf_stream_cnt;	/* # of active streams */
+	uint64_t	zf_alloc_fail;	/* # of failed attempts to alloc strm */
+} zfetch_t;
+
+void		dmu_zfetch_init(zfetch_t *, struct dnode *);
+void		dmu_zfetch_rele(zfetch_t *);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _DFETCH_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
new file mode 100644
index 000000000000..2a5ef92b52b6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DNODE_H
+#define	_SYS_DNODE_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>
+#include <sys/dmu_zfetch.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Flags.
+ */
+#define	DNODE_MUST_BE_ALLOCATED	1
+#define	DNODE_MUST_BE_FREE	2
+
+/*
+ * Fixed constants.
+ */
+#define	DNODE_SHIFT		9	/* 512 bytes */
+#define	DN_MIN_INDBLKSHIFT	10	/* 1k */
+#define	DN_MAX_INDBLKSHIFT	14	/* 16k */
+#define	DNODE_BLOCK_SHIFT	14	/* 16k */
+#define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
+#define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
+#define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define	DNODE_SIZE	(1 << DNODE_SHIFT)
+#define	DN_MAX_NBLKPTR	((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+
+#define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
+#define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+#define	DN_META_DNODE_LEVELS	\
+	(1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT -	\
+	DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)
+
+/* The +2 here is a cheesy way to round up */
+#define	DN_MAX_LEVELS	(2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
+	(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
+
+#define	DN_MAX_OBJECT		\
+	((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT +	\
+	(DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
+
+#define	DN_BONUS(dnp)	((void*)((dnp)->dn_bonus + \
+	(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+#define	EPB(blkshift, typeshift)	(1 << (blkshift - typeshift))
+
+struct dmu_buf_impl;
+struct objset_impl;
+struct zio;
+
+enum dnode_dirtycontext {
+	DN_UNDIRTIED,
+	DN_DIRTY_OPEN,
+	DN_DIRTY_SYNC
+};
+
+typedef struct dnode_phys {	/* DNODE_SIZE-byte dnode image, presumably */
+	uint8_t dn_type;		/* dmu_object_type_t */
+	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
+	uint8_t dn_nlevels;		/* 1=dn_blkptr->data blocks */
+	uint8_t dn_nblkptr;		/* length of dn_blkptr */
+	uint8_t dn_bonustype;		/* type of data in bonus buffer */
+	uint8_t	dn_checksum;		/* ZIO_CHECKSUM type */
+	uint8_t	dn_compress;		/* ZIO_COMPRESS type */
+	uint8_t dn_pad1[1];		/* padding */
+	uint16_t dn_datablkszsec;	/* data block size in 512b sectors */
+	uint16_t dn_bonuslen;		/* length of dn_bonus */
+	uint8_t dn_pad2[4];		/* padding */
+
+	/* accounting lock is dn_mtx per dmu_impl.h (XXX: was dn_dirty_mtx) */
+	uint64_t dn_maxblkid;		/* largest allocated block ID */
+	uint64_t dn_secphys;		/* 512b sectors of disk space used */
+
+	uint64_t dn_pad3[4];	/* fields so far total 64 bytes (DNODE_CORE_SIZE) */
+
+	blkptr_t dn_blkptr[1];	/* dn_nblkptr entries; extras presumably overlay */
+	uint8_t dn_bonus[DN_MAX_BONUSLEN];	/* data starts at DN_BONUS(dnp) */
+} dnode_phys_t;
+
+typedef struct dnode {
+	/*
+	 * lock ordering:
+	 *
+	 * db_mtx > dn_dirty_mtx
+	 * 	dbuf_syncdone
+	 *
+	 * dn_struct_rwlock/r > dn_dirty_mtx
+	 * 	dmu_object_info
+	 *
+	 * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
+	 * 	dbuf_dirty
+	 * 	dbuf_setdirty
+	 *
+	 * dn_struct_rwlock/w > db_mtx > dn_mtx
+	 * 	dnode_increase_indirection -> dbuf_find
+	 * 	dbuf_hold_impl
+	 * 	dnode_set_bonus
+	 *
+	 * dn_struct_rwlock/w > dn_mtx
+	 * 	dnode_increase_indirection
+	 *
+	 * dn_dirty_mtx > dn_mtx
+	 * 	dnode_buf_pageout
+	 *
+	 * db_mtx > dn_mtx
+	 * 	dbuf_create
+	 */
+
+	/*
+	 * dn_struct_rwlock protects the structure of the dnode.
+	 * In particular, it protects the number of levels of indirection.
+	 */
+	krwlock_t dn_struct_rwlock;
+
+	/*
+	 * Our link on dataset's dd_dnodes list.
+	 * Protected by dd_accounting_mtx.
+	 */
+	list_node_t dn_link;
+
+	/* immutable: */
+	struct objset_impl *dn_objset;
+	uint64_t dn_object;
+	struct dmu_buf_impl *dn_dbuf;
+	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
+
+	/*
+	 * Copies of stuff in dn_phys.  They're valid here even before
+	 * the dnode is first synced.
+	 */
+	dmu_object_type_t dn_type;	/* object type (immutable) */
+	uint8_t dn_bonustype;		/* bonus type (immutable) */
+	uint16_t dn_bonuslen;		/* bonus length (immutable) */
+	uint8_t dn_nblkptr;		/* number of blkptrs (immutable) */
+	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
+	uint32_t dn_datablksz;		/* in bytes */
+	uint16_t dn_datablkszsec;	/* in 512b sectors */
+
+	uint8_t dn_checksum;		/* ZIO_CHECKSUM type */
+	uint8_t dn_compress;		/* ZIO_COMPRESS type */
+
+	/*
+	 * The following are kept up-to-date in the *open* context, the syncing
+	 * context should only pay attention to the dn_next_* values.
+	 */
+	uint8_t dn_nlevels;
+	uint8_t dn_indblkshift;
+
+	uint8_t dn_next_nlevels[TXG_SIZE];
+	uint8_t dn_next_indblkshift[TXG_SIZE];
+
+	/* protected by os_lock: */
+	uint32_t dn_dirtyblksz[TXG_SIZE];	/* dirty block size in bytes */
+	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
+
+	/* protected by dn_mtx: */
+	kmutex_t dn_mtx;
+	list_t dn_dirty_dbufs[TXG_SIZE];
+	uint64_t dn_maxblkid;
+	avl_tree_t dn_ranges[TXG_SIZE];
+	uint64_t dn_allocated_txg;
+	uint64_t dn_free_txg;
+	uint64_t dn_assigned_txg;
+	struct dmu_tx *dn_assigned_tx;		/* if only one tx cares */
+	kcondvar_t dn_notxholds;
+	enum dnode_dirtycontext dn_dirtyctx;
+	uint8_t *dn_dirtyctx_firstset;		/* dbg: contents meaningless */
+
+	/* protected by own devices */
+	refcount_t dn_tx_holds;
+	refcount_t dn_holds;
+
+	kmutex_t dn_dbufs_mtx;
+	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
+	kcondvar_t dn_evicted;		/* a child dbuf has been evicted */
+
+	/*
+	 * Performance hack: whenever we have a hold on the bonus buffer of a
+	 * ZAP object, we will also have a hold on db0.  This will keep the
+	 * meta-data for a micro-zap object cached as long as the znode for the
+	 * object is in the znode cache.
+	 */
+	struct dmu_buf_impl *dn_db0;
+
+	/* holds prefetch structure */
+	struct zfetch	dn_zfetch;
+} dnode_t;
+
+typedef struct free_range {
+	avl_node_t fr_node;
+	uint64_t fr_blkid;
+	uint64_t fr_nblks;
+} free_range_t;
+
+dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+    uint64_t object);
+void dnode_special_close(dnode_t *dn);
+
+dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
+dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+    void *ref);
+void dnode_add_ref(dnode_t *dn, void *ref);
+void dnode_rele(dnode_t *dn, void *ref);
+void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
+int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
+void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+void dnode_free(dnode_t *dn, dmu_tx_t *tx);
+void dnode_byteswap(dnode_phys_t *dnp);
+void dnode_buf_byteswap(void *buf, size_t size);
+void dnode_verify(dnode_t *dn);
+int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
+uint64_t dnode_current_max_length(dnode_t *dn);
+uint64_t dnode_max_nonzero_offset(dnode_t *dn);
+void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
+void dnode_clear_range(dnode_t *dn, uint64_t blkid,
+    uint64_t nblks, dmu_tx_t *tx);
+void dnode_diduse_space(dnode_t *dn, int64_t space);
+void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx);
+uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
+void dnode_init(void);
+void dnode_fini(void);
+int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
+    uint64_t blkfill);
+
+#ifdef ZFS_DEBUG
+
+/*
+ * Debug printf (gated on ZFS_DEBUG_DPRINTF in zfs_flags) tagged with the
+ * dnode's object number, or "mdn" for the meta dnode.  fmt must be a string
+ * literal: it is pasted after "obj=%s " by adjacency, as gcc rejects ## here.
+ */
+#define	dprintf_dnode(dn, fmt, ...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char __db_buf[32]; \
+	uint64_t __db_obj = (dn)->dn_object; \
+	if (__db_obj == DMU_META_DNODE_OBJECT) \
+		(void) strcpy(__db_buf, "mdn"); \
+	else \
+		(void) snprintf(__db_buf, sizeof (__db_buf), "%llu", \
+		    (u_longlong_t)__db_obj);\
+	dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \
+	    __db_buf, __VA_ARGS__); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define	dprintf_dnode(dn, fmt, ...)
+
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
new file mode 100644
index 000000000000..e56c8a67d9db
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DSL_DATASET_H
+#define	_SYS_DSL_DATASET_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+
+typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
+
+typedef struct dsl_dataset_phys {
+	uint64_t ds_dir_obj;
+	uint64_t ds_prev_snap_obj;
+	uint64_t ds_prev_snap_txg;
+	uint64_t ds_next_snap_obj;
+	uint64_t ds_snapnames_zapobj;	/* zap obj of snaps; ==0 for snaps */
+	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
+	uint64_t ds_creation_time;	/* seconds since 1970 */
+	uint64_t ds_creation_txg;
+	uint64_t ds_deadlist_obj;
+	uint64_t ds_used_bytes;
+	uint64_t ds_compressed_bytes;
+	uint64_t ds_uncompressed_bytes;
+	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
+	/*
+	 * The ds_fsid_guid is a 56-bit ID that can change to avoid
+	 * collisions.  The ds_guid is a 64-bit ID that will never
+	 * change, so there is a small probability that it will collide.
+	 */
+	uint64_t ds_fsid_guid;
+	uint64_t ds_guid;
+	uint64_t ds_restoring; /* boolean */
+	blkptr_t ds_bp;
+	uint64_t ds_pad[8]; /* pad to 320 bytes, given a 128-byte blkptr_t */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+	/* Immutable: */
+	struct dsl_dir *ds_dir;
+	dsl_dataset_phys_t *ds_phys;
+	dmu_buf_t *ds_dbuf;
+	uint64_t ds_object;
+
+	/* only used in syncing context: */
+	struct dsl_dataset *ds_prev; /* only valid for non-snapshots */
+
+	/* has internal locking: */
+	bplist_t ds_deadlist;
+
+	/* protected by lock on pool's dp_dirty_datasets list */
+	txg_node_t ds_dirty_link;
+	list_node_t ds_synced_link;
+
+	/*
+	 * ds_phys->ds_<accounting> is also protected by ds_lock.
+	 * Protected by ds_lock:
+	 */
+	kmutex_t ds_lock;
+	void *ds_user_ptr;
+	dsl_dataset_evict_func_t *ds_user_evict_func;
+	uint64_t ds_open_refcount;
+
+	/* Protected by ds_lock; keep at end of struct for better locality */
+	char ds_snapname[MAXNAMELEN];
+} dsl_dataset_t;
+
+#define	dsl_dataset_is_snapshot(ds)	\
+	((ds)->ds_phys->ds_num_children != 0)
+
+int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
+    void *tag, dsl_dataset_t **dsp);
+int dsl_dataset_open(const char *name, int mode, void *tag,
+    dsl_dataset_t **dsp);
+dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+    const char *tail, int mode, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
+int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
+    const char *lastname, dsl_dataset_t *clone_parent, dmu_tx_t *tx);
+int dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_destroy(const char *name);
+int dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rollback(const char *name);
+int dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+int dsl_dataset_rename(const char *name, const char *newname);
+
+void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
+    void *p, dsl_dataset_evict_func_t func);
+void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
+
+void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp);
+void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+
+spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
+
+void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
+
+void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth,
+    dmu_tx_t *tx);
+
+void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
+void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds);
+struct dsl_pool *dsl_dataset_pool(dsl_dataset_t *ds);
+
+void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
+    dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define	dprintf_ds(ds, fmt, ...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+	dsl_dataset_name(ds, __ds_name); \
+	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
+	kmem_free(__ds_name, MAXNAMELEN); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+#else	/* !ZFS_DEBUG */
+#define	dprintf_ds(ds, fmt, ...)
+#endif	/* ZFS_DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DATASET_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
new file mode 100644
index 000000000000..0499d731e6c7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -0,0 +1,143 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DSL_DIR_H
+#define	_SYS_DSL_DIR_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/refcount.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+typedef struct dsl_dir_phys {
+	uint64_t dd_creation_time;
+	uint64_t dd_head_dataset_obj;
+	uint64_t dd_parent_obj;
+	uint64_t dd_clone_parent_obj;
+	uint64_t dd_child_dir_zapobj;
+	/*
+	 * how much space our children are accounting for; for leaf
+	 * datasets, == physical space used by fs + snaps
+	 */
+	uint64_t dd_used_bytes;
+	uint64_t dd_compressed_bytes;
+	uint64_t dd_uncompressed_bytes;
+	/* Administrative quota setting */
+	uint64_t dd_quota;
+	/* Administrative reservation setting */
+	uint64_t dd_reserved;
+	uint64_t dd_props_zapobj;
+	uint64_t dd_pad[21]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+struct dsl_dir {
+	/* These are immutable; no lock needed: */
+	uint64_t dd_object;
+	dsl_dir_phys_t *dd_phys;
+	dmu_buf_t *dd_dbuf;
+	dsl_pool_t *dd_pool;
+
+	/* protected by lock on pool's dp_dirty_dirs list */
+	txg_node_t dd_dirty_link;
+
+	/* protected by dp_config_rwlock */
+	dsl_dir_t *dd_parent;
+
+	/* Protected by dd_lock */
+	kmutex_t dd_lock;
+	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+	/* Thing to do when we sync */
+	uint64_t dd_sync_txg;
+	int (*dd_sync_func)(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+	void *dd_sync_arg;
+	int dd_sync_err;
+
+	/* Accounting */
+	/* reflects any changes to dd_phys->dd_used_bytes made this syncing */
+	int64_t dd_used_bytes;
+	/* int64_t dd_compressed_bytes; */
+	/* int64_t dd_uncompressed_bytes; */
+	/* gross estimate of space used by in-flight tx's */
+	uint64_t dd_tempreserved[TXG_SIZE];
+	/* amount of space we expect to write; == amount of dirty data */
+	int64_t dd_space_towrite[TXG_SIZE];
+
+	/* protected by dd_lock; keep at end of struct for better locality */
+	char dd_myname[MAXNAMELEN];
+};
+
+void dsl_dir_close(dsl_dir_t *dd, void *tag);
+dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail);
+dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+    const char **tailp);
+dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+    const char *tail, void *tag);
+void dsl_dir_name(dsl_dir_t *dd, char *buf);
+int dsl_dir_is_private(dsl_dir_t *dd);
+int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
+void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+int dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx);
+void dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds);
+void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
+void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
+int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
+    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
+void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
+void dsl_dir_diduse_space(dsl_dir_t *dd,
+    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
+int dsl_dir_sync_task(dsl_dir_t *dd,
+    int (*func)(dsl_dir_t *, void*, dmu_tx_t *), void *arg, uint64_t space);
+int dsl_dir_set_quota(const char *ddname, uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx);
+
+#ifdef ZFS_DEBUG
+#define	dprintf_dd(dd, fmt, ...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char *__dd_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+	dsl_dir_name(dd, __dd_name); \
+	dprintf("dd=%s " fmt, __dd_name, __VA_ARGS__); \
+	kmem_free(__dd_name, MAXNAMELEN); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+#else	/* !ZFS_DEBUG */
+#define	dprintf_dd(dd, fmt, ...)
+#endif	/* ZFS_DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
new file mode 100644
index 000000000000..4fca4548ad07
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DSL_POOL_H
+#define	_SYS_DSL_POOL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/txg_impl.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+
+typedef struct dsl_pool {
+	/* Immutable */
+	spa_t *dp_spa;
+	struct objset *dp_meta_objset;
+	struct dsl_dir *dp_root_dir;
+	struct dsl_dir *dp_mos_dir;
+	uint64_t dp_root_dir_obj;
+
+	/* No lock needed - sync context only */
+	blkptr_t dp_meta_rootbp;
+	list_t dp_synced_objsets;
+
+	/* Has its own locking */
+	tx_state_t dp_tx;
+	txg_list_t dp_dirty_datasets;
+	txg_list_t dp_dirty_dirs;
+
+	/*
+	 * Protects administrative changes (properties, namespace)
+	 * It is only held for write in syncing context.  Therefore
+	 * syncing context does not need to ever have it for read, since
+	 * nobody else could possibly have it for write.
+	 */
+	krwlock_t dp_config_rwlock;
+} dsl_pool_t;
+
+dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg);
+void dsl_pool_close(dsl_pool_t *dp);
+dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
+void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
+void dsl_pool_zil_clean(dsl_pool_t *dp);
+int dsl_pool_sync_context(dsl_pool_t *dp);
+uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_POOL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
new file mode 100644
index 000000000000..ea810b03ab44
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_DSL_PROP_H
+#define	_SYS_DSL_PROP_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+
+/* The callback func may not call into the DMU or DSL! */
+typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+
+#define	DSL_PROP_VALUE_UNDEFINED (-1ULL)
+
+typedef struct dsl_prop_cb_record {
+	list_node_t cbr_node; /* link on dd_prop_cbs */
+	const char *cbr_propname;
+	dsl_prop_changed_cb_t *cbr_func;
+	void *cbr_arg;
+} dsl_prop_cb_record_t;
+
+int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
+    dsl_prop_changed_cb_t *callback, void *cbarg);
+int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
+    dsl_prop_changed_cb_t *callback, void *cbarg);
+
+int dsl_prop_get(const char *ddname, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_string(const char *ddname, const char *propname,
+    char *value, int valuelen, char *setpoint);
+int dsl_prop_get_integer(const char *ddname, const char *propname,
+    uint64_t *valuep, char *setpoint);
+int dsl_prop_get_ds_integer(dsl_dir_t *dd, const char *propname,
+    uint64_t *valuep, char *setpoint);
+
+int dsl_prop_set(const char *ddname, const char *propname,
+    int intsz, int numints, const void *buf);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DSL_PROP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab.h b/usr/src/uts/common/fs/zfs/sys/metaslab.h
new file mode 100644
index 000000000000..e592b388fd1e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define	_SYS_METASLAB_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct metaslab_class metaslab_class_t;
+typedef struct metaslab_group metaslab_group_t;
+
+extern void metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
+    metaslab_t **mspp, uint64_t offset, uint64_t size, uint64_t txg);
+extern void metaslab_fini(metaslab_t *msp);
+extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+
+extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg);
+extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
+
+extern metaslab_class_t *metaslab_class_create(void);
+extern void metaslab_class_destroy(metaslab_class_t *mc);
+extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
+extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+
+extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
+    vdev_t *vd);
+extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp,
+    uint64_t weight);
+extern void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp);
+extern void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp,
+    uint64_t weight);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_METASLAB_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
new file mode 100644
index 000000000000..5b1e38872711
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define	_SYS_METASLAB_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct metaslab_class {
+	metaslab_group_t	*mc_rotor;
+	uint64_t		mc_allocated;
+};
+
+struct metaslab_group {
+	kmutex_t		mg_lock;
+	avl_tree_t		mg_metaslab_tree;
+	uint64_t		mg_aliquot;
+	int64_t			mg_bias;
+	metaslab_class_t	*mg_class;
+	vdev_t			*mg_vd;
+	metaslab_group_t	*mg_prev;
+	metaslab_group_t	*mg_next;
+};
+
+/*
+ * Each metaslab's free block list is kept in its own DMU object in the
+ * metaslab freelist dataset.  To minimize space consumption, the list
+ * is circular.
+ *
+ * Allocations and frees can happen in multiple transaction groups at
+ * the same time, which makes it a bit challenging to keep the metaslab
+ * consistent.  For example, we cannot allow frees from different
+ * transaction groups to be interleaved in the metaslab's free block list.
+ *
+ * We address this in several ways:
+ *
+ *	We don't allow allocations from the same metaslab in concurrent
+ *	transaction groups.  metaslab_alloc() enforces this by checking
+ *	the ms_last_alloc field, which specifies the last txg in which
+ *	the metaslab was used for allocations.
+ *
+ *	We can't segregate frees this way because we can't choose which
+ *	DVAs someone wants to free.  So we keep separate in-core freelists
+ *	for each active transaction group.  This in-core data is only
+ *	written to the metaslab's on-disk freelist in metaslab_sync(),
+ *	which solves the interleave problem: we only append frees from
+ *	the syncing txg to the on-disk freelist, so the appends all occur
+ *	in txg order.
+ *
+ *	We cannot allow a block which was freed in a given txg to be
+ *	allocated again until that txg has closed; otherwise, if we
+ *	failed to sync that txg and had to roll back to txg - 1,
+ *	changes in txg + 1 could have overwritten the data.  Therefore,
+ *	we partition the free blocks into "available" and "limbo" states.
+ *	A block is available if the txg in which it was freed has closed;
+ *	until then, the block is in limbo.  Each time metaslab_sync() runs,
+ *	it first adds any limbo blocks to the avail list, clears the limbo
+ *	list, and starts writing the new limbo blocks (i.e. the ones that
+ *	were freed in the syncing txg).
+ */
+
+struct metaslab {
+	kmutex_t	ms_lock;	/* metaslab lock		*/
+	space_map_obj_t	*ms_smo;	/* space map object		*/
+	uint64_t	ms_last_alloc;	/* txg of last alloc		*/
+	uint64_t	ms_usable_end;	/* end of free_obj at last sync	*/
+	uint64_t	ms_usable_space; /* usable space at last sync	*/
+	metaslab_group_t *ms_group;	/* metaslab group		*/
+	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
+	uint64_t	ms_weight;	/* weight vs. others in group	*/
+	uint8_t		ms_dirty[TXG_SIZE];	/* per-txg dirty flags	*/
+	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
+	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
+	space_map_t	ms_map;		/* in-core free space map	*/
+	uint8_t		ms_map_incore;  /* space map contents are valid */
+	uint64_t	ms_map_cursor[SPA_ASIZEBITS]; /* XXX -- PPD	*/
+};
+
+/*
+ * ms_dirty[] flags
+ */
+#define	MSD_ALLOC	0x01	/* allocated from in this txg		*/
+#define	MSD_FREE	0x02	/* freed to in this txg			*/
+#define	MSD_ADD		0x04	/* added to the pool in this txg	*/
+#define	MSD_CONDENSE	0x08	/* condensed in this txg		*/
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_METASLAB_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
new file mode 100644
index 000000000000..f9fffd24430e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_REFCOUNT_H
+#define	_SYS_REFCOUNT_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/list.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * If the reference is held only by the calling function and not any
+ * particular object, use FTAG (which is a string) for the holder_tag.
+ * Otherwise, use the object that holds the reference.
+ */
+#define	FTAG ((void*)__func__)
+
+#if defined(DEBUG) || !defined(_KERNEL)
+typedef struct reference {
+	list_node_t ref_link;
+	void *ref_holder;
+	uint64_t ref_number;
+	uint8_t *ref_removed;
+} reference_t;
+
+typedef struct refcount {
+	kmutex_t rc_mtx;
+	list_t rc_list;
+	list_t rc_removed;
+	int64_t rc_count;
+	int64_t rc_removed_count;
+} refcount_t;
+
+/* Note: refcount_t should be initialized to zero before use. */
+
+void refcount_create(refcount_t *rc);
+void refcount_destroy(refcount_t *rc);
+void refcount_destroy_many(refcount_t *rc, uint64_t number);
+int refcount_is_zero(refcount_t *rc);
+int64_t refcount_count(refcount_t *rc);
+int64_t refcount_add(refcount_t *rc, void *holder_tag);
+int64_t refcount_remove(refcount_t *rc, void *holder_tag);
+int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
+int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+
+void refcount_init(void);
+void refcount_fini(void);
+
+#else /* DEBUG */
+
+typedef struct refcount {
+	uint64_t rc_count;
+} refcount_t;
+
+#define	refcount_create(rc) ((rc)->rc_count = 0)
+#define	refcount_destroy(rc) ((rc)->rc_count = 0)
+#define	refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
+#define	refcount_is_zero(rc) ((rc)->rc_count == 0)
+#define	refcount_count(rc) ((rc)->rc_count)
+#define	refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
+#define	refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define	refcount_add_many(rc, number, holder) \
+	atomic_add_64_nv(&(rc)->rc_count, (number))
+#define	refcount_remove_many(rc, number, holder) \
+	atomic_add_64_nv(&(rc)->rc_count, -(number))
+
+#define	refcount_init()
+#define	refcount_fini()
+
+#endif /* DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_REFCOUNT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
new file mode 100644
index 000000000000..9bf0f89d4945
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define	_SYS_SPA_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/avl.h>
+#include <sys/zfs_context.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Forward references that lots of things need.
+ */
+typedef struct spa spa_t;
+typedef struct vdev vdev_t;
+typedef struct metaslab metaslab_t;
+typedef struct zilog zilog_t;
+typedef struct traverse_handle traverse_handle_t;
+struct dsl_pool;
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
+#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
+#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
+#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
+
+#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
+#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
+/* SET xors in (old ^ val) so only the addressed field changes. */
+#define	BF32_SET(x, low, len, val)	\
+	((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
+#define	BF64_SET(x, low, len, val)	\
+	((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))
+
+#define	BF32_GET_SB(x, low, len, shift, bias)	\
+	((BF32_GET(x, low, len) + (bias)) << (shift))
+#define	BF64_GET_SB(x, low, len, shift, bias)	\
+	((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define	SPA_MINBLOCKSHIFT	9
+#define	SPA_MAXBLOCKSHIFT	17
+#define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
+#define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
+
+#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
+#define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
+#define	SPA_ASIZEBITS		24	/* ASIZE up to 256 times larger	*/
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+	uint64_t	dva_word[2];
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+	uint64_t	zc_word[4];
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ *	64	56	48	40	32	24	16	8	0
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0	|		vdev1		| GRID  |	  ASIZE		|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1	|G|			 offset1				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2	|		vdev2		| GRID  |	  ASIZE		|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3	|G|			 offset2				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4	|		vdev3		| GRID  |	  ASIZE		|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5	|G|			 offset3				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6	|E| lvl | type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7	|			padding					|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8	|			padding					|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9	|			padding					|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * a	|			birth txg				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * b	|			fill count				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * c	|			checksum[0]				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * d	|			checksum[1]				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * e	|			checksum[2]				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * f	|			checksum[3]				|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev		virtual device ID
+ * offset	offset into virtual device
+ * LSIZE	logical size
+ * PSIZE	physical size (after compression)
+ * ASIZE	allocated size (including RAID-Z parity and gang block headers)
+ * GRID		RAID-Z layout information (reserved for future use)
+ * cksum	checksum function
+ * comp		compression function
+ * G		gang block indicator
+ * E		endianness
+ * type		DMU object type
+ * lvl		level of indirection
+ * birth txg	transaction group in which the block was born
+ * fill count	number of non-zero blocks under this bp
+ * checksum[4]	256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+	dva_t		blk_dva[3];	/* 128-bit Data Virtual Address	*/
+	uint64_t	blk_prop;	/* size, compression, type, etc	*/
+	uint64_t	blk_pad[3];	/* Extra space for the future	*/
+	uint64_t	blk_birth;	/* transaction group at birth	*/
+	uint64_t	blk_fill;	/* fill count			*/
+	zio_cksum_t	blk_cksum;	/* 256-bit checksum		*/
+} blkptr_t;
+
+#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
+#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define	DVA_GET_ASIZE(dva)	\
+	BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+#define	DVA_SET_ASIZE(dva, x)	\
+	BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+
+#define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
+#define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
+#define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define	DVA_GET_OFFSET(dva)	\
+	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define	DVA_SET_OFFSET(dva, x)	\
+	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
+#define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define	BP_GET_LSIZE(bp)	\
+	(BP_IS_HOLE(bp) ? 0 : \
+	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+#define	BP_SET_LSIZE(bp, x)	\
+	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	BP_GET_PSIZE(bp)	\
+	BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define	BP_SET_PSIZE(bp, x)	\
+	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	BP_GET_COMPRESS(bp)	BF64_GET((bp)->blk_prop, 32, 8)
+#define	BP_SET_COMPRESS(bp, x)	BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define	BP_GET_CHECKSUM(bp)	BF64_GET((bp)->blk_prop, 40, 8)
+#define	BP_SET_CHECKSUM(bp, x)	BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define	BP_GET_TYPE(bp)		BF64_GET((bp)->blk_prop, 48, 8)
+#define	BP_SET_TYPE(bp, x)	BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define	BP_GET_LEVEL(bp)	BF64_GET((bp)->blk_prop, 56, 5)
+#define	BP_SET_LEVEL(bp, x)	BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define	BP_GET_BYTEORDER(bp)	(0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define	BP_SET_BYTEORDER(bp, x)	BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define	BP_GET_ASIZE(bp)	\
+	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define	DVA_EQUAL(dva1, dva2)	\
+	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
+	(dva1)->dva_word[0] == (dva2)->dva_word[0])
+
+#define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
+
+#define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
+{						\
+	(zcp)->zc_word[0] = w0;			\
+	(zcp)->zc_word[1] = w1;			\
+	(zcp)->zc_word[2] = w2;			\
+	(zcp)->zc_word[3] = w3;			\
+}
+
+#define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
+
+#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
+
+#define	BP_ZERO(bp)				\
+{						\
+	(bp)->blk_dva[0].dva_word[0] = 0;	\
+	(bp)->blk_dva[0].dva_word[1] = 0;	\
+	(bp)->blk_dva[1].dva_word[0] = 0;	\
+	(bp)->blk_dva[1].dva_word[1] = 0;	\
+	(bp)->blk_dva[2].dva_word[0] = 0;	\
+	(bp)->blk_dva[2].dva_word[1] = 0;	\
+	(bp)->blk_prop = 0;			\
+	(bp)->blk_pad[0] = 0;			\
+	(bp)->blk_pad[1] = 0;			\
+	(bp)->blk_pad[2] = 0;			\
+	(bp)->blk_birth = 0;			\
+	(bp)->blk_fill = 0;			\
+	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
+}
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define	ZFS_HOST_BYTEORDER	(0ULL)
+#else
+#define	ZFS_HOST_BYTEORDER	(-1ULL)
+#endif
+
+#define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#include <sys/dmu.h>
+
+/*
+ * Routines found in spa.c
+ */
+
+/* state manipulation functions */
+extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_get_stats(const char *pool, nvlist_t **config);
+extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
+extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
+extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
+extern int spa_destroy(char *pool);
+extern int spa_export(char *pool);
+
+/* device manipulation */
+extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
+extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
+    int replacing);
+extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
+    int replace_done);
+extern void spa_vdev_replace_done(spa_t *spa);
+
+/* scrubbing */
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
+extern void spa_scrub_suspend(spa_t *spa);
+extern void spa_scrub_resume(spa_t *spa);
+extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+
+/* spa syncing */
+extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
+extern void spa_sync_allpools(void);
+
+/*
+ * SPA configuration functions in spa_config.c
+ */
+extern void spa_config_sync(void);
+extern void spa_config_load(void);
+extern nvlist_t *spa_all_configs(uint64_t *);
+extern void spa_config_set(spa_t *spa, nvlist_t *config);
+extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
+    int getstats);
+
+/*
+ * Miscellaneous SPA routines in spa_misc.c
+ */
+
+/* Namespace manipulation */
+extern spa_t *spa_lookup(const char *name);
+extern spa_t *spa_add(const char *name);
+extern void spa_remove(spa_t *spa);
+extern spa_t *spa_next(spa_t *prev);
+
+/* Refcount functions */
+extern void spa_open_ref(spa_t *spa, void *tag);
+extern void spa_close(spa_t *spa, void *tag);
+extern boolean_t spa_refcount_zero(spa_t *spa);
+
+/* Pool configuration lock */
+extern void spa_config_enter(spa_t *spa, krw_t rw);
+extern void spa_config_exit(spa_t *spa);
+extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
+
+/* Pool vdev add/remove lock */
+extern uint64_t spa_vdev_enter(spa_t *spa);
+extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
+
+/* Accessor functions */
+extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
+extern int spa_traverse_wanted(spa_t *spa);
+extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
+extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
+extern void spa_altroot(spa_t *, char *, size_t);
+extern int spa_sync_pass(spa_t *spa);
+extern char *spa_name(spa_t *spa);
+extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_last_synced_txg(spa_t *spa);
+extern uint64_t spa_first_txg(spa_t *spa);
+extern int spa_state(spa_t *spa);
+extern uint64_t spa_freeze_txg(spa_t *spa);
+struct metaslab_class;
+extern struct metaslab_class *spa_metaslab_class_select(spa_t *spa);
+extern uint64_t spa_get_alloc(spa_t *spa);
+extern uint64_t spa_get_space(spa_t *spa);
+extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern int spa_busy(void);
+
+/* Miscellaneous support routines */
+extern int spa_rename(const char *oldname, const char *newname);
+extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
+extern char *spa_strdup(const char *);
+extern void spa_strfree(char *);
+extern uint64_t spa_get_random(uint64_t range);
+extern void sprintf_blkptr(char *buf, blkptr_t *bp);
+extern void spa_freeze(spa_t *spa);
+extern void spa_evict_all(void);
+
+/* Initialization and termination */
+extern void spa_init(int flags);
+extern void spa_fini(void);
+
+#ifdef ZFS_DEBUG
+#define	dprintf_bp(bp, fmt, ...) do {			\
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char __blkbuf[200];				\
+	sprintf_blkptr(__blkbuf, (bp));			\
+	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);	\
+	} \
+_NOTE(CONSTCOND) } while (0)
+#else
+#define	dprintf_bp(bp, fmt, ...)
+#endif
+
+extern int spa_mode;			/* mode, e.g. FREAD | FWRITE */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SPA_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
new file mode 100644
index 000000000000..0fcef6c48b21
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_IMPL_H
+#define	_SYS_SPA_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/uberblock_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/refcount.h>
+#include <sys/bplist.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct spa_config_lock {
+	kmutex_t	scl_lock;
+	uint64_t	scl_count;
+	kthread_t	*scl_writer;
+	kcondvar_t	scl_cv;
+} spa_config_lock_t;
+
+struct spa {
+	/*
+	 * Fields protected by spa_namespace_lock.
+	 */
+	char		*spa_name;
+	avl_node_t	spa_avl;
+	int		spa_anon;
+	nvlist_t	*spa_config;
+	uint64_t	spa_config_txg;		/* txg of last config change */
+	spa_config_lock_t spa_config_lock;	/* configuration changes */
+	kmutex_t	spa_config_cache_lock;	/* for spa_config RW_READER */
+	int		spa_sync_pass;		/* iterate-to-convergence */
+	int		spa_state;		/* pool state */
+	uint8_t		spa_minref;		/* min refcnt of open pool */
+	uint8_t		spa_traverse_wanted;	/* traverse lock wanted */
+	taskq_t		*spa_vdev_retry_taskq;
+	taskq_t		*spa_zio_issue_taskq[ZIO_TYPES];
+	taskq_t		*spa_zio_intr_taskq[ZIO_TYPES];
+	dsl_pool_t	*spa_dsl_pool;
+	metaslab_class_t *spa_normal_class;	/* normal data class */
+	uint64_t	spa_first_txg;		/* first txg after spa_open() */
+	uint64_t	spa_freeze_txg;		/* freeze pool at this txg */
+	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
+	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
+	vdev_t		*spa_root_vdev;		/* top-level vdev container */
+	list_t		spa_dirty_list;		/* vdevs with dirty labels */
+	uint64_t	spa_config_object;	/* MOS object for pool config */
+	uint64_t	spa_syncing_txg;	/* txg currently syncing */
+	uint64_t	spa_sync_bplist_obj;	/* object for deferred frees */
+	bplist_t	spa_sync_bplist;	/* deferred-free bplist */
+	krwlock_t	spa_traverse_lock;	/* traverse vs. spa_sync() */
+	uberblock_t	spa_ubsync;		/* last synced uberblock */
+	uberblock_t	spa_uberblock;		/* current uberblock */
+	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
+	kthread_t	*spa_scrub_thread;	/* scrub/resilver thread */
+	traverse_handle_t *spa_scrub_th;	/* scrub traverse handle */
+	uint64_t	spa_scrub_restart_txg;	/* need to restart */
+	uint64_t	spa_scrub_maxtxg;	/* max txg we'll scrub */
+	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
+	uint64_t	spa_scrub_errors;	/* scrub I/O error count */
+	kcondvar_t	spa_scrub_cv;		/* scrub thread state change */
+	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
+	uint8_t		spa_scrub_stop;		/* tell scrubber to stop */
+	uint8_t		spa_scrub_suspend;	/* tell scrubber to suspend */
+	uint8_t		spa_scrub_active;	/* active or suspended? */
+	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
+	int		spa_sync_on;		/* sync threads are running */
+	char		*spa_root;		/* alternate root directory */
+	kmutex_t	spa_uberblock_lock;	/* vdev_uberblock_load_done() */
+	/*
+	 * spa_refcount must be the last element because its size depends on
+	 * compilation options.  In order for the MDB module to function
+	 * correctly, the other fields must remain in the same location.
+	 */
+	refcount_t	spa_refcount;		/* number of opens */
+};
+
+extern const char *spa_config_dir;
+extern kmutex_t spa_namespace_lock;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SPA_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h
new file mode 100644
index 000000000000..9f0cf83c9aaa
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPACE_MAP_H
+#define	_SYS_SPACE_MAP_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct space_map {
+	avl_tree_t	sm_root;	/* Root of the AVL tree */
+	uint64_t	sm_start;	/* Start of map (inclusive) */
+	uint64_t	sm_end;		/* End of map (exclusive) */
+	uint64_t	sm_size;	/* Size of map (end - start) */
+	uint64_t	sm_shift;	/* Unit shift */
+	uint64_t	sm_space;	/* Sum of all segments in the map */
+	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
+} space_map_t;
+
+typedef struct space_seg {
+	avl_node_t	ss_node;	/* AVL node */
+	uint64_t	ss_start;	/* starting offset of this segment */
+	uint64_t	ss_end;		/* ending offset (non-inclusive) */
+} space_seg_t;
+
+typedef struct space_map_obj {
+	uint64_t	smo_object;	/* on-disk space map object */
+	uint64_t	smo_objsize;	/* size of the object */
+	uint64_t	smo_alloc;	/* space allocated from the map */
+} space_map_obj_t;
+
+/*
+ * debug entry
+ *
+ *    1      3         10                     50
+ *  ,---+--------+------------+---------------------------------.
+ *  | 1 | action |  syncpass  |        txg (lower bits)         |
+ *  `---+--------+------------+---------------------------------'
+ *   63  62    60 59        50 49                               0
+ *
+ *
+ *
+ * non-debug entry
+ *
+ *    1               47                   1           15
+ *  ,-----------------------------------------------------------.
+ *  | 0 |   offset (sm_shift units)    | type |       run       |
+ *  `-----------------------------------------------------------'
+ *   63  62                          16   15   14               0
+ */
+
+/* All this stuff takes and returns bytes */
+#define	SM_RUN_DECODE(x)	(BF64_DECODE(x, 0, 15) + 1)
+#define	SM_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, 0, 15)
+#define	SM_TYPE_DECODE(x)	BF64_DECODE(x, 15, 1)
+#define	SM_TYPE_ENCODE(x)	BF64_ENCODE(x, 15, 1)
+#define	SM_OFFSET_DECODE(x)	BF64_DECODE(x, 16, 47)
+#define	SM_OFFSET_ENCODE(x)	BF64_ENCODE(x, 16, 47)
+#define	SM_DEBUG_DECODE(x)	BF64_DECODE(x, 63, 1)
+#define	SM_DEBUG_ENCODE(x)	BF64_ENCODE(x, 63, 1)
+
+#define	SM_DEBUG_ACTION_DECODE(x)	BF64_DECODE(x, 60, 3)
+#define	SM_DEBUG_ACTION_ENCODE(x)	BF64_ENCODE(x, 60, 3)
+
+#define	SM_DEBUG_SYNCPASS_DECODE(x)	BF64_DECODE(x, 50, 10)
+#define	SM_DEBUG_SYNCPASS_ENCODE(x)	BF64_ENCODE(x, 50, 10)
+
+#define	SM_DEBUG_TXG_DECODE(x)		BF64_DECODE(x, 0, 50)
+#define	SM_DEBUG_TXG_ENCODE(x)		BF64_ENCODE(x, 0, 50)
+
+#define	SM_RUN_MAX			SM_RUN_DECODE(~0ULL)
+
+#define	SM_ALLOC	0x0
+#define	SM_FREE		0x1
+
+/*
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ * This could use a lot more research, but for now, set the freelist
+ * block size to 4k (2^12).
+ */
+#define	SPACE_MAP_BLOCKSHIFT	12
+
+#define	SPACE_MAP_CHUNKSIZE	(1<<20)
+
+typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+
+extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
+    uint64_t shift, kmutex_t *lp);
+extern void space_map_destroy(space_map_t *sm);
+extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
+extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_vacate(space_map_t *sm,
+    space_map_func_t *func, space_map_t *mdest);
+extern void space_map_iterate(space_map_t *sm,
+    space_map_func_t *func, space_map_t *mdest);
+extern void space_map_merge(space_map_t *dest, space_map_t *src);
+extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
+extern void space_map_union(space_map_t *smd, space_map_t *sms);
+
+extern int space_map_load(space_map_t *sm, space_map_obj_t *smo,
+    uint8_t maptype, objset_t *os, uint64_t end, uint64_t space);
+extern void space_map_sync(space_map_t *sm, space_map_t *dest,
+    space_map_obj_t *smo, uint8_t maptype, objset_t *os, dmu_tx_t *tx);
+extern void space_map_write(space_map_t *sm, space_map_obj_t *smo,
+    objset_t *os, dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SPACE_MAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg.h b/usr/src/uts/common/fs/zfs/sys/txg.h
new file mode 100644
index 000000000000..dae129c2e5a4
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_H
+#define	_SYS_TXG_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	TXG_CONCURRENT_STATES	3	/* open, quiescing, syncing	*/
+#define	TXG_SIZE		4		/* next power of 2	*/
+#define	TXG_MASK		(TXG_SIZE - 1)	/* mask for size	*/
+#define	TXG_INITIAL		TXG_SIZE	/* initial txg 		*/
+#define	TXG_IDX			(txg & TXG_MASK) /* needs local 'txg'	*/
+
+#define	TXG_WAIT		1ULL
+#define	TXG_NOWAIT		2ULL
+
+typedef struct tx_cpu tx_cpu_t;
+
+typedef struct txg_handle {
+	tx_cpu_t	*th_cpu;
+	uint64_t	th_txg;
+} txg_handle_t;
+
+typedef struct txg_node {
+	struct txg_node	*tn_next[TXG_SIZE];
+	uint8_t		tn_member[TXG_SIZE];
+} txg_node_t;
+
+typedef struct txg_list {
+	kmutex_t	tl_lock;
+	size_t		tl_offset;
+	txg_node_t	*tl_head[TXG_SIZE];
+} txg_list_t;
+
+struct dsl_pool;
+
+extern void txg_init(struct dsl_pool *dp, uint64_t txg);
+extern void txg_fini(struct dsl_pool *dp);
+extern void txg_sync_start(struct dsl_pool *dp);
+extern void txg_sync_stop(struct dsl_pool *dp);
+extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp);
+extern void txg_rele_to_quiesce(txg_handle_t *txghp);
+extern void txg_rele_to_sync(txg_handle_t *txghp);
+extern void txg_suspend(struct dsl_pool *dp);
+extern void txg_resume(struct dsl_pool *dp);
+
+/*
+ * Wait until the given transaction group has finished syncing.
+ * Try to make this happen as soon as possible (eg. kick off any
+ * necessary syncs immediately).  If txg==0, wait for the currently open
+ * txg to finish syncing.
+ */
+extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Wait until the given transaction group, or one after it, is
+ * the open transaction group.  Try to make this happen as soon
+ * as possible (eg. kick off any necessary syncs immediately).
+ * If txg == 0, wait for the next open txg.
+ */
+extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg);
+
+/*
+ * Returns TRUE if we are "backed up" waiting for the syncing
+ * transaction to complete; otherwise returns FALSE.
+ */
+extern int txg_stalled(struct dsl_pool *dp);
+
+/*
+ * Per-txg object lists.
+ */
+
+#define	TXG_CLEAN(txg)	((txg) - 1)
+
+extern void txg_list_create(txg_list_t *tl, size_t offset);
+extern void txg_list_destroy(txg_list_t *tl);
+extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
+extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
+extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_TXG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
new file mode 100644
index 000000000000..45a138afaac3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TXG_IMPL_H
+#define	_SYS_TXG_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/txg.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct tx_cpu {
+	kmutex_t	tc_lock;
+	kcondvar_t	tc_cv[TXG_SIZE];
+	uint64_t	tc_count[TXG_SIZE];
+	char		tc_pad[16];
+};
+
+typedef struct tx_state {
+	tx_cpu_t	*tx_cpu;	/* protects right to enter txg	*/
+	kmutex_t	tx_sync_lock;	/* protects tx_state_t */
+	krwlock_t	tx_suspend;
+	uint64_t	tx_open_txg;	/* currently open txg id */
+	uint64_t	tx_quiesced_txg; /* quiesced txg waiting for sync */
+	uint64_t	tx_syncing_txg;	/* currently syncing txg id */
+	uint64_t	tx_synced_txg;	/* last synced txg id */
+
+	uint64_t	tx_sync_txg_waiting; /* txg we're waiting to sync */
+	uint64_t	tx_quiesce_txg_waiting; /* txg we're waiting to open */
+
+	kcondvar_t	tx_sync_more_cv;
+	kcondvar_t	tx_sync_done_cv;
+	kcondvar_t	tx_quiesce_more_cv;
+	kcondvar_t	tx_quiesce_done_cv;
+	kcondvar_t	tx_timeout_exit_cv;
+	kcondvar_t	tx_exit_cv;	/* wait for all threads to exit */
+
+	uint8_t		tx_threads;	/* number of threads */
+	uint8_t		tx_exiting;	/* set when we're exiting */
+
+	kthread_t	*tx_sync_thread;
+	kthread_t	*tx_quiesce_thread;
+	kthread_t	*tx_timelimit_thread;
+} tx_state_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_TXG_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock.h b/usr/src/uts/common/fs/zfs/sys/uberblock.h
new file mode 100644
index 000000000000..93d936ae4b18
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_H
+#define	_SYS_UBERBLOCK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct uberblock uberblock_t;
+
+extern int uberblock_verify(uberblock_t *ub);
+extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_UBERBLOCK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
new file mode 100644
index 000000000000..5bfcea097ddf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock_impl.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define	_SYS_UBERBLOCK_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/uberblock.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved.  When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked.  If the ub_version field is moved, we may not detect
+ * version mismatch.  If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+
+#define	UBERBLOCK_SHIFT		(10)
+#define	UBERBLOCK_SIZE		(1ULL << UBERBLOCK_SHIFT)
+
+#define	UBERBLOCK_MAGIC		0x00bab10c		/* oo-ba-bloc!	*/
+
+#define	UBERBLOCK_VERSION	1ULL
+
+struct uberblock {
+	uint64_t	ub_magic;	/* UBERBLOCK_MAGIC		*/
+	uint64_t	ub_version;	/* UBERBLOCK_VERSION		*/
+	uint64_t	ub_txg;		/* txg of last sync		*/
+	uint64_t	ub_guid_sum;	/* sum of all vdev guids	*/
+	uint64_t	ub_timestamp;	/* UTC time of last sync	*/
+	blkptr_t	ub_rootbp;	/* MOS objset_phys_t		*/
+};
+
+typedef struct uberblock_phys {
+	uberblock_t	ubp_uberblock;
+	char		ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
+	    sizeof (zio_block_tail_t)];
+	zio_block_tail_t ubp_zbt;
+} uberblock_phys_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_UBERBLOCK_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h
new file mode 100644
index 000000000000..c8c177e3ca6c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/unique.h
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_UNIQUE_H
+#define	_SYS_UNIQUE_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* The number of significant bits in each unique value. */
+#define	UNIQUE_BITS	56
+
+void unique_init(void);
+
+/* Return a new unique value. */
+uint64_t unique_create(void);
+
+/* Return a unique value, which equals the one passed in if possible. */
+uint64_t unique_insert(uint64_t value);
+
+/* Indicate that this value no longer needs to be uniquified against. */
+void unique_remove(uint64_t value);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_UNIQUE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
new file mode 100644
index 000000000000..4113ff2ca607
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_H
+#define	_SYS_VDEV_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+#include <sys/space_map.h>
+#include <sys/fs/zfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Vdev knobs.
+ */
+typedef struct vdev_knob {
+	char		*vk_name;		/* knob name		*/
+	char		*vk_desc;		/* knob description	*/
+	uint64_t	vk_min;			/* minimum legal value	*/
+	uint64_t	vk_max;			/* maximum legal value	*/
+	uint64_t	vk_default;		/* default value	*/
+	size_t		vk_offset;		/* offset into vdev_t	*/
+} vdev_knob_t;
+
+/*
+ * Fault injection modes.
+ */
+#define	VDEV_FAULT_NONE		0
+#define	VDEV_FAULT_RANDOM	1
+#define	VDEV_FAULT_COUNT	2
+
+extern int vdev_open(vdev_t *);
+extern void vdev_close(vdev_t *);
+extern int vdev_create(vdev_t *, uint64_t txg);
+extern void vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *, zio_t **zq);
+
+extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
+extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path);
+extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
+extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+    int scrub_done);
+
+extern const char *vdev_description(vdev_t *vd);
+
+extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern void vdev_metaslab_fini(vdev_t *vd);
+
+extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_stat_update(zio_t *zio);
+extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
+    boolean_t complete);
+extern void vdev_checksum_error(zio_t *zio, vdev_t *vd);
+extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux);
+
+extern void vdev_space_update(vdev_t *vd, uint64_t space_delta,
+    uint64_t alloc_delta);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern void vdev_io_start(zio_t *zio);
+extern void vdev_io_done(zio_t *zio);
+
+extern int vdev_online(spa_t *spa, const char *path);
+extern int vdev_offline(spa_t *spa, const char *path);
+
+extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask,
+    uint64_t arg);
+extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
+extern int vdev_is_dead(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern int vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+
+extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+
+extern nvlist_t *vdev_config_generate(vdev_t *vd, int getstats);
+
+/*
+ * Label routines
+ */
+struct uberblock;		/* defined in <sys/uberblock_impl.h> */
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd);
+extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+extern int vdev_label_init(vdev_t *vd, uint64_t create_txg);
+extern int spa_sync_labels(spa_t *spa, uint64_t txg);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_VDEV_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_disk.h b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
new file mode 100644
index 000000000000..95536a77db9a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_disk.h
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_DISK_H
+#define	_SYS_VDEV_DISK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/vdev.h>
+#ifdef _KERNEL
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_disk {
+	ddi_devid_t	vd_devid;
+	char		*vd_minor;
+	ldi_handle_t	vd_lh;
+} vdev_disk_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_VDEV_DISK_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_file.h b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
new file mode 100644
index 000000000000..cd496735778c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_file.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_FILE_H
+#define	_SYS_VDEV_FILE_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/vdev.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct vdev_file {
+	vnode_t		*vf_vnode;
+} vdev_file_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_VDEV_FILE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
new file mode 100644
index 000000000000..4ae346761916
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -0,0 +1,287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define	_SYS_VDEV_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+#include <sys/metaslab.h>
+#include <sys/nvpair.h>
+#include <sys/space_map.h>
+#include <sys/vdev.h>
+#include <sys/dkio.h>
+#include <sys/uberblock_impl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Virtual device descriptors.
+ *
+ * All storage pool operations go through the virtual device framework,
+ * which provides data replication and I/O scheduling.
+ */
+
+/*
+ * Forward declarations that lots of things need.
+ */
+typedef struct vdev_queue vdev_queue_t;
+typedef struct vdev_cache vdev_cache_t;
+typedef struct vdev_cache_entry vdev_cache_entry_t;
+
+/*
+ * Virtual device operations
+ */
+typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef void	vdev_close_func_t(vdev_t *vd);
+typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
+typedef void	vdev_io_start_func_t(zio_t *zio);
+typedef void	vdev_io_done_func_t(zio_t *zio);
+typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
+
+typedef struct vdev_ops {
+	vdev_open_func_t		*vdev_op_open;
+	vdev_close_func_t		*vdev_op_close;
+	vdev_asize_func_t		*vdev_op_asize;
+	vdev_io_start_func_t		*vdev_op_io_start;
+	vdev_io_done_func_t		*vdev_op_io_done;
+	vdev_state_change_func_t	*vdev_op_state_change;
+	char				vdev_op_type[16];
+	boolean_t			vdev_op_leaf;
+} vdev_ops_t;
+
+/*
+ * Virtual device properties
+ */
+struct vdev_cache_entry {
+	char		*ve_data;
+	uint64_t	ve_offset;
+	uint64_t	ve_lastused;
+	avl_node_t	ve_offset_node;
+	avl_node_t	ve_lastused_node;
+	uint32_t	ve_hits;
+	uint16_t	ve_missed_update;
+	zio_t		*ve_fill_io;
+};
+
+struct vdev_cache {
+	uint64_t	vc_size;
+	uint64_t	vc_bshift;
+	uint64_t	vc_blocksize;
+	uint64_t	vc_max;
+	avl_tree_t	vc_offset_tree;
+	avl_tree_t	vc_lastused_tree;
+	kmutex_t	vc_lock;
+};
+
+struct vdev_queue {
+	uint64_t	vq_min_pending;
+	uint64_t	vq_max_pending;
+	uint64_t	vq_agg_limit;
+	uint64_t	vq_time_shift;
+	uint64_t	vq_ramp_rate;
+	avl_tree_t	vq_deadline_tree;
+	avl_tree_t	vq_read_tree;
+	avl_tree_t	vq_write_tree;
+	avl_tree_t	vq_pending_tree;
+	kmutex_t	vq_lock;
+};
+
+/*
+ * Virtual device descriptor
+ */
+struct vdev {
+	/*
+	 * Common to all vdev types.
+	 */
+	uint64_t	vdev_id;	/* child number in vdev parent	*/
+	uint64_t	vdev_guid;	/* unique ID for this vdev	*/
+	uint64_t	vdev_guid_sum;	/* self guid + all child guids	*/
+	uint64_t	vdev_asize;	/* allocatable device capacity	*/
+	uint64_t	vdev_ashift;	/* block alignment shift	*/
+	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
+	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
+	spa_t		*vdev_spa;	/* spa for this vdev		*/
+	void		*vdev_tsd;	/* type-specific data		*/
+	vdev_t		*vdev_top;	/* top-level vdev		*/
+	vdev_t		*vdev_parent;	/* parent vdev			*/
+	vdev_t		**vdev_child;	/* array of children		*/
+	uint64_t	vdev_children;	/* number of children		*/
+	space_map_t	vdev_dtl_map;	/* dirty time log in-core state	*/
+	space_map_t	vdev_dtl_scrub;	/* DTL for scrub repair writes	*/
+	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
+
+	/*
+	 * Top-level vdev state.
+	 */
+	uint64_t	vdev_ms_array;	/* metaslab array object	*/
+	uint64_t	vdev_ms_shift;	/* metaslab size shift		*/
+	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
+	metaslab_group_t *vdev_mg;	/* metaslab group		*/
+	metaslab_t	**vdev_ms;	/* metaslab array		*/
+	space_map_obj_t	*vdev_smo;	/* metaslab space map array	*/
+	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
+	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
+	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
+	uint8_t		vdev_dirty[TXG_SIZE]; /* per-txg dirty flags	*/
+	int		vdev_is_dirty;	/* on config dirty list?	*/
+	list_node_t	vdev_dirty_node; /* config dirty list		*/
+	zio_t		*vdev_io_retry;	/* I/O retry list		*/
+	list_t		vdev_io_pending; /* I/O pending list		*/
+
+	/*
+	 * Leaf vdev state.
+	 */
+	uint64_t	vdev_psize;	/* physical device capacity	*/
+	space_map_obj_t	vdev_dtl;	/* dirty time log on-disk state	*/
+	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
+	char		*vdev_path;	/* vdev path (if any)		*/
+	char		*vdev_devid;	/* vdev devid (if any)		*/
+	uint64_t	vdev_fault_arg; /* fault injection parameter	*/
+	int		vdev_fault_mask; /* zio types to fault		*/
+	uint8_t		vdev_fault_mode; /* fault injection mode	*/
+	uint8_t		vdev_cache_active; /* vdev_cache and vdev_queue	*/
+	uint8_t		vdev_offline;	/* device taken offline?	*/
+	uint8_t		vdev_detached;	/* device detached?		*/
+	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
+	vdev_cache_t	vdev_cache;	/* physical block cache		*/
+
+	/*
+	 * For DTrace to work in userland (libzpool) context, these fields must
+	 * remain at the end of the structure.  DTrace will use the kernel's
+	 * CTF definition for 'struct vdev', and since the size of a kmutex_t is
+	 * larger in userland, the offsets for the remaining fields would be
+	 * incorrect.
+	 */
+	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,scrub}		*/
+	kmutex_t	vdev_dirty_lock; /* vdev_dirty[]		*/
+	kmutex_t	vdev_io_lock;	/* vdev_io_pending list		*/
+	kcondvar_t	vdev_io_cv;	/* vdev_io_pending list empty?	*/
+	kmutex_t	vdev_stat_lock;	/* vdev_stat			*/
+};
+
+#define	VDEV_SKIP_SIZE		(8 << 10)
+#define	VDEV_BOOT_HEADER_SIZE	(8 << 10)
+#define	VDEV_PHYS_SIZE		(112 << 10)
+#define	VDEV_UBERBLOCKS		((128 << 10) >> UBERBLOCK_SHIFT)
+
+#define	VDEV_BOOT_MAGIC		0x2f5b007b10cULL /* ZFS boot block	*/
+#define	VDEV_BOOT_VERSION	1		/* version number	*/
+
+typedef struct vdev_boot_header {
+	uint64_t	vb_magic;		/* VDEV_BOOT_MAGIC	*/
+	uint64_t	vb_version;		/* VDEV_BOOT_VERSION	*/
+	uint64_t	vb_offset;		/* start offset	(bytes) */
+	uint64_t	vb_size;		/* size (bytes)		*/
+	char		vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;
+
+typedef struct vdev_phys {
+	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+	zio_block_tail_t vp_zbt;
+} vdev_phys_t;
+
+typedef struct vdev_label {
+	char			vl_pad[VDEV_SKIP_SIZE];		/*   8K	*/
+	vdev_boot_header_t	vl_boot_header;			/*   8K	*/
+	vdev_phys_t		vl_vdev_phys;			/* 112K	*/
+	uberblock_phys_t	vl_uberblock[VDEV_UBERBLOCKS];	/* 128K	*/
+} vdev_label_t;							/* 256K total */
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
+#define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M	*/
+
+/*
+ * vdev_dirty[] flags
+ */
+#define	VDD_ALLOC	0x01	/* allocated from in this txg		*/
+#define	VDD_FREE	0x02	/* freed to in this txg			*/
+#define	VDD_ADD		0x04	/* added to the pool in this txg	*/
+#define	VDD_DTL		0x08	/* dirty time log entry in this txg	*/
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
+#define	VDEV_LABELS		4
+
+#define	VDEV_ALLOC_LOAD		0
+#define	VDEV_ALLOC_ADD		1
+
+/*
+ * Allocate or free a vdev
+ */
+extern vdev_t *vdev_alloc(spa_t *spa, nvlist_t *config, vdev_t *parent,
+    uint_t id, int alloctype);
+extern void vdev_free(vdev_t *vd);
+
+/*
+ * Add or remove children and parents
+ */
+extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
+extern void vdev_compact_children(vdev_t *pvd);
+extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
+extern void vdev_remove_parent(vdev_t *cvd);
+
+/*
+ * vdev sync load and sync
+ */
+extern int vdev_load(vdev_t *vd, int import);
+extern void vdev_sync(vdev_t *vd, uint64_t txg);
+extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
+extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
+
+/*
+ * Available vdev types.
+ */
+extern vdev_ops_t vdev_root_ops;
+extern vdev_ops_t vdev_mirror_ops;
+extern vdev_ops_t vdev_replacing_ops;
+extern vdev_ops_t vdev_raidz_ops;
+extern vdev_ops_t vdev_disk_ops;
+extern vdev_ops_t vdev_file_ops;
+extern vdev_ops_t vdev_missing_ops;
+
+/*
+ * Common asize function
+ */
+extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_VDEV_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
new file mode 100644
index 000000000000..94ad0ffebe3d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -0,0 +1,353 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZAP_H
+#define	_SYS_ZAP_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * ZAP - ZFS Attribute Processor
+ *
+ * The ZAP is a module which sits on top of the DMU (Data Management
+ * Unit) and implements a higher-level storage primitive using DMU
+ * objects.  Its primary consumer is the ZPL (ZFS Posix Layer).
+ *
+ * A "zapobj" is a DMU object which the ZAP uses to store attributes.
+ * Users should use only zap routines to access a zapobj - they should
+ * not access the DMU object directly using DMU routines.
+ *
+ * The attributes stored in a zapobj are name-value pairs.  The name is
+ * a zero-terminated string of up to 256 bytes (including terminating
+ * NUL).  The value is an array of integers (whose length is limited
+ * only by the size of the zapobj).  The integers may be 1, 2, 4, or 8
+ * bytes long.  Note that an 8-byte integer value can be used to store
+ * the location (object number) of another dmu object (which may be
+ * itself a zapobj).  Note that you can use a zero-length attribute to
+ * store a single bit of information - the attribute is present or not.
+ *
+ * The ZAP routines are thread-safe.  However, you must observe the
+ * DMU's restriction that a transaction may not be operated on
+ * concurrently.
+ *
+ * Any of the routines that return an int may return an I/O error (EIO
+ * or ECHECKSUM).
+ *
+ *
+ * Implementation / Performance Notes:
+ *
+ * The ZAP is intended to operate most efficiently on attributes with
+ * short (23 bytes or less) names and short (23 bytes or less) values.
+ * The ZAP should be efficient enough so that the user does not need to
+ * cache these attributes.
+ *
+ * Using extremely long (~256 bytes or more) attribute names or values
+ * will result in poor performance, due to the memcpy from the
+ * user's buffer into the ZAP object.  This penalty can be avoided by
+ * creating an integer-type attribute to store an object number, and
+ * accessing that object using the DMU directly.
+ *
+ * The ZAP's locking scheme makes its routines thread-safe.  Operations
+ * on different zapobjs will be processed concurrently.  Operations on
+ * the same zapobj which only read data will be processed concurrently.
+ * Operations on the same zapobj which modify data will be processed
+ * concurrently when there are many attributes in the zapobj (because
+ * the ZAP uses per-block locking - more than 32 * (number of cpus)
+ * small attributes will suffice).
+ */
+
+/*
+ * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C
+ * strings) for the names of attributes, rather than a byte string
+ * bounded by an explicit length.  If some day we want to support names
+ * in character sets which have embedded zeros (eg. UTF-16, UTF-32),
+ * we'll have to add routines for using length-bounded strings.
+ */
+
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Create a new zapobj with no attributes and return its object number.
+ */
+uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * Create a new zapobj with no attributes from the given (unallocated)
+ * object number.
+ */
+int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+
+/*
+ * The zapobj passed in must be a valid ZAP object for all of the
+ * following routines.
+ */
+
+/*
+ * Destroy this zapobj and all its attributes.
+ *
+ * Frees the object number using dmu_object_free.
+ */
+int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate attributes.
+ *
+ * 'integer_size' is in bytes, and must be 1, 2, 4, or 8.
+ */
+
+/*
+ * Retrieve the contents of the attribute with the given name.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ *
+ * If 'integer_size' is smaller than the attribute's integer size, the
+ * call will fail and return EINVAL.
+ *
+ * If 'integer_size' is equal to or larger than the attribute's integer
+ * size, the call will succeed and return 0.  When converting to a
+ * larger integer size, the integers will be treated as unsigned (ie. no
+ * sign-extension will be performed).
+ *
+ * 'num_integers' is the length (in integers) of 'buf'.
+ *
+ * If the attribute is longer than the buffer, as many integers as will
+ * fit will be transferred to 'buf'.  If the entire attribute was not
+ * transferred, the call will return EOVERFLOW.
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
+ * Create an attribute with the given name and value.
+ *
+ * If an attribute with the given name already exists, the call will
+ * fail and return EEXIST.
+ */
+int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
+
+/*
+ * Set the attribute with the given name to the given value.  If an
+ * attribute with the given name does not exist, it will be created.  If
+ * an attribute with the given name already exists, the previous value
+ * will be overwritten.  The integer_size may be different from the
+ * existing attribute's integer size, in which case the attribute's
+ * integer size will be updated to the new value.
+ */
+int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+
+/*
+ * Get the length (in integers) and the integer size of the specified
+ * attribute.
+ *
+ * If the requested attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers);
+
+/*
+ * Remove the specified attribute.
+ *
+ * If the specified attribute does not exist, the call will fail and
+ * return ENOENT.
+ */
+int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Returns (in *count) the number of attributes in the specified zap
+ * object.
+ */
+int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
+
+
+/*
+ * Returns (in name) the name of the entry whose value
+ * (za_first_integer) is value, or ENOENT if not found.  The string
+ * pointed to by name must be at least 256 bytes long.
+ */
+int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
+
+typedef struct zap_cursor {
+	/* This structure is opaque! */
+	objset_t *zc_objset;
+	uint64_t zc_zapobj;
+	uint64_t zc_hash;
+	uint32_t zc_cd;
+} zap_cursor_t;
+
+typedef struct {
+	int za_integer_length;
+	uint64_t za_num_integers;
+	uint64_t za_first_integer;	/* no sign extension for <8byte ints */
+	char za_name[MAXNAMELEN];
+} zap_attribute_t;
+
+/*
+ * The interface for listing all the attributes of a zapobj can be
+ * thought of as a cursor moving down a list of the attributes one by
+ * one.  The cookie returned by the zap_cursor_serialize routine is
+ * persistent across system calls (and across reboot, even).
+ */
+
+/*
+ * Initialize a zap cursor, pointing to the "first" attribute of the
+ * zapobj.
+ */
+void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+
+/*
+ * Get the attribute currently pointed to by the cursor.  Returns
+ * ENOENT if at the end of the attributes.
+ */
+int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za);
+
+/*
+ * Advance the cursor to the next attribute.
+ */
+void zap_cursor_advance(zap_cursor_t *zc);
+
+/*
+ * Get a persistent cookie pointing to the current position of the zap
+ * cursor.  The low 4 bits in the cookie are always zero, and thus can
+ * be used to differentiate a serialized cookie from a different type
+ * of value.  The cookie will be less than 2^32 as long as there are
+ * fewer than 2^22 (4.2 million) entries in the zap object.
+ */
+uint64_t zap_cursor_serialize(zap_cursor_t *zc);
+
+/*
+ * Initialize a zap cursor pointing to the position recorded by
+ * zap_cursor_serialize (in the "serialized" argument).  You can also
+ * use a "serialized" argument of 0 to start at the beginning of the
+ * zapobj (ie.  zap_cursor_init_serialized(..., 0) is equivalent to
+ * zap_cursor_init(...).)
+ */
+void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds,
+    uint64_t zapobj, uint64_t serialized);
+
+
+#define	ZAP_HISTOGRAM_SIZE 10
+
+typedef struct zap_stats {
+	/*
+	 * Size of the pointer table (in number of entries).
+	 * This is always a power of 2, or zero if it's a microzap.
+	 * In general, it should be considerably greater than zs_num_leafs.
+	 */
+	uint64_t zs_ptrtbl_len;
+
+	uint64_t zs_blocksize;		/* size of zap blocks */
+
+	uint64_t zs_num_leafs;		/* The number of leaf blocks */
+
+	uint64_t zs_num_entries;	/* The number of zap entries */
+
+	/*
+	 * The number of blocks used.  Note that some blocks may be
+	 * wasted because old ptrtbl's and large name/value blocks are
+	 * not reused.  (Although their space is reclaimed, we don't
+	 * reuse those offsets in the object.)
+	 */
+	uint64_t zs_num_blocks;
+
+	/* The number of blocks used for large names or values */
+	uint64_t zs_num_blocks_large;
+
+	/*
+	 * Histograms.  For all histograms, the last index
+	 * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater
+	 * than what can be represented.  For example
+	 * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number
+	 * of leafs with 45 or more entries.
+	 */
+
+	/*
+	 * zs_leafs_with_2n_pointers[n] is the number of leafs with
+	 * 2^n pointers to it.
+	 */
+	uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_leafs_with_n_chained[n] is the number of leafs with n
+	 * chained blocks.  zs_leafs_with_n_chained[0] (leafs with no
+	 * chained blocks) should be very close to zs_num_leafs.
+	 */
+	uint64_t zs_leafs_with_n_chained[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_blocks_with_n5_entries[n] is the number of leafs with
+	 * [n*5, (n+1)*5) entries.  In the current implementation, there
+	 * can be at most 55 entries in any block, but there may be
+	 * fewer if the name or value is large, or the block is not
+	 * completely full.
+	 */
+	uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_blocks_n_tenths_full[n] is the number of leafs whose
+	 * fullness is in the range [n/10, (n+1)/10).
+	 */
+	uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_entries_using_n_chunks[n] is the number of entries which
+	 * consume n 24-byte chunks.  (Note, large names/values only use
+	 * one chunk, but contribute to zs_num_blocks_large.)
+	 */
+	uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE];
+
+	/*
+	 * zs_buckets_with_n_entries[n] is the number of buckets (each
+	 * leaf has 64 buckets) with n entries.
+	 * zs_buckets_with_n_entries[1] should be very close to
+	 * zs_num_entries.
+	 */
+	uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE];
+} zap_stats_t;
+
+/*
+ * Get statistics about a ZAP object.  Note: you need to be aware of the
+ * internal implementation of the ZAP to correctly interpret some of the
+ * statistics.  This interface shouldn't be relied on unless you really
+ * know what you're doing.
+ */
+int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
new file mode 100644
index 000000000000..6593e20a1487
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -0,0 +1,190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZAP_IMPL_H
+#define	_SYS_ZAP_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ZAP_MAGIC 0x2F52AB2AB
+
+#define	ZAP_BLOCK_SHIFT		17
+
+#define	ZAP_MAXCD		((uint32_t)(-1))	/* UINT32_MAX */
+#define	ZAP_HASHBITS		28
+#define	MZAP_ENT_LEN		64
+#define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
+#define	MZAP_MAX_BLKSHIFT	ZAP_BLOCK_SHIFT
+#define	MZAP_MAX_BLKSZ		(1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+	uint64_t mze_value;
+	uint32_t mze_cd;
+	uint16_t mze_pad;	/* in case we want to chain them someday */
+	char mze_name[MZAP_NAME_LEN];	/* rest of MZAP_ENT_LEN bytes */
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+	uint64_t mz_block_type;	/* ZBT_MICRO */
+	uint64_t mz_salt;
+	uint64_t mz_pad[6];
+	mzap_ent_phys_t mz_chunk[1];
+	/* actually variable size depending on block size */
+} mzap_phys_t;
+
+typedef struct mzap_ent {
+	avl_node_t mze_node;
+	int mze_chunkid;
+	uint64_t mze_hash;
+	mzap_ent_phys_t mze_phys;
+} mzap_ent_t;
+
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<ZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ * 	[zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ * 	[zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+struct dmu_buf;
+struct zap_leaf;
+
+#define	ZBT_LEAF		((1ULL << 63) + 0)
+#define	ZBT_HEADER		((1ULL << 63) + 1)
+#define	ZBT_MICRO		((1ULL << 63) + 3)
+/* any other values are ptrtbl blocks */
+
+/* 1/2 the block size */
+#define	ZAP_PTRTBL_MIN_SHIFT (ZAP_BLOCK_SHIFT - 3 - 1)
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+	uint64_t zap_block_type;	/* ZBT_HEADER */
+	uint64_t zap_magic;		/* ZAP_MAGIC */
+
+	struct zap_table_phys {
+		uint64_t zt_blk;	/* starting block number */
+		uint64_t zt_numblks;	/* number of blocks */
+		uint64_t zt_shift;	/* bits to index it */
+		uint64_t zt_nextblk;	/* next (larger) copy start block */
+		uint64_t zt_blks_copied; /* number source blocks copied */
+	} zap_ptrtbl;
+
+	uint64_t zap_freeblk;		/* the next free block */
+	uint64_t zap_num_leafs;		/* number of leafs */
+	uint64_t zap_num_entries;	/* number of entries */
+	uint64_t zap_salt;		/* salt to stir into hash function */
+	uint64_t zap_pad[8181];	/* pads the fields above to 8192 words */
+	uint64_t zap_leafs[1 << ZAP_PTRTBL_MIN_SHIFT];	/* 8192 words */
+} zap_phys_t;
+
+typedef struct zap_table_phys zap_table_phys_t;
+
+typedef struct zap {
+	objset_t *zap_objset;
+	uint64_t zap_object;
+	struct dmu_buf *zap_dbuf;
+	krwlock_t zap_rwlock;
+	int zap_ismicro;
+	uint64_t zap_salt;
+	union {
+		struct {
+			zap_phys_t *zap_phys;
+
+			/*
+			 * zap_num_entries_mtx protects
+			 * zap_num_entries
+			 */
+			kmutex_t zap_num_entries_mtx;
+		} zap_fat;
+		struct {
+			mzap_phys_t *zap_phys;
+			int16_t zap_num_entries;
+			int16_t zap_num_chunks;
+			int16_t zap_alloc_next;
+			avl_tree_t zap_avl;
+		} zap_micro;
+	} zap_u;
+} zap_t;
+
+#define	zap_f	zap_u.zap_fat
+#define	zap_m	zap_u.zap_micro
+
+uint64_t zap_hash(zap_t *zap, const char *name);
+int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, int fatreader, zap_t **zapp);
+void zap_unlockdir(zap_t *zap);
+void zap_pageout(dmu_buf_t *db, void *vmzap);
+
+void zap_print(zap_t *);
+struct zap_leaf *zap_create_leaf(zap_t *zd, dmu_tx_t *tx);
+void zap_destroy_leaf(zap_t *zap, struct zap_leaf *l, dmu_tx_t *tx);
+uint64_t zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx);
+
+#define	ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+
+void fzap_byteswap(void *buf, size_t size);
+int fzap_count(zap_t *zap, uint64_t *count);
+int fzap_lookup(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+int fzap_add(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx);
+int fzap_update(zap_t *zap, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int fzap_length(zap_t *zap, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers);
+int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
+int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
+void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+
+int fzap_add_cd(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
new file mode 100644
index 000000000000..aee70ae633cd
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -0,0 +1,204 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZAP_LEAF_H
+#define	_SYS_ZAP_LEAF_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct zap;
+
+#define	ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* chunk size = 24 bytes */
+
+#define	ZAP_LEAF_NUMCHUNKS 5118
+#define	ZAP_LEAF_ARRAY_BYTES 21
+#define	ZAP_LEAF_HASH_SHIFT 12
+#define	ZAP_LEAF_HASH_NUMENTRIES (1 << ZAP_LEAF_HASH_SHIFT)
+#define	ZAP_LLA_DATA_BYTES ((1 << ZAP_BLOCK_SHIFT) - 16)
+
+typedef enum zap_entry_type {
+	ZAP_LEAF_FREE = 253,
+	ZAP_LEAF_ENTRY = 252,
+	ZAP_LEAF_ARRAY = 251,
+	ZAP_LEAF_TYPE_MAX = 250
+} zap_entry_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+	struct zap_leaf_header {
+		uint64_t lhr_block_type;	/* ZBT_LEAF */
+		uint64_t lhr_next;		/* next block in leaf chain */
+		uint64_t lhr_prefix;
+		uint32_t lhr_magic;		/* ZAP_LEAF_MAGIC */
+		uint16_t lhr_nfree;		/* number free chunks */
+		uint16_t lhr_nentries;		/* number of entries */
+		uint16_t lhr_prefix_len;
+
+#define	lh_block_type 	l_phys->l_hdr.lhr_block_type
+#define	lh_magic 	l_phys->l_hdr.lhr_magic
+#define	lh_next 	l_phys->l_hdr.lhr_next
+#define	lh_prefix 	l_phys->l_hdr.lhr_prefix
+#define	lh_nfree 	l_phys->l_hdr.lhr_nfree
+#define	lh_prefix_len 	l_phys->l_hdr.lhr_prefix_len
+#define	lh_nentries 	l_phys->l_hdr.lhr_nentries
+
+/* above is accessible to zap, below is zap_leaf private */
+
+		uint16_t lh_freelist;		/* chunk head of free list */
+		uint8_t lh_pad2[12];
+	} l_hdr; /* 2 24-byte chunks */
+
+	uint16_t l_hash[ZAP_LEAF_HASH_NUMENTRIES];
+	/* 8192 bytes: 341 24-byte chunks plus 8 bytes leftover space */
+
+	union zap_leaf_chunk {
+		struct zap_leaf_entry {
+			uint8_t le_type; 	/* always ZAP_LEAF_ENTRY */
+			uint8_t le_int_size;	/* size of ints */
+			uint16_t le_next;	/* next entry in hash chain */
+			uint16_t le_name_chunk;	/* first chunk of the name */
+			uint16_t le_name_length; /* bytes in name, incl null */
+			uint16_t le_value_chunk; /* first chunk of the value */
+			uint16_t le_value_length; /* value length in ints */
+			uint32_t le_cd;		/* collision differentiator */
+			uint64_t le_hash;	/* hash value of the name */
+		} l_entry;
+		struct zap_leaf_array {
+			uint8_t la_type;
+			uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+			uint16_t la_next;	/* next blk or CHAIN_END */
+		} l_array;
+		struct zap_leaf_free {
+			uint8_t lf_type;	/* always ZAP_LEAF_FREE */
+			uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+			uint16_t lf_next;  /* next in free list, or CHAIN_END */
+		} l_free;
+	} l_chunk[ZAP_LEAF_NUMCHUNKS];
+} zap_leaf_phys_t;
+
+typedef struct zap_leaf {
+	krwlock_t l_rwlock; 		/* only used on head of chain */
+	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
+	struct zap_leaf *l_next;	/* next in chain */
+	dmu_buf_t *l_dbuf;
+	zap_leaf_phys_t *l_phys;
+} zap_leaf_t;
+
+
+typedef struct zap_entry_handle {
+	/* below is set by zap_leaf.c and is public to zap.c */
+	uint64_t zeh_num_integers;
+	uint64_t zeh_hash;
+	uint32_t zeh_cd;
+	uint8_t zeh_integer_size;
+
+	/* below is private to zap_leaf.c */
+	uint16_t zeh_fakechunk;
+	uint16_t *zeh_chunkp;
+	zap_leaf_t *zeh_head_leaf;
+	zap_leaf_t *zeh_found_leaf;
+} zap_entry_handle_t;
+
+/*
+ * Return a handle to the named entry, or ENOENT if not found.  The hash
+ * value must equal zap_hash(name).
+ */
+extern int zap_leaf_lookup(zap_leaf_t *l,
+	const char *name, uint64_t h, zap_entry_handle_t *zeh);
+
+/*
+ * Return a handle to the entry with this hash+cd, or the entry with the
+ * next closest hash+cd.
+ */
+extern int zap_leaf_lookup_closest(zap_leaf_t *l,
+    uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh);
+
+/*
+ * Read the first num_integers in the attribute.  Integer size
+ * conversion will be done without sign extension.  Return EINVAL if
+ * integer_size is too small.  Return EOVERFLOW if there are more than
+ * num_integers in the attribute.
+ */
+extern int zap_entry_read(const zap_entry_handle_t *zeh,
+	uint8_t integer_size, uint64_t num_integers, void *buf);
+
+extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
+	uint16_t buflen, char *buf);
+
+/*
+ * Replace the value of an existing entry.
+ *
+ * zap_entry_update may fail if it runs out of space (ENOSPC).
+ */
+extern int zap_entry_update(zap_entry_handle_t *zeh,
+	uint8_t integer_size, uint64_t num_integers, const void *buf);
+
+/*
+ * Remove an entry.
+ */
+extern void zap_entry_remove(zap_entry_handle_t *zeh);
+
+/*
+ * Create an entry. An equal entry must not exist, and this entry must
+ * belong in this leaf (according to its hash value).  Fills in the
+ * entry handle on success.  Returns 0 on success or ENOSPC on failure.
+ */
+extern int zap_entry_create(zap_leaf_t *l,
+	const char *name, uint64_t h, uint32_t cd,
+	uint8_t integer_size, uint64_t num_integers, const void *buf,
+	zap_entry_handle_t *zeh);
+
+/*
+ * Other stuff.
+ */
+
+extern void zap_leaf_init(zap_leaf_t *l);
+extern void zap_leaf_byteswap(zap_leaf_phys_t *buf);
+
+extern zap_leaf_t *zap_leaf_split(struct zap *zap, zap_leaf_t *l, dmu_tx_t *tx);
+
+extern int zap_leaf_merge(zap_leaf_t *l, zap_leaf_t *sibling);
+
+extern zap_leaf_t *zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl);
+
+extern int zap_leaf_advance(zap_leaf_t *l, zap_cursor_t *zc);
+
+extern void zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZAP_LEAF_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
new file mode 100644
index 000000000000..0050316eb59e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_FS_ZFS_ACL_H
+#define	_SYS_FS_ZFS_ACL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#endif
+#include <sys/acl.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct znode_phys;
+
+#define	ACCESS_UNDETERMINED	-1
+
+#define	ACE_SLOT_CNT	6
+
+typedef struct zfs_znode_acl {
+	uint64_t	z_acl_extern_obj;	  /* ext acl pieces */
+	uint32_t	z_acl_count;		  /* Number of ACEs */
+	uint16_t	z_acl_version;		  /* acl version */
+	uint16_t	z_acl_state;		  /* goop */
+	ace_t		z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_t;
+
+#define	ACL_DATA_ALLOCED	0x1
+
+/*
+ * Max ACL size is prepended deny for all entries + the
+ * canonical six tacked onto the end.
+ */
+#define	MAX_ACL_SIZE	(MAX_ACL_ENTRIES * 2 + 6)
+
+typedef struct zfs_acl {
+	int		z_slots;	/* number of allocated slots for ACEs */
+	int		z_acl_count;
+	uint_t		z_state;
+	ace_t		*z_acl;
+} zfs_acl_t;
+
+#define	ZFS_ACL_SIZE(aclcnt)	(sizeof (ace_t) * (aclcnt))
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask and passthrough,
+ * whereas acl_inherit has secure instead of groupmask.
+ */
+
+#define	DISCARD		0
+#define	NOALLOW		1
+#define	GROUPMASK	2
+#define	PASSTHROUGH	3
+#define	SECURE		4
+
+struct znode;
+
+#ifdef _KERNEL
+void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *,
+    dmu_tx_t *, cred_t *);
+int zfs_getacl(struct znode *, vsecattr_t *, cred_t *);
+int zfs_mode_update(struct znode *, uint64_t, dmu_tx_t  *);
+int zfs_setacl(struct znode *, vsecattr_t *, cred_t *);
+void zfs_acl_rele(void *);
+void zfs_ace_byteswap(ace_t *, int);
+extern int zfs_zaccess(struct znode *, int, cred_t *);
+extern int zfs_zaccess_rwx(struct znode *, mode_t, cred_t *);
+extern int zfs_acl_access(struct znode *, int, cred_t *);
+int zfs_acl_chmod_setattr(struct znode *, uint64_t, dmu_tx_t *);
+int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
+int zfs_zaccess_rename(struct znode *, struct znode *,
+    struct znode *, struct znode *, cred_t *cr);
+int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
+void zfs_acl_free(zfs_acl_t *);
+zfs_acl_t *zfs_acl_node_read(struct znode *);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+#endif	/* _SYS_FS_ZFS_ACL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
new file mode 100644
index 000000000000..2f0e3e792d36
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define	_SYS_ZFS_CONTEXT_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/zfs_debug.h>
+
+#define	CPU_SEQID	(CPU->cpu_seqid)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..78d82ccbe251
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZFS_CTLDIR_H
+#define	_ZFS_CTLDIR_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ZFS_CTLDIR_NAME		".zfs"
+
+#define	zfs_has_ctldir(zdp)	\
+	((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \
+	((zdp)->z_zfsvfs->z_ctldir != NULL))
+#define	zfs_show_ctldir(zdp)	\
+	(zfs_has_ctldir(zdp) && \
+	((zdp)->z_zfsvfs->z_show_ctldir))
+
+void zfsctl_create(zfsvfs_t *);
+void zfsctl_destroy(zfsvfs_t *);
+vnode_t *zfsctl_root(znode_t *);
+void zfsctl_init(void);
+void zfsctl_fini(void);
+
+int zfsctl_rename_snapshot(const char *from, const char *to);
+int zfsctl_destroy_snapshot(const char *snapname, int force);
+int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
+
+int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr);
+
+int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen,
+    fid_t *fidp);
+int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
+
+#define	ZFSCTL_INO_ROOT		0x1
+#define	ZFSCTL_INO_SNAPDIR	0x2
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_CTLDIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_debug.h b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
new file mode 100644
index 000000000000..07eb3d2da8c7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_debug.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_DEBUG_H
+#define	_SYS_ZFS_DEBUG_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifndef TRUE
+#define	TRUE 1
+#endif
+
+#ifndef FALSE
+#define	FALSE 0
+#endif
+
+/*
+ * ZFS debugging
+ */
+
+#if defined(DEBUG) || !defined(_KERNEL)
+#define	ZFS_DEBUG
+#endif
+
+extern int zfs_flags;
+
+#define	ZFS_DEBUG_DPRINTF	0x0001
+#define	ZFS_DEBUG_DBUF_VERIFY	0x0002
+#define	ZFS_DEBUG_DNODE_VERIFY	0x0004
+#define	ZFS_DEBUG_SNAPNAMES	0x0008
+
+#ifdef ZFS_DEBUG
+extern void __dprintf(const char *file, const char *func,
+    int line, const char *fmt, ...);
+/* Expression form, so "if (x) dprintf(...); else ..." parses as written. */
+#define	dprintf(...) ((zfs_flags & ZFS_DEBUG_DPRINTF) ? \
+	__dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) : (void)0)
+#else
+#define	dprintf(...) ((void)0)
+#endif /* ZFS_DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_DEBUG_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_dir.h b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
new file mode 100644
index 000000000000..8ab760f61873
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_dir.h
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_FS_ZFS_DIR_H
+#define	_SYS_FS_ZFS_DIR_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dmu.h>
+#include <sys/zfs_znode.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* zfs_dirent_lock() flags */
+#define	ZNEW		0x0001		/* entry should not exist */
+#define	ZEXISTS		0x0002		/* entry should exist */
+#define	ZSHARED		0x0004		/* shared access (zfs_dirlook()) */
+#define	ZXATTR		0x0008		/* we want the xattr dir */
+#define	ZRENAMING	0x0010		/* znode is being renamed */
+
+/* mknode flags */
+#define	IS_ROOT_NODE	0x01		/* create a root node */
+#define	IS_XATTR	0x02		/* create an extended attribute node */
+#define	IS_REPLAY	0x04		/* we are replaying intent log */
+
+extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
+    int);
+extern void zfs_dirent_unlock(zfs_dirlock_t *);
+extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, int *);
+extern int zfs_dirlook(znode_t *, char *, vnode_t **);
+extern void zfs_mknode(znode_t *, vattr_t *, uint64_t *,
+    dmu_tx_t *, cred_t *, uint_t, znode_t **, int);
+extern void zfs_rmnode(znode_t *);
+extern boolean_t zfs_dirempty(znode_t *);
+extern void zfs_dq_add(znode_t *, dmu_tx_t *);
+extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
+extern int zfs_get_xattrdir(znode_t *, vnode_t **, cred_t *);
+extern int zfs_make_xattrdir(znode_t *, vattr_t *, vnode_t **, cred_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_DIR_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
new file mode 100644
index 000000000000..cbe8bbc5cb77
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -0,0 +1,187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZFS_IOCTL_H
+#define	_SYS_ZFS_IOCTL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/cred.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ZFS_DRIVER_NAME "zfs"
+#define	ZFS_DS_TYPE	"zfs"
+
+/*
+ * Property values for snapdir
+ */
+#define	HIDDEN		0
+#define	VISIBLE		1
+
+typedef struct zfs_stats {
+	uint64_t	zs_atime;
+	uint64_t	zs_recordsize;
+	uint64_t	zs_readonly;
+	uint64_t	zs_devices;
+	uint64_t	zs_exec;
+	uint64_t	zs_setuid;
+	uint64_t	zs_snapdir;
+	uint64_t	zs_acl_mode;
+	uint64_t	zs_acl_inherit;
+	char		zs_mountpoint[MAXNAMELEN];
+	char		zs_atime_setpoint[MAXNAMELEN];
+	char		zs_recordsize_setpoint[MAXNAMELEN];
+	char		zs_readonly_setpoint[MAXNAMELEN];
+	char		zs_devices_setpoint[MAXNAMELEN];
+	char		zs_setuid_setpoint[MAXNAMELEN];
+	char		zs_exec_setpoint[MAXNAMELEN];
+	char		zs_mountpoint_setpoint[MAXNAMELEN];
+	char		zs_sharenfs[MAXNAMELEN];
+	char		zs_sharenfs_setpoint[MAXNAMELEN];
+	char		zs_snapdir_setpoint[MAXNAMELEN];
+	char		zs_acl_mode_setpoint[MAXNAMELEN];
+	char		zs_acl_inherit_setpoint[MAXNAMELEN];
+} zfs_stats_t;
+
+#define	DMU_BACKUP_VERSION (1ULL)
+#define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
+
+/*
+ * DMU replay record, followed by the zfs ioctl command structure (zfs_cmd_t)
+ */
+typedef struct dmu_replay_record {
+	enum {
+		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+		DRR_WRITE, DRR_FREE, DRR_END,
+	} drr_type;
+	uint32_t drr_pad;
+	union {
+		struct drr_begin {
+			uint64_t drr_magic;
+			uint64_t drr_version;
+			uint64_t drr_creation_time;
+			dmu_objset_type_t drr_type;
+			uint32_t drr_pad;
+			uint64_t drr_toguid;
+			uint64_t drr_fromguid;
+			char drr_toname[MAXNAMELEN];
+		} drr_begin;
+		struct drr_end {
+			uint64_t drr_checksum;
+		} drr_end;
+		struct drr_object {
+			uint64_t drr_object;
+			dmu_object_type_t drr_type;
+			dmu_object_type_t drr_bonustype;
+			uint32_t drr_blksz;
+			uint32_t drr_bonuslen;
+			uint8_t drr_checksum;
+			uint8_t drr_compress;
+			uint8_t drr_pad[6];
+		} drr_object;
+		struct drr_freeobjects {
+			uint64_t drr_firstobj;
+			uint64_t drr_numobjs;
+		} drr_freeobjects;
+		struct drr_write {
+			uint64_t drr_object;
+			dmu_object_type_t drr_type;
+			uint32_t drr_pad;
+			uint64_t drr_offset;
+			uint64_t drr_length;
+		} drr_write;
+		struct drr_free {
+			uint64_t drr_object;
+			uint64_t drr_offset;
+			uint64_t drr_length;
+		} drr_free;
+	} drr_u;
+} dmu_replay_record_t;
+
+typedef struct zfs_cmd {
+	char		zc_name[MAXNAMELEN];
+	char		zc_prop_name[MAXNAMELEN];
+	char		zc_prop_value[MAXPATHLEN];
+	char		zc_root[MAXPATHLEN];
+	char		zc_filename[MAXPATHLEN];
+	uint32_t	zc_intsz;
+	uint32_t	zc_numints;
+	uint64_t	zc_pool_guid;
+	uint64_t	zc_config_src;	/* really (char *) */
+	uint64_t	zc_config_src_size;
+	uint64_t	zc_config_dst;	/* really (char *) */
+	uint64_t	zc_config_dst_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_cred;
+	uint64_t	zc_dev;
+	uint64_t	zc_volsize;
+	uint64_t	zc_volblocksize;
+	uint64_t	zc_objset_type;
+	zfs_stats_t	zc_zfs_stats;
+	dmu_object_info_t zc_object_info;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+} zfs_cmd_t;
+
+#ifdef _KERNEL
+
+extern dev_info_t *zfs_dip;
+
+extern int zfs_secpolicy_write(const char *dataset, const char *, cred_t *cr);
+extern int zfs_busy(void);
+
+extern int zvol_check_volsize(zfs_cmd_t *zc);
+extern int zvol_check_volblocksize(zfs_cmd_t *zc);
+extern int zvol_get_stats(zfs_cmd_t *zc, objset_t *os);
+extern void zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx);
+extern int zvol_create_minor(zfs_cmd_t *zc);
+extern int zvol_remove_minor(zfs_cmd_t *zc);
+extern int zvol_set_volsize(zfs_cmd_t *zc);
+extern int zvol_set_volblocksize(zfs_cmd_t *zc);
+extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+extern int zvol_strategy(buf_t *bp);
+extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
+extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
+    int *rvalp);
+extern int zvol_busy(void);
+extern void zvol_init(void);
+extern void zvol_fini(void);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_IOCTL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
new file mode 100644
index 000000000000..cd0700f6413d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_FS_ZFS_VFSOPS_H
+#define	_SYS_FS_ZFS_VFSOPS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/vfs.h>
+#include <sys/zil.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_delete_list {
+	kmutex_t		z_mutex;
+	kcondvar_t		z_cv;
+	kcondvar_t		z_quiesce_cv;
+	uint8_t			z_drained;
+	uint8_t			z_draining;
+	uint32_t		z_thread_target;
+	uint32_t		z_thread_count;
+	uint64_t		z_znode_count;
+	list_t			z_znodes;
+} zfs_delete_t;
+
+typedef struct zfsvfs zfsvfs_t;
+
+struct zfsvfs {
+	vfs_t		*z_vfs;		/* generic fs struct */
+	zfsvfs_t	*z_parent;	/* parent fs */
+	objset_t	*z_os;		/* objset reference */
+	uint64_t	z_root;		/* id of root znode */
+	uint64_t	z_dqueue;	/* delete queue */
+	uint64_t	z_max_blksz;	/* maximum block size for files */
+	uint64_t	z_assign;	/* TXG_NOWAIT or set by zil_replay() */
+	zilog_t		*z_log;		/* intent log pointer */
+	uint_t		z_acl_mode;	/* acl chmod/mode behavior */
+	uint_t		z_acl_inherit;	/* acl inheritance behavior */
+	boolean_t	z_atime;	/* enable atimes mount option */
+	boolean_t	z_unmounted1;	/* unmounted phase 1 */
+	boolean_t	z_unmounted2;	/* unmounted phase 2 */
+	uint32_t	z_op_cnt;	/* vnode/vfs operations ref count */
+	krwlock_t	z_um_lock;	/* rw lock for umount phase 2 */
+	zfs_delete_t 	z_delete_head;	/* zfs delete list */
+	list_t		z_all_znodes;	/* all znodes in the fs */
+	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
+	vnode_t		*z_ctldir;	/* .zfs directory pointer */
+	boolean_t	z_show_ctldir;	/* expose .zfs in the root dir */
+	boolean_t	z_issnap;	/* true if this is a snapshot */
+#define	ZFS_OBJ_MTX_SZ	64
+	kmutex_t	z_hold_mtx[ZFS_OBJ_MTX_SZ];	/* znode hold locks */
+};
+
+/*
+ * The total file ID size is limited to 12 bytes (including the length
+ * field) in the NFSv2 protocol.  For historical reasons, this same limit
+ * is currently being imposed by the Solaris NFSv3 implementation...
+ * although the protocol actually permits a maximum of 64 bytes.  It will
+ * not be possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2 and making some changes to the Solaris NFSv3 implementation.
+ *
+ * For the time being, we will partition up the available space as follows:
+ *	2 bytes		fid length (required)
+ *	6 bytes		object number (48 bits)
+ *	4 bytes		generation number (32 bits)
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+	uint16_t	zf_len;
+	uint8_t		zf_object[6];		/* obj[i] = obj >> (8 * i) */
+	uint8_t		zf_gen[4];		/* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
+
+typedef struct zfid_long {
+	zfid_short_t	z_fid;
+	uint8_t		zf_setid[6];		/* setid[i] = setid >> (8 * i) */
+	uint8_t		zf_setgen[4];		/* setgen[i] = setgen >> (8 * i) */
+} zfid_long_t;
+
+#define	SHORT_FID_LEN	(sizeof (zfid_short_t) - sizeof (uint16_t))
+#define	LONG_FID_LEN	(sizeof (zfid_long_t) - sizeof (uint16_t))
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
new file mode 100644
index 000000000000..d3f28df4cdf7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -0,0 +1,283 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_FS_ZFS_ZNODE_H
+#define	_SYS_FS_ZFS_ZNODE_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Define special zfs pflags
+ */
+#define	ZFS_XATTR	0x1		/* is an extended attribute */
+#define	ZFS_INHERIT_ACE	0x2		/* ace has inheritable ACEs */
+
+#define	MASTER_NODE_OBJ	1
+
+/*
+ * special attributes for master node.
+ */
+
+#define	ZFS_FSID		"FSID"
+#define	ZFS_DELETE_QUEUE	"DELETE_QUEUE"
+#define	ZFS_ROOT_OBJ		"ROOT"
+#define	ZFS_VERSION_OBJ		"VERSION"
+#define	ZFS_PROP_BLOCKPERPAGE	"BLOCKPERPAGE"
+#define	ZFS_PROP_NOGROWBLOCKS	"NOGROWBLOCKS"
+
+#define	ZFS_FLAG_BLOCKPERPAGE	0x1
+#define	ZFS_FLAG_NOGROWBLOCKS	0x2
+
+/*
+ * ZFS version - rev'd whenever an incompatible on-disk format change
+ * occurs.  Independent of SPA/DMU/ZAP versioning.
+ */
+
+#define	ZFS_VERSION		1ULL
+
+#define	ZFS_MAX_BLOCKSIZE	(SPA_MAXBLOCKSIZE)
+
+/* Path component length */
+/*
+ * The generic fs code uses MAXNAMELEN to represent
+ * what the largest component length is.  Unfortunately,
+ * this length includes the terminating NUL.  ZFS needs
+ * to tell the users via pathconf() and statvfs() what the
+ * true maximum length of a component is, excluding the NUL.
+ */
+#define	ZFS_MAXNAMELEN	(MAXNAMELEN - 1)
+
+/*
+ * This is the persistent portion of the znode.  It is stored
+ * in the "bonus buffer" of the file.  Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+	uint64_t zp_atime[2];		/*  0 - last file access time */
+	uint64_t zp_mtime[2];		/* 16 - last file modification time */
+	uint64_t zp_ctime[2];		/* 32 - last file change time */
+	uint64_t zp_crtime[2];		/* 48 - creation time */
+	uint64_t zp_gen;		/* 64 - generation (txg of creation) */
+	uint64_t zp_mode;		/* 72 - file mode bits */
+	uint64_t zp_size;		/* 80 - size of file */
+	uint64_t zp_parent;		/* 88 - directory parent (`..') */
+	uint64_t zp_links;		/* 96 - number of links to file */
+	uint64_t zp_xattr;		/* 104 - DMU object for xattrs */
+	uint64_t zp_rdev;		/* 112 - dev_t for VBLK & VCHR files */
+	uint64_t zp_flags;		/* 120 - persistent flags */
+	uint64_t zp_uid;		/* 128 - file owner */
+	uint64_t zp_gid;		/* 136 - owning group */
+	uint64_t zp_pad[4];		/* 144 - future */
+	zfs_znode_acl_t zp_acl;		/* 176 - 263 ACL */
+	/*
+	 * Data may pad out any remaining bytes in the znode buffer, eg:
+	 *
+	 * |<---------------------- dnode_phys (512) ------------------------>|
+	 * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+	 *			|<---- znode (264) ---->|<---- data (56) ---->|
+	 *
+	 * At present, we only use this space to store symbolic links.
+	 */
+} znode_phys_t;
+
+/*
+ * Directory entry locks control access to directory entries.
+ * They are used to protect creates, deletes, and renames.
+ * Each directory znode has a mutex and a list of locked names.
+ */
+#ifdef _KERNEL
+typedef struct zfs_dirlock {
+	char		*dl_name;	/* directory entry being locked */
+	uint32_t	dl_sharecnt;	/* 0 if exclusive, > 0 if shared */
+	uint16_t	dl_namesize;	/* set if dl_name was allocated */
+	kcondvar_t	dl_cv;		/* wait for entry to be unlocked */
+	struct znode	*dl_dzp;	/* directory znode */
+	struct zfs_dirlock *dl_next;	/* next in z_dirlocks list */
+} zfs_dirlock_t;
+
+struct zcache_state;
+
+typedef struct znode {
+	struct zfsvfs	*z_zfsvfs;
+	vnode_t		*z_vnode;
+	list_node_t 	z_list_node;	/* deleted znodes */
+	uint64_t	z_id;		/* object ID for this znode */
+	kmutex_t	z_lock;		/* znode modification lock */
+	krwlock_t	z_map_lock;	/* page map lock */
+	krwlock_t	z_grow_lock;	/* grow block size lock */
+	krwlock_t	z_append_lock;	/* append-mode lock */
+	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
+	uint8_t		z_active;	/* znode is in use */
+	uint8_t		z_reap;		/* reap file at last reference */
+	uint8_t		z_atime_dirty;	/* atime needs to be synced */
+	uint8_t		z_dbuf_held;	/* Is z_dbuf already held? */
+	uint_t		z_mapcnt;	/* number of memory maps to file */
+	uint_t		z_blksz;	/* block size in bytes */
+	uint_t		z_seq;		/* modification sequence number */
+	uint64_t	z_last_itx;	/* last ZIL itx on this znode */
+	kmutex_t	z_acl_lock;	/* acl data lock */
+	list_node_t	z_link_node;	/* all znodes in fs link */
+	list_node_t	z_zcache_node;
+	struct zcache_state *z_zcache_state;
+	uint64_t	z_zcache_access;
+
+	/*
+	 * These are dmu managed fields.
+	 */
+	znode_phys_t	*z_phys;	/* pointer to persistent znode */
+	dmu_buf_t	*z_dbuf;	/* buffer containing the z_phys */
+} znode_t;
+
+/*
+ * The grow_lock is only applicable to "regular" files.
+ * The parent_lock is only applicable to directories.
+ */
+#define	z_parent_lock	z_grow_lock
+
+/*
+ * Convert between znode pointers and vnode pointers
+ */
+#define	ZTOV(ZP)	((ZP)->z_vnode)
+#define	VTOZ(VP)	((znode_t *)(VP)->v_data)
+
+/*
+ * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
+ * ZFS_EXIT() must be called before exiting the vop.
+ */
+#define	ZFS_ENTER(zfsvfs) \
+	{ \
+		atomic_add_32(&(zfsvfs)->z_op_cnt, 1); \
+		if ((zfsvfs)->z_unmounted1) { \
+			ZFS_EXIT(zfsvfs); \
+			return (EIO);	/* returns from the caller */ \
+		} \
+	}
+#define	ZFS_EXIT(zfsvfs) atomic_add_32(&(zfsvfs)->z_op_cnt, -1)
+
+/*
+ * Macros for dealing with dmu_buf_hold: hash an object into z_hold_mtx[]
+ */
+#define	ZFS_OBJ_HASH(obj_num)	((obj_num) & (ZFS_OBJ_MTX_SZ - 1))
+#define	ZFS_OBJ_MUTEX(zp)	\
+	(&(zp)->z_zfsvfs->z_hold_mtx[ZFS_OBJ_HASH((zp)->z_id)])
+#define	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \
+	mutex_enter(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+#define	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
+	mutex_exit(&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)])
+
+/*
+ * Macros to encode/decode ZFS stored time values from/to struct timespec
+ */
+#define	ZFS_TIME_ENCODE(tp, stmp)		\
+{						\
+	(stmp)[0] = (uint64_t)(tp)->tv_sec;	\
+	(stmp)[1] = (uint64_t)(tp)->tv_nsec;	\
+}
+
+#define	ZFS_TIME_DECODE(tp, stmp)		\
+{						\
+	(tp)->tv_sec = (time_t)(stmp)[0];	\
+	(tp)->tv_nsec = (long)(stmp)[1];	\
+}
+
+/*
+ * Timestamp defines (ZFS_ACCESSTIME_STAMP expands to a bare if statement)
+ */
+#define	ACCESSED		(AT_ATIME)
+#define	STATE_CHANGED		(AT_CTIME)
+#define	CONTENT_MODIFIED	(AT_MTIME | AT_CTIME)
+
+#define	ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
+	if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
+		zfs_time_stamper(zp, ACCESSED, NULL)
+
+extern int	zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern void	zfs_set_dataprop(objset_t *);
+extern void	zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx);
+extern void	zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
+extern void	zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern int	zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
+extern int	zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *,
+    cred_t *cr);
+extern void	zfs_znode_init(void);
+extern void	zfs_znode_fini(void);
+extern znode_t	*zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int);
+extern int	zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern void	zfs_zinactive(znode_t *);
+extern void	zfs_znode_delete(znode_t *, dmu_tx_t *);
+extern void	zfs_znode_free(znode_t *);
+extern int	zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads);
+extern void	zfs_delete_wait_empty(zfsvfs_t *zfsvfs);
+extern void	zfs_zcache_flush(zfsvfs_t *zfsvfs);
+extern void	zfs_remove_op_tables(void);
+extern int	zfs_create_op_tables(void);
+extern int	zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
+
+extern uint64_t zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *dzp, char *name);
+extern uint64_t zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *dzp, znode_t *zp, char *name);
+extern uint64_t zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *dzp, znode_t *zp, char *name, char *link);
+extern uint64_t zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
+extern uint64_t zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio);
+extern uint64_t zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *zp, uint64_t off, uint64_t len);
+extern uint64_t zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *zp, vattr_t *vap, uint_t mask_applied);
+extern uint64_t zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *zp, int aclcnt, ace_t *z_ace);
+
+extern zil_get_data_t zfs_get_data;
+extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
+extern int zfsfstype;
+
+#endif /* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_ZNODE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
new file mode 100644
index 000000000000..a03dcc6bc9e3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -0,0 +1,242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZIL_H
+#define	_SYS_ZIL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log.  The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset.  The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t).  The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log.  All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+	uint64_t zh_claim_txg;	/* txg in which log blocks were claimed */
+	uint64_t zh_replay_seq;	/* highest replayed sequence number */
+	blkptr_t zh_log;	/* log chain */
+	uint64_t zit_pad[6];	/* NOTE(review): zit_ prefix in a zh_ struct */
+} zil_header_t;
+
+/*
+ * Log block trailer - structure at the end of the header and each log block
+ *
+ * The zit_bt contains a zbt_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zbt_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t
+ */
+typedef struct zil_trailer {
+	uint64_t zit_pad;
+	blkptr_t zit_next_blk;	/* next block in chain */
+	uint64_t zit_nused;	/* bytes in log block used */
+	zio_block_tail_t zit_bt; /* block trailer */
+} zil_trailer_t;
+
+#define	ZIL_MIN_BLKSZ	4096
+#define	ZIL_MAX_BLKSZ	SPA_MAXBLOCKSIZE
+#define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))
+
+/*
+ * Intent log transaction types (numbered from 1) and record structures
+ */
+#define	TX_CREATE	1		/* Create file */
+#define	TX_MKDIR	2		/* Make directory */
+#define	TX_MKXATTR	3		/* Make XATTR directory */
+#define	TX_SYMLINK	4		/* Create symbolic link to a file */
+#define	TX_REMOVE	5		/* Remove file */
+#define	TX_RMDIR	6		/* Remove directory */
+#define	TX_LINK		7		/* Create hard link to a file */
+#define	TX_RENAME	8		/* Rename a file */
+#define	TX_WRITE	9		/* File write */
+#define	TX_TRUNCATE	10		/* Truncate a file */
+#define	TX_SETATTR	11		/* Set file attributes */
+#define	TX_ACL		12		/* Set acl */
+#define	TX_MAX_TYPE	13		/* One more than the highest type */
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ */
+typedef struct {			/* common log record header */
+	uint64_t	lrc_txtype;	/* intent log transaction type */
+	uint64_t	lrc_reclen;	/* transaction record length */
+	uint64_t	lrc_txg;	/* dmu transaction group number */
+	uint64_t	lrc_seq;	/* intent log sequence number */
+} lr_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_doid;	/* object id of directory */
+	uint64_t	lr_foid;	/* object id of created file object */
+	uint64_t	lr_mode;	/* mode of object */
+	uint64_t	lr_uid;		/* uid of object */
+	uint64_t	lr_gid;		/* gid of object */
+	uint64_t	lr_gen;		/* generation (txg of creation) */
+	uint64_t	lr_crtime[2];	/* creation time */
+	uint64_t	lr_rdev;	/* rdev of object to create */
+	/* name of object to create follows this */
+	/* for symlinks, link content follows name */
+} lr_create_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_doid;	/* obj id of directory */
+	/* name of object to remove follows this */
+} lr_remove_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_doid;	/* obj id of directory */
+	uint64_t	lr_link_obj;	/* obj id of link */
+	/* name of object to link follows this */
+} lr_link_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_sdoid;	/* obj id of source directory */
+	uint64_t	lr_tdoid;	/* obj id of target directory */
+	/* 2 strings: names of source and destination follow this */
+} lr_rename_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* file object to write */
+	uint64_t	lr_offset;	/* offset to write to */
+	uint64_t	lr_length;	/* user data length to write */
+	uint64_t	lr_blkoff;	/* offset represented by lr_blkptr */
+	blkptr_t	lr_blkptr;	/* spa block pointer for replay */
+	/* write data will follow for small writes */
+} lr_write_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* object id of file to truncate */
+	uint64_t	lr_offset;	/* offset to truncate from */
+	uint64_t	lr_length;	/* length to truncate */
+} lr_truncate_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* file object to change attributes */
+	uint64_t	lr_mask;	/* mask of attributes to set */
+	uint64_t	lr_mode;	/* mode to set */
+	uint64_t	lr_uid;		/* uid to set */
+	uint64_t	lr_gid;		/* gid to set */
+	uint64_t	lr_size;	/* size to set */
+	uint64_t	lr_atime[2];	/* access time */
+	uint64_t	lr_mtime[2];	/* modification time */
+} lr_setattr_t;
+
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* obj id of file */
+	uint64_t	lr_aclcnt;	/* number of acl entries */
+	/* lr_aclcnt number of ace_t entries follow this */
+} lr_acl_t;
+
+/*
+ * ZIL structure definitions, interface function prototype and globals.
+ */
+
+/*
+ * ZFS intent log transaction structure
+ */
+typedef struct itx {
+	list_node_t	itx_node;	/* linkage on zl_itx_list */
+	void		*itx_private;	/* type-specific opaque data */
+	uint8_t		itx_data_copied; /* TX_WRITE only: write data already */
+					/* copied into itx data buffer */
+	lr_t		itx_lr;		/* common part of log record */
+	/* followed by type-specific part of lr_xx_t and its immediate data */
+} itx_t;
+
+typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+    uint64_t txg);
+typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+    uint64_t txg);
+typedef int zil_replay_func_t();
+typedef int zil_get_data_t(void *arg, lr_write_t *lr);
+
+extern void	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
+
+extern void	zil_init(void);
+extern void	zil_fini(void);
+
+extern zilog_t	*zil_alloc(objset_t *os, zil_header_t *zh_phys);
+extern void	zil_free(zilog_t *zilog);
+
+extern zilog_t	*zil_open(objset_t *os, zil_get_data_t *get_data);
+extern void	zil_close(zilog_t *zilog);
+
+extern void	zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+    zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_wait)(void *));
+extern void	zil_destroy(zilog_t *zilog);
+
+extern itx_t	*zil_itx_create(int txtype, size_t lrsize);
+extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+
+extern void	zil_commit(zilog_t *zilog, uint64_t seq, int ioflag);
+
+extern void	zil_claim(char *osname, void *txarg);
+extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
+extern void	zil_clean(zilog_t *zilog);
+
+extern int	zil_suspend(zilog_t *zilog);
+extern void	zil_resume(zilog_t *zilog);
+
+extern int zil_disable;
+extern int zil_always;
+extern int zil_purge;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZIL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zil_impl.h b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
new file mode 100644
index 000000000000..6286fc5aa36f
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZIL_IMPL_H
+#define	_SYS_ZIL_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zil.h>
+#include <sys/dmu_objset.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum lwb_state_type {
+	UNWRITTEN,	/* buffer yet to be written */
+	SEQ_INCOMPLETE,	/* buffer written, but there's an unwritten buffer in */
+			/* the sequence before this */
+	SEQ_COMPLETE	/* no unwritten buffers before this */
+} lwb_state_t;
+
+/*
+ * Log write buffer.
+ */
+typedef struct lwb {
+	zilog_t		*lwb_zilog;	/* back pointer to log struct */
+	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	int		lwb_nused;	/* # used bytes in buffer */
+	int		lwb_sz;		/* size of block and buffer */
+	char		*lwb_buf;	/* log write buffer */
+	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
+	uint64_t	lwb_seq;	/* highest log record seq number */
+	txg_handle_t	lwb_txgh;	/* txg handle for txg_exit() */
+	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
+	lwb_state_t	lwb_state;	/* buffer state */
+} lwb_t;
+
+/*
+ * [vdev, seq] element for use in flushing device write caches
+ */
+typedef struct zil_vdev {
+	uint64_t	vdev;		/* device written */
+	uint64_t	seq;		/* itx sequence */
+	list_node_t	vdev_seq_node;	/* zilog->zl_vdev_list linkage */
+} zil_vdev_t;
+
+/*
+ * Stable storage intent log management structure.  One per dataset.
+ */
+struct zilog {
+	kmutex_t	zl_lock;	/* protects most zilog_t fields */
+	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
+	spa_t		*zl_spa;	/* handle for read/write log */
+	zil_header_t	*zl_header;	/* log header buffer */
+	objset_t	*zl_os;		/* object set we're logging */
+	zil_get_data_t	*zl_get_data;	/* callback to get object content */
+	uint64_t	zl_itx_seq;	/* itx sequence number */
+	uint64_t	zl_ss_seq;	/* last tx on stable storage */
+	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
+	uint64_t	zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
+	uint32_t	zl_suspend;	/* log suspend count */
+	kcondvar_t	zl_cv_write;	/* for waiting to write to log */
+	kcondvar_t	zl_cv_seq;	/* for committing a sequence */
+	uint8_t		zl_stop_replay;	/* don't replay any further */
+	uint8_t		zl_stop_sync;	/* for debugging */
+	uint8_t		zl_writer;	/* boolean: write setup in progress */
+	uint8_t		zl_log_error;	/* boolean: log write error */
+	list_t		zl_itx_list;	/* in-memory itx list */
+	uint64_t	zl_itx_list_sz;	/* total size of records on list */
+	uint64_t	zl_prev_blk_sz;	/* previous log block size */
+	list_t		zl_lwb_list;	/* in-flight log write list */
+	list_t		zl_vdev_list;	/* list of [vdev, seq] pairs */
+	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
+	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */
+	kmutex_t	zl_destroy_lock; /* serializes zil_destroy() calls */
+};
+
+typedef struct zil_dva_node {
+	dva_t		zn_dva;
+	avl_node_t	zn_node;
+} zil_dva_node_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZIL_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
new file mode 100644
index 000000000000..5d3227e546f3
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -0,0 +1,298 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define	_ZIO_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/dkio.h>
+#include <sys/fs/zfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data bloc tail */
+
+typedef struct zio_block_tail {
+	uint64_t	zbt_magic;	/* for validation, endianness	*/
+	zio_cksum_t	zbt_cksum;	/* 256-bit checksum		*/
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
+#define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
+	sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
+	sizeof (zio_block_tail_t) - \
+	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+	sizeof (uint64_t))
+
+#define	ZIO_GET_DVA(zio)	(&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
+#define	ZIO_GET_IOSIZE(zio)	\
+	(DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+	SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
+	uint64_t		zg_filler[SPA_GBH_FILLER];
+	zio_block_tail_t	zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+	ZIO_CHECKSUM_INHERIT = 0,
+	ZIO_CHECKSUM_ON,
+	ZIO_CHECKSUM_OFF,
+	ZIO_CHECKSUM_LABEL,
+	ZIO_CHECKSUM_GANG_HEADER,
+	ZIO_CHECKSUM_ZILOG,
+	ZIO_CHECKSUM_FLETCHER_2,
+	ZIO_CHECKSUM_FLETCHER_4,
+	ZIO_CHECKSUM_SHA256,
+	ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_2
+#define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
+
+enum zio_compress {
+	ZIO_COMPRESS_INHERIT = 0,
+	ZIO_COMPRESS_ON,
+	ZIO_COMPRESS_OFF,
+	ZIO_COMPRESS_LZJB,
+	ZIO_COMPRESS_FUNCTIONS
+};
+
+#define	ZIO_COMPRESS_ON_VALUE	ZIO_COMPRESS_LZJB
+#define	ZIO_COMPRESS_DEFAULT	ZIO_COMPRESS_OFF
+
+#define	ZIO_PRIORITY_NOW		(zio_priority_table[0])
+#define	ZIO_PRIORITY_SYNC_READ		(zio_priority_table[1])
+#define	ZIO_PRIORITY_SYNC_WRITE		(zio_priority_table[2])
+#define	ZIO_PRIORITY_ASYNC_READ		(zio_priority_table[3])
+#define	ZIO_PRIORITY_ASYNC_WRITE	(zio_priority_table[4])
+#define	ZIO_PRIORITY_FREE		(zio_priority_table[5])
+#define	ZIO_PRIORITY_CACHE_FILL		(zio_priority_table[6])
+#define	ZIO_PRIORITY_LOG_WRITE		(zio_priority_table[7])
+#define	ZIO_PRIORITY_RESILVER		(zio_priority_table[8])
+#define	ZIO_PRIORITY_SCRUB		(zio_priority_table[9])
+#define	ZIO_PRIORITY_TABLE_SIZE		10
+
+#define	ZIO_FLAG_MUSTSUCCEED		0x0000
+#define	ZIO_FLAG_CANFAIL		0x0001
+#define	ZIO_FLAG_FAILFAST		0x0002
+#define	ZIO_FLAG_CONFIG_HELD		0x0004
+
+#define	ZIO_FLAG_DONT_CACHE		0x0010
+#define	ZIO_FLAG_DONT_QUEUE		0x0020
+#define	ZIO_FLAG_DONT_PROPAGATE		0x0040
+#define	ZIO_FLAG_DONT_RETRY		0x0080
+
+#define	ZIO_FLAG_PHYSICAL		0x0100
+#define	ZIO_FLAG_IO_BYPASS		0x0200
+#define	ZIO_FLAG_IO_REPAIR		0x0400
+#define	ZIO_FLAG_SPECULATIVE		0x0800
+
+#define	ZIO_FLAG_RESILVER		0x1000
+#define	ZIO_FLAG_SCRUB			0x2000
+
+#define	ZIO_FLAG_GANG_INHERIT		\
+	(ZIO_FLAG_CANFAIL |		\
+	ZIO_FLAG_FAILFAST |		\
+	ZIO_FLAG_CONFIG_HELD |		\
+	ZIO_FLAG_DONT_RETRY |		\
+	ZIO_FLAG_IO_REPAIR |		\
+	ZIO_FLAG_SPECULATIVE |		\
+	ZIO_FLAG_RESILVER |		\
+	ZIO_FLAG_SCRUB)
+
+#define	ZIO_FLAG_VDEV_INHERIT		\
+	(ZIO_FLAG_GANG_INHERIT |	\
+	ZIO_FLAG_DONT_CACHE |		\
+	ZIO_FLAG_PHYSICAL)
+
+/*
+ * We'll take the unused errno 'EBADE' (from the Convergent graveyard)
+ * to indicate checksum errors.
+ */
+#define	ECKSUM	EBADE
+
+typedef struct zio zio_t;
+typedef void zio_done_func_t(zio_t *zio);
+typedef struct zio_transform zio_transform_t;
+
+extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
+extern char *zio_type_name[ZIO_TYPES];
+
+/*
+ * State of a single I/O.  Pointers to these are what the zio_*()
+ * constructors declared below return and what zio_wait()/zio_nowait()
+ * consume.  The fields are grouped by the part of the I/O machinery that
+ * uses them.
+ *
+ * io_pipeline and io_async_stages are 32-bit masks; zio_impl.h builds its
+ * ZIO_*_PIPELINE masks as (1U << stage), presumably for these two fields
+ * -- confirm in zio.c.
+ */
+struct zio {
+	/* Core information about this I/O */
+	zio_t		*io_parent;
+	zio_t		*io_root;
+	spa_t		*io_spa;
+	int		io_checksum;
+	int		io_compress;
+	int		io_dva_index;
+	uint64_t	io_txg;
+	blkptr_t	*io_bp;
+	blkptr_t	io_bp_copy;
+	zio_t		*io_child;
+	zio_t		*io_sibling_prev;
+	zio_t		*io_sibling_next;
+	zio_transform_t *io_transform_stack;
+
+	/* Callback info */
+	zio_done_func_t	*io_done;
+	void		*io_private;
+	blkptr_t	io_bp_orig;
+
+	/* Data represented by this I/O */
+	void		*io_data;
+	uint64_t	io_size;
+
+	/* Stuff for the vdev stack */
+	vdev_t		*io_vd;
+	void		*io_vsd;
+	uint64_t	io_offset;
+	uint64_t	io_deadline;
+	uint64_t	io_timestamp;
+	avl_node_t	io_offset_node;
+	avl_node_t	io_deadline_node;
+	avl_tree_t	*io_vdev_tree;
+	zio_t		*io_delegate_list;
+	zio_t		*io_delegate_next;
+	zio_t		*io_retry_next;
+	list_node_t	io_pending;
+
+	/* Internal pipeline state */
+	int		io_flags;
+	uint8_t		io_type;
+	uint8_t		io_stage;
+	uint8_t		io_stalled;
+	uint8_t		io_priority;
+	struct dk_callback io_dk_callback;
+	int		io_cmd;
+	int		io_retries;
+	int		io_error;
+	uint32_t	io_numerrors;
+	uint32_t	io_pipeline;
+	uint32_t	io_async_stages;
+	uint64_t	io_children_notready;
+	uint64_t	io_children_notdone;
+	void		*io_waiter;
+	kmutex_t	io_lock;
+	kcondvar_t	io_cv;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+    zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_root(spa_t *spa,
+    zio_done_func_t *done, void *private, int flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+    uint64_t size, zio_done_func_t *done, void *private,
+    int priority, int flags);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+    zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *data, int checksum,
+    zio_done_func_t *done, void *private, int priority, int flags);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *data, int checksum,
+    zio_done_func_t *done, void *private, int priority, int flags);
+
+extern int zio_alloc_blk(spa_t *spa, int checksum, uint64_t size,
+    blkptr_t *bp, uint64_t txg);
+extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+extern void zio_next_stage(zio_t *zio);
+extern void zio_next_stage_async(zio_t *zio);
+extern void zio_wait_children_done(zio_t *zio);
+
+/*
+ * Delegate I/O to a child vdev.
+ */
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+    uint64_t offset, void *data, uint64_t size, int type, int priority,
+    int flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
+
+extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
+extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZIO_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
new file mode 100644
index 000000000000..ba3dc48d2804
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define	_SYS_ZIO_CHECKSUM_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function: the element type of
+ * zio_checksum_table[].  ci_func[] holds one routine per byte order;
+ * which index is native and which is byteswapped is not visible in this
+ * header (compare the fletcher_*_native / fletcher_*_byteswap routines
+ * declared below) -- confirm in zio_checksum.c.
+ */
+typedef struct zio_checksum_info {
+	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
+	int		ci_correctable;	/* number of correctable bits	*/
+	int		ci_zbt;		/* uses zio block tail?	*/
+	char		*ci_name;	/* descriptive name */
+} zio_checksum_info_t;
+
+extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
+
+/*
+ * Checksum routines.
+ */
+extern zio_checksum_t fletcher_2_native;
+extern zio_checksum_t fletcher_4_native;
+
+extern zio_checksum_t fletcher_2_byteswap;
+extern zio_checksum_t fletcher_4_byteswap;
+
+extern zio_checksum_t zio_checksum_SHA256;
+
+extern void zio_checksum(uint_t checksum, zio_cksum_t *zcp,
+    void *data, uint64_t size);
+extern int zio_checksum_error(zio_t *zio);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZIO_CHECKSUM_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
new file mode 100644
index 000000000000..7eddf1e8d18b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_COMPRESS_H
+#define	_SYS_ZIO_COMPRESS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Common signature for all zio compress/decompress functions.
+ */
+typedef size_t zio_compress_func_t(void *src, void *dst,
+    size_t s_len, size_t d_len);
+typedef int zio_decompress_func_t(void *src, void *dst,
+    size_t s_len, size_t d_len);
+
+/*
+ * Information about each compression function: the element type of
+ * zio_compress_table[].  Both routines share the
+ * (src, dst, s_len, d_len) argument list; the compressor returns a
+ * size_t and the decompressor an int.
+ */
+typedef struct zio_compress_info {
+	zio_compress_func_t	*ci_compress;	/* compression routine */
+	zio_decompress_func_t	*ci_decompress;	/* decompression routine */
+	char			*ci_name;	/* name of the algorithm */
+} zio_compress_info_t;
+
+extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
+
+/*
+ * Compression routines.
+ */
+extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len);
+extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len);
+
+/*
+ * Compress and decompress data if necessary.
+ */
+extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
+    void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
+extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZIO_COMPRESS_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
new file mode 100644
index 000000000000..0b2b07de29ba
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define	_ZIO_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * I/O Groups: pipeline stage definitions.
+ *
+ * Stages are numbered consecutively from zero (ZIO_STAGE_OPEN) to 23
+ * (ZIO_STAGE_DONE).  The ZIO_*_PIPELINE masks further down use each value
+ * as a bit position, (1U << stage), so the whole set has to keep fitting
+ * in 32 bits.
+ *
+ * The five-column tag beside each stage appears to mark the I/O types
+ * whose pipeline includes it: R(ead), W(rite), F(ree), C(laim), I(octl).
+ */
+
+typedef enum zio_stage {
+	ZIO_STAGE_OPEN = 0,			/* RWFCI */
+	ZIO_STAGE_WAIT_CHILDREN_READY,		/* RWFCI */
+
+	ZIO_STAGE_WRITE_COMPRESS,		/* -W--- */
+	ZIO_STAGE_CHECKSUM_GENERATE,		/* -W--- */
+
+	ZIO_STAGE_GANG_PIPELINE,		/* -WFC- */
+
+	ZIO_STAGE_GET_GANG_HEADER,		/* -WFC- */
+	ZIO_STAGE_REWRITE_GANG_MEMBERS,		/* -W--- */
+	ZIO_STAGE_FREE_GANG_MEMBERS,		/* --F-- */
+	ZIO_STAGE_CLAIM_GANG_MEMBERS,		/* ---C- */
+
+	ZIO_STAGE_DVA_ALLOCATE,			/* -W--- */
+	ZIO_STAGE_DVA_FREE,			/* --F-- */
+	ZIO_STAGE_DVA_CLAIM,			/* ---C- */
+
+	ZIO_STAGE_GANG_CHECKSUM_GENERATE,	/* -W--- */
+
+	ZIO_STAGE_READY,			/* RWFCI */
+
+	ZIO_STAGE_DVA_TRANSLATE,		/* RW--- */
+
+	ZIO_STAGE_VDEV_IO_SETUP,		/* RW--I */
+	ZIO_STAGE_VDEV_IO_START,		/* RW--I */
+	ZIO_STAGE_VDEV_IO_DONE,			/* RW--I */
+	ZIO_STAGE_VDEV_IO_ASSESS,		/* RW--I */
+
+	ZIO_STAGE_WAIT_CHILDREN_DONE,		/* RWFCI */
+
+	ZIO_STAGE_CHECKSUM_VERIFY,		/* R---- */
+	ZIO_STAGE_READ_GANG_MEMBERS,		/* R---- */
+	ZIO_STAGE_READ_DECOMPRESS,		/* R---- */
+
+	ZIO_STAGE_DONE				/* RWFCI */
+} zio_stage_t;
+
+/*
+ * The stages for which there's some performance value in going async.
+ * When compression is enabled, ZIO_STAGE_WRITE_COMPRESS is ORed in as well.
+ */
+#define	ZIO_ASYNC_PIPELINE_STAGES				\
+	((1U << ZIO_STAGE_CHECKSUM_GENERATE) |			\
+	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
+	(1U << ZIO_STAGE_CHECKSUM_VERIFY) |			\
+	(1U << ZIO_STAGE_READ_DECOMPRESS))
+
+#define	ZIO_VDEV_IO_PIPELINE					\
+	((1U << ZIO_STAGE_VDEV_IO_SETUP) |			\
+	(1U << ZIO_STAGE_VDEV_IO_START) |			\
+	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
+	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
+
+#define	ZIO_READ_PHYS_PIPELINE					\
+	((1U << ZIO_STAGE_OPEN) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
+	(1U << ZIO_STAGE_READY) |				\
+	ZIO_VDEV_IO_PIPELINE |					\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_CHECKSUM_VERIFY) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_READ_PIPELINE					\
+	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
+	ZIO_READ_PHYS_PIPELINE)
+
+#define	ZIO_WRITE_PHYS_PIPELINE					\
+	((1U << ZIO_STAGE_OPEN) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
+	(1U << ZIO_STAGE_CHECKSUM_GENERATE) |			\
+	(1U << ZIO_STAGE_READY) |				\
+	ZIO_VDEV_IO_PIPELINE |					\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_WRITE_COMMON_PIPELINE				\
+	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
+	ZIO_WRITE_PHYS_PIPELINE)
+
+#define	ZIO_WRITE_PIPELINE					\
+	((1U << ZIO_STAGE_WRITE_COMPRESS) |			\
+	ZIO_WRITE_COMMON_PIPELINE)
+
+#define	ZIO_GANG_STAGES						\
+	((1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) |		\
+	(1U << ZIO_STAGE_FREE_GANG_MEMBERS) |			\
+	(1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) |			\
+	(1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) |		\
+	(1U << ZIO_STAGE_READ_GANG_MEMBERS))
+
+#define	ZIO_REWRITE_PIPELINE					\
+	((1U << ZIO_STAGE_GANG_PIPELINE) |			\
+	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_REWRITE_GANG_MEMBERS) |		\
+	(1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE) |		\
+	ZIO_WRITE_COMMON_PIPELINE)
+
+#define	ZIO_WRITE_ALLOCATE_PIPELINE				\
+	((1U << ZIO_STAGE_DVA_ALLOCATE) |			\
+	ZIO_WRITE_COMMON_PIPELINE)
+
+#define	ZIO_GANG_FREE_STAGES					\
+	((1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_FREE_GANG_MEMBERS))
+
+#define	ZIO_FREE_PIPELINE					\
+	((1U << ZIO_STAGE_OPEN) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
+	(1U << ZIO_STAGE_GANG_PIPELINE) |			\
+	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_FREE_GANG_MEMBERS) |			\
+	(1U << ZIO_STAGE_DVA_FREE) |				\
+	(1U << ZIO_STAGE_READY) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_CLAIM_PIPELINE					\
+	((1U << ZIO_STAGE_OPEN) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
+	(1U << ZIO_STAGE_GANG_PIPELINE) |			\
+	(1U << ZIO_STAGE_GET_GANG_HEADER) |			\
+	(1U << ZIO_STAGE_CLAIM_GANG_MEMBERS) |			\
+	(1U << ZIO_STAGE_DVA_CLAIM) |				\
+	(1U << ZIO_STAGE_READY) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_IOCTL_PIPELINE					\
+	((1U << ZIO_STAGE_OPEN) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_READY) |			\
+	(1U << ZIO_STAGE_READY) |				\
+	ZIO_VDEV_IO_PIPELINE |					\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_WAIT_FOR_CHILDREN_PIPELINE				\
+	((1U << ZIO_STAGE_WAIT_CHILDREN_READY) |		\
+	(1U << ZIO_STAGE_READY) |				\
+	(1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE			\
+	((1U << ZIO_STAGE_WAIT_CHILDREN_DONE) |			\
+	(1U << ZIO_STAGE_DONE))
+
+#define	ZIO_VDEV_CHILD_PIPELINE					\
+	(ZIO_WAIT_FOR_CHILDREN_DONE_PIPELINE |			\
+	ZIO_VDEV_IO_PIPELINE)
+
+#define	ZIO_ERROR_PIPELINE_MASK					\
+	ZIO_WAIT_FOR_CHILDREN_PIPELINE
+
+/*
+ * One node of a singly linked list chained through zt_next; zio_t's
+ * io_transform_stack field (sys/zio.h) points at a node of this type.
+ * zt_data, zt_size and zt_bufsize presumably describe a buffer, the
+ * number of valid bytes in it and its allocated size -- confirm in zio.c.
+ */
+struct zio_transform {
+	void		*zt_data;
+	uint64_t	zt_size;
+	uint64_t	zt_bufsize;
+	zio_transform_t	*zt_next;
+};
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZIO_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
new file mode 100644
index 000000000000..81ab16cd3d17
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -0,0 +1,583 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/callb.h>
+
+/*
+ * Pool-wide transaction groups.
+ */
+
+static void txg_sync_thread(dsl_pool_t *dp);
+static void txg_quiesce_thread(dsl_pool_t *dp);
+static void txg_timelimit_thread(dsl_pool_t *dp);
+
+int txg_time = 5;	/* max 5 seconds worth of delta per txg */
+
+/*
+ * Set up the txg state embedded in 'dp'.  The state is cleared first;
+ * then the per-CPU array (max_ncpus entries, zero-filled) and the suspend
+ * lock are created and 'txg' is recorded as the open transaction group.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *state = &dp->dp_tx;
+
+	bzero(state, sizeof (*state));
+
+	state->tx_open_txg = txg;
+	state->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+	rw_init(&state->tx_suspend, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Undo txg_init().  No txg thread may still be running (tx_threads is
+ * asserted to be zero); the per-CPU array and the suspend lock are
+ * released and the state is wiped.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	ASSERT(tx->tx_threads == 0);
+
+	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+	rw_destroy(&tx->tx_suspend);
+
+	/* Leave no stale pointers or counters behind. */
+	bzero(tx, sizeof (*tx));
+}
+
+/*
+ * Start syncing transaction groups.
+ *
+ * Creates the quiesce, sync and timelimit threads for 'dp'.  tx_threads
+ * is set to 3 and the threads are created while tx_sync_lock is held.
+ * Each thread's first action is txg_thread_enter(), which takes that same
+ * lock, so none of them gets past its prologue until this function has
+ * released the lock.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+
+	dprintf("pool %p\n", dp);
+
+	/* Starting twice without an intervening txg_sync_stop() is a bug. */
+	ASSERT(tx->tx_threads == 0);
+
+	tx->tx_threads = 3;
+
+	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	tx->tx_timelimit_thread = thread_create(NULL, 0, txg_timelimit_thread,
+	    dp, 0, &p0, TS_RUN, minclsyspri);
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Common prologue for the txg threads: set up the thread's CPR callback
+ * state, tied to tx_sync_lock, and then acquire tx_sync_lock.  Returns
+ * with the lock held.
+ */
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+	CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+	mutex_enter(&tx->tx_sync_lock);
+}
+
+/*
+ * Common epilogue for the txg threads.  Called with tx_sync_lock held.
+ *
+ * Clears the caller's thread pointer (*tpp), drops the live-thread count
+ * and wakes anyone sleeping on tx_exit_cv; txg_sync_stop() waits there
+ * for tx_threads to reach zero.  Callers rely on this function not
+ * returning: txg_sync_thread() and txg_quiesce_thread() have more work
+ * immediately after the call with no 'return' in between.
+ */
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+	ASSERT(*tpp != NULL);
+	*tpp = NULL;
+	tx->tx_threads--;
+	cv_broadcast(&tx->tx_exit_cv);
+	CALLB_CPR_EXIT(cpr);		/* drops &tx->tx_sync_lock */
+	thread_exit();
+}
+
+/*
+ * Sleep on 'cv' under tx_sync_lock, bracketed by the CPR "safe" markers.
+ * A 'secmax' of zero means wait until signalled; otherwise the wait is
+ * bounded by 'secmax' seconds.  The result of the timed wait is ignored,
+ * so callers must re-check their own condition after this returns.
+ */
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, int secmax)
+{
+	CALLB_CPR_SAFE_BEGIN(cpr);
+
+	if (secmax == 0) {
+		cv_wait(cv, &tx->tx_sync_lock);
+	} else {
+		(void) cv_timedwait(cv, &tx->tx_sync_lock,
+		    lbolt + secmax * hz);
+	}
+
+	CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ *
+ * Waits for the currently open txg to be synced, then asks all three txg
+ * threads to exit and blocks until the last of them has gone.  This
+ * function takes tx_sync_lock itself, both directly and through
+ * txg_wait_synced().
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	dprintf("pool %p\n", dp);
+	/*
+	 * Finish off any work in progress.
+	 */
+	ASSERT(tx->tx_threads == 3);
+	txg_wait_synced(dp, 0);
+
+	/*
+	 * Wake all 3 sync threads (one per state) and wait for them to die.
+	 */
+	mutex_enter(&tx->tx_sync_lock);
+
+	ASSERT(tx->tx_threads == 3);
+
+	tx->tx_exiting = 1;
+
+	/*
+	 * Poke every condition variable a txg thread can be sleeping on
+	 * so that each of them re-checks tx_exiting.
+	 */
+	cv_broadcast(&tx->tx_quiesce_more_cv);
+	cv_broadcast(&tx->tx_quiesce_done_cv);
+	cv_broadcast(&tx->tx_sync_more_cv);
+	cv_broadcast(&tx->tx_timeout_exit_cv);
+
+	/* txg_thread_exit() broadcasts tx_exit_cv as each thread leaves. */
+	while (tx->tx_threads != 0)
+		cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+	/* No thread is left to observe the flag; clear it. */
+	tx->tx_exiting = 0;
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Take a hold on the currently open txg and return its number.
+ *
+ * The hold is counted in the tx_cpu_t selected by CPU_SEQID, and 'th' is
+ * filled in so the hold can be released later.  NOTE: this returns with
+ * that tx_cpu_t's tc_lock still held; the caller drops it with
+ * txg_rele_to_quiesce() and releases the hold itself with
+ * txg_rele_to_sync().
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+	tx_cpu_t *cpu = &dp->dp_tx.tx_cpu[CPU_SEQID];
+	uint64_t open_txg;
+
+	mutex_enter(&cpu->tc_lock);
+
+	/* Read tx_open_txg only once the per-CPU lock is held. */
+	open_txg = dp->dp_tx.tx_open_txg;
+	cpu->tc_count[open_txg & TXG_MASK]++;
+
+	th->th_txg = open_txg;
+	th->th_cpu = cpu;
+
+	return (open_txg);
+}
+
+/*
+ * Drop the per-CPU lock that txg_hold_open() left held.  The hold count
+ * taken there stays in place until txg_rele_to_sync().
+ */
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+	mutex_exit(&th->th_cpu->tc_lock);
+}
+
+/*
+ * Release the hold recorded in 'th'.  When the last hold on this CPU's
+ * slot for the txg goes away, waiters on the slot's condition variable
+ * are woken (txg_quiesce() sleeps there).
+ */
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+	tx_cpu_t *tc = th->th_cpu;
+	int g = th->th_txg & TXG_MASK;
+
+	mutex_enter(&tc->tc_lock);
+
+	ASSERT(tc->tc_count[g] != 0);
+	tc->tc_count[g]--;
+	if (tc->tc_count[g] == 0)
+		cv_broadcast(&tc->tc_cv[g]);
+
+	mutex_exit(&tc->tc_lock);
+
+	/* Defensive: make reuse of a released handle fail loudly. */
+	th->th_cpu = NULL;
+}
+
+/*
+ * Close 'txg' to new holders and wait for the existing ones to leave.
+ *
+ * txg_hold_open() reads tx_open_txg while holding one tx_cpu lock, so
+ * holding every tx_cpu lock while tx_open_txg is advanced means no new
+ * hold can land on the old txg afterwards.  From then on each per-CPU
+ * hold count for this txg can only fall, and we wait for all of them to
+ * reach zero.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	int g = txg & TXG_MASK;
+	int c;
+
+	/*
+	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_enter(&tx->tx_cpu[c].tc_lock);
+
+	ASSERT(txg == tx->tx_open_txg);
+	tx->tx_open_txg++;
+
+	/*
+	 * Now that we've incremented tx_open_txg, we can let threads
+	 * enter the next transaction group.
+	 */
+	for (c = 0; c < max_ncpus; c++)
+		mutex_exit(&tx->tx_cpu[c].tc_lock);
+
+	/*
+	 * Quiesce the transaction group by waiting for every holder to
+	 * call txg_rele_to_sync(), which broadcasts tc_cv[g] when a CPU's
+	 * count for this txg drops to zero.
+	 */
+	for (c = 0; c < max_ncpus; c++) {
+		tx_cpu_t *tc = &tx->tx_cpu[c];
+		mutex_enter(&tc->tc_lock);
+		while (tc->tc_count[g] != 0)
+			cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+		mutex_exit(&tc->tc_lock);
+	}
+}
+
+/*
+ * Body of the sync thread.  It runs with tx_sync_lock held except while
+ * spa_sync() is in progress.  Each pass waits for work, takes the txg
+ * handed off by the quiesce thread, syncs it with spa_sync(), records it
+ * in tx_synced_txg and wakes txg_wait_synced() callers.  The thread
+ * leaves through txg_thread_exit() once tx_exiting is set.
+ */
+static void
+txg_sync_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We sync when there's someone waiting on us, or the
+		 * quiesce thread has handed off a txg to us.
+		 */
+		while (!tx->tx_exiting &&
+		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+		    tx->tx_quiesced_txg == 0) {
+			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, 0);
+		}
+
+		/*
+		 * Wait until the quiesce thread hands off a txg to us,
+		 * prompting it to do so if necessary.
+		 */
+		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+
+		/* Code below relies on txg_thread_exit() not returning. */
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+		/*
+		 * tx_suspend is taken as writer here; txg_suspend() holds
+		 * it as reader, so this waits out any suspension.
+		 */
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+
+		/*
+		 * Consume the quiesced txg which has been handed off to
+		 * us.  This may cause the quiescing thread to now be
+		 * able to quiesce another txg, so we must signal it.
+		 */
+		txg = tx->tx_quiesced_txg;
+		tx->tx_quiesced_txg = 0;
+		tx->tx_syncing_txg = txg;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		rw_exit(&tx->tx_suspend);
+
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+			txg, tx->tx_quiesce_txg_waiting,
+			tx->tx_sync_txg_waiting);
+		/* Drop tx_sync_lock for the duration of the sync itself. */
+		mutex_exit(&tx->tx_sync_lock);
+		spa_sync(dp->dp_spa, txg);
+		mutex_enter(&tx->tx_sync_lock);
+		rw_enter(&tx->tx_suspend, RW_WRITER);
+		tx->tx_synced_txg = txg;
+		tx->tx_syncing_txg = 0;
+		rw_exit(&tx->tx_suspend);
+		/* txg_wait_synced() sleeps on tx_sync_done_cv. */
+		cv_broadcast(&tx->tx_sync_done_cv);
+	}
+}
+
+/*
+ * Body of the quiesce thread.  It runs with tx_sync_lock held except
+ * while txg_quiesce() is in progress.  Each pass waits until someone has
+ * asked for a txg beyond the open one and the previously quiesced txg has
+ * been consumed, quiesces the open txg, and hands it to the sync thread
+ * through tx_quiesced_txg.  The thread leaves through txg_thread_exit()
+ * once tx_exiting is set.
+ */
+static void
+txg_quiesce_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	for (;;) {
+		uint64_t txg;
+
+		/*
+		 * We quiesce when there's someone waiting on us.
+		 * However, we can only have one txg in "quiescing" or
+		 * "quiesced, waiting to sync" state.  So we wait until
+		 * the "quiesced, waiting to sync" txg has been consumed
+		 * by the sync thread.
+		 */
+		while (!tx->tx_exiting &&
+		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+		    tx->tx_quiesced_txg != 0))
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+		/* Code below relies on txg_thread_exit() not returning. */
+		if (tx->tx_exiting)
+			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+		txg = tx->tx_open_txg;
+		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+		    txg, tx->tx_quiesce_txg_waiting,
+		    tx->tx_sync_txg_waiting);
+		/* txg_quiesce() can block; do not hold tx_sync_lock over it. */
+		mutex_exit(&tx->tx_sync_lock);
+		txg_quiesce(dp, txg);
+		mutex_enter(&tx->tx_sync_lock);
+
+		/*
+		 * Hand this txg off to the sync thread.
+		 */
+		dprintf("quiesce done, handing off txg %llu\n", txg);
+		tx->tx_quiesced_txg = txg;
+		/*
+		 * The sync thread may be sleeping on either cv; waiters
+		 * in txg_wait_open() and the timelimit thread sleep on
+		 * tx_quiesce_done_cv.
+		 */
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_broadcast(&tx->tx_quiesce_done_cv);
+	}
+}
+
+/*
+ * Block until transaction group 'txg' has been synced.  A 'txg' of zero
+ * means the group that is open at the time of the call.  The txg threads
+ * must be running.
+ */
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+
+	if (txg == 0)
+		txg = tx->tx_open_txg;
+
+	/* Raise, never lower, the sync thread's target. */
+	if (txg > tx->tx_sync_txg_waiting)
+		tx->tx_sync_txg_waiting = txg;
+
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+
+	while (txg > tx->tx_synced_txg) {
+		dprintf("broadcasting sync more "
+		    "tx_synced=%llu waiting=%llu dp=%p\n",
+		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+		cv_broadcast(&tx->tx_sync_more_cv);
+		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+	}
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Block until transaction group 'txg' is open, i.e. until tx_open_txg
+ * has reached it.  A 'txg' of zero means the group after the one that is
+ * open at the time of the call.  The txg threads must be running.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	mutex_enter(&tx->tx_sync_lock);
+	ASSERT(tx->tx_threads == 3);
+
+	if (txg == 0)
+		txg = tx->tx_open_txg + 1;
+
+	/* Raise, never lower, the quiesce thread's target. */
+	if (txg > tx->tx_quiesce_txg_waiting)
+		tx->tx_quiesce_txg_waiting = txg;
+
+	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+
+	while (txg > tx->tx_open_txg) {
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+	}
+
+	mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Body of the timelimit thread.  Each pass notes the txg that follows
+ * the open one, sleeps for up to txg_time seconds (or until
+ * tx_timeout_exit_cv is broadcast at shutdown), then asks for that txg
+ * to become the open one and waits until tx_open_txg has reached it.  If
+ * the open txg already advanced during the sleep there is nothing left
+ * to push and the inner loop is skipped.
+ */
+static void
+txg_timelimit_thread(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+	callb_cpr_t cpr;
+
+	txg_thread_enter(tx, &cpr);
+
+	while (!tx->tx_exiting) {
+		/* Target is fixed before the sleep, not after it. */
+		uint64_t txg = tx->tx_open_txg + 1;
+
+		txg_thread_wait(tx, &cpr, &tx->tx_timeout_exit_cv, txg_time);
+
+		if (tx->tx_quiesce_txg_waiting < txg)
+			tx->tx_quiesce_txg_waiting = txg;
+
+		while (!tx->tx_exiting && tx->tx_open_txg < txg) {
+			dprintf("pushing out %llu\n", txg);
+			cv_broadcast(&tx->tx_quiesce_more_cv);
+			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+		}
+	}
+	txg_thread_exit(tx, &cpr, &tx->tx_timelimit_thread);
+}
+
+/*
+ * Return nonzero when a txg beyond the open one has been requested,
+ * i.e. tx_quiesce_txg_waiting is ahead of tx_open_txg.  Both fields are
+ * read without taking tx_sync_lock.
+ */
+int
+txg_stalled(dsl_pool_t *dp)
+{
+	return (dp->dp_tx.tx_quiesce_txg_waiting > dp->dp_tx.tx_open_txg);
+}
+
+/*
+ * Take tx_suspend as reader.  The sync thread takes the same lock as
+ * writer around its tx_syncing_txg / tx_synced_txg updates, so those
+ * updates wait while a suspension is outstanding.  Pair with
+ * txg_resume().
+ */
+void
+txg_suspend(dsl_pool_t *dp)
+{
+	/* XXX some code paths suspend when they are already suspended! */
+	rw_enter(&dp->dp_tx.tx_suspend, RW_READER);
+}
+
+/*
+ * Release the tx_suspend hold taken by txg_suspend().
+ */
+void
+txg_resume(dsl_pool_t *dp)
+{
+	rw_exit(&dp->dp_tx.tx_suspend);
+}
+
+/*
+ * Per-txg object lists.
+ */
+/*
+ * Initialize an empty txg list.  'offset' is the byte offset of the
+ * embedded txg_node_t within the objects that will be placed on the
+ * list; the other list routines use it to convert between object and
+ * node pointers.
+ */
+void
+txg_list_create(txg_list_t *tl, size_t offset)
+{
+	int slot;
+
+	tl->tl_offset = offset;
+
+	for (slot = 0; slot < TXG_SIZE; slot++)
+		tl->tl_head[slot] = NULL;
+
+	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Destroy a txg list.  Every per-txg slot is expected to be empty
+ * already; that expectation is checked only through ASSERT().
+ */
+void
+txg_list_destroy(txg_list_t *tl)
+{
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++)
+		ASSERT(txg_list_empty(tl, t));
+
+	mutex_destroy(&tl->tl_lock);
+}
+
+/*
+ * Return nonzero if the list has no entries for 'txg'.  The head pointer
+ * is read without taking tl_lock.
+ */
+int
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+	txg_node_t *head = tl->tl_head[txg & TXG_MASK];
+
+	return (head == NULL);
+}
+
+/*
+ * Put object 'p' on the list for 'txg' unless it is on it already.
+ * New entries go at the head.  Returns the membership flag as it was
+ * before the call: 0 if 'p' was newly added, nonzero if it was already
+ * present.
+ */
+int
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+	txg_node_t *node = (txg_node_t *)((char *)p + tl->tl_offset);
+	int slot = txg & TXG_MASK;
+	int was_member;
+
+	mutex_enter(&tl->tl_lock);
+
+	was_member = node->tn_member[slot];
+	if (was_member == 0) {
+		node->tn_next[slot] = tl->tl_head[slot];
+		tl->tl_head[slot] = node;
+		node->tn_member[slot] = 1;
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (was_member);
+}
+
+/*
+ * Detach the first entry of the list for 'txg' and return the object it
+ * belongs to, or NULL if that list is empty.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+	int slot = txg & TXG_MASK;
+	txg_node_t *node;
+	void *obj = NULL;
+
+	mutex_enter(&tl->tl_lock);
+
+	node = tl->tl_head[slot];
+	if (node != NULL) {
+		tl->tl_head[slot] = node->tn_next[slot];
+		node->tn_next[slot] = NULL;
+		node->tn_member[slot] = 0;
+		/* Step back from the embedded node to the object. */
+		obj = (char *)node - tl->tl_offset;
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (obj);
+}
+
+/*
+ * Detach object 'p' from the list for 'txg'.  Returns 'p' if it was
+ * found and removed, NULL if it was not on that list.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int slot = txg & TXG_MASK;
+	txg_node_t **linkp;
+	txg_node_t *node;
+	void *found = NULL;
+
+	mutex_enter(&tl->tl_lock);
+
+	/* 'linkp' always addresses the pointer that leads to 'node'. */
+	linkp = &tl->tl_head[slot];
+	while ((node = *linkp) != NULL) {
+		if ((char *)node - tl->tl_offset == p) {
+			*linkp = node->tn_next[slot];
+			node->tn_next[slot] = NULL;
+			node->tn_member[slot] = 0;
+			found = p;
+			break;
+		}
+		linkp = &node->tn_next[slot];
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (found);
+}
+
+/*
+ * Return the membership flag of object 'p' for 'txg' (nonzero when 'p'
+ * is on that list).  The flag is read without taking tl_lock.
+ */
+int
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+	txg_node_t *node = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	return (node->tn_member[txg & TXG_MASK]);
+}
+
+/*
+ * Walk a txg list -- only safe if you know it's not changing.
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+	txg_node_t *first = tl->tl_head[txg & TXG_MASK];
+
+	if (first == NULL)
+		return (NULL);
+
+	/* Step back from the embedded node to the object. */
+	return ((char *)first - tl->tl_offset);
+}
+
+/*
+ * Return the object that follows 'p' on the list for 'txg', or NULL if
+ * 'p' is the last one.  No lock is taken.
+ */
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+	txg_node_t *node = (txg_node_t *)((char *)p + tl->tl_offset);
+	txg_node_t *succ = node->tn_next[txg & TXG_MASK];
+
+	if (succ == NULL)
+		return (NULL);
+
+	return ((char *)succ - tl->tl_offset);
+}
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
new file mode 100644
index 000000000000..63bff0ae4b35
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+
+/* Keep the uberblock version in a variable so we can get at it with mdb */
+static uint64_t uberblock_version = UBERBLOCK_VERSION;
+
+/*
+ * Validate an uberblock.  If the magic number shows up byte-swapped, the
+ * whole uberblock is byte-swapped IN PLACE before it is checked.
+ *
+ * Returns 0 if the uberblock is acceptable, EINVAL if the magic number
+ * is wrong, ENOTSUP if the version is not UBERBLOCK_VERSION.
+ */
+int
+uberblock_verify(uberblock_t *ub)
+{
+	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+		byteswap_uint64_array(ub, sizeof (*ub));
+
+	if (ub->ub_magic != UBERBLOCK_MAGIC)
+		return (EINVAL);
+
+	return (ub->ub_version == UBERBLOCK_VERSION ? 0 : ENOTSUP);
+}
+
+/*
+ * Stamp the uberblock for transaction group 'txg': magic, version, txg,
+ * the root vdev's guid sum and the current time.  'txg' must be newer
+ * than the one already recorded.
+ *
+ * Returns nonzero if the root block pointer was born in 'txg', which is
+ * how the caller learns whether anything changed in this group.
+ */
+int
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
+{
+	ASSERT(ub->ub_txg < txg);
+
+	ub->ub_txg = txg;
+	ub->ub_timestamp = gethrestime_sec();
+	ub->ub_guid_sum = rvd->vdev_guid_sum;
+	ub->ub_version = UBERBLOCK_VERSION;
+	ub->ub_magic = UBERBLOCK_MAGIC;
+
+	return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/usr/src/uts/common/fs/zfs/unique.c b/usr/src/uts/common/fs/zfs/unique.c
new file mode 100644
index 000000000000..56fbddd78ea8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/unique.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+typedef struct unique {
+	avl_node_t un_link;	/* linkage into unique_avl */
+	uint64_t un_value;	/* the registered value; AVL sort key */
+} unique_t;
+
+#define	UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+	uint64_t lhs = ((const unique_t *)a)->un_value;
+	uint64_t rhs = ((const unique_t *)b)->un_value;
+
+	/* AVL comparator: explicit three-way compare of the 64-bit keys. */
+	if (lhs == rhs)
+		return (0);
+
+	return (lhs < rhs ? -1 : +1);
+}
+
+void
+unique_init(void)
+{	/* NOTE(review): unique_mtx is not mutex_init()ed in this file; confirm */
+	avl_create(&unique_avl, unique_compare,
+	    sizeof (unique_t), offsetof(unique_t, un_link)); /* by un_value */
+}
+
+uint64_t
+unique_create(void)
+{
+	return (unique_insert(0));	/* 0 is rejected, forcing a random value */
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+	unique_t *node = kmem_alloc(sizeof (*node), KM_SLEEP);
+	avl_index_t where;
+
+	node->un_value = value;
+
+	mutex_enter(&unique_mtx);
+	while (node->un_value == 0 || (node->un_value & ~UNIQUE_MASK) != 0 ||
+	    avl_find(&unique_avl, node, &where) != NULL) {
+		/* Unusable or taken: drop the lock while drawing a new one. */
+		mutex_exit(&unique_mtx);
+		(void) random_get_pseudo_bytes((void *)&node->un_value,
+		    sizeof (node->un_value));
+		node->un_value &= UNIQUE_MASK;
+		mutex_enter(&unique_mtx);
+	}
+	avl_insert(&unique_avl, node, where);
+	mutex_exit(&unique_mtx);
+
+	return (node->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+	unique_t key, *found;
+
+	/* Only un_value is set; unique_compare() reads nothing else. */
+	key.un_value = value;
+	mutex_enter(&unique_mtx);
+	found = avl_find(&unique_avl, &key, NULL);
+	if (found != NULL) {
+		avl_remove(&unique_avl, found);
+		kmem_free(found, sizeof (*found));
+	}
+	mutex_exit(&unique_mtx);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
new file mode 100644
index 000000000000..990c690bffd9
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -0,0 +1,1738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+	&vdev_root_ops,
+	&vdev_raidz_ops,
+	&vdev_mirror_ops,
+	&vdev_replacing_ops,
+	&vdev_disk_ops,
+	&vdev_file_ops,
+	&vdev_missing_ops,
+	NULL			/* terminator checked by vdev_getops() */
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t **slot = vdev_ops_table;
+
+	while (*slot != NULL && strcmp((*slot)->vdev_op_type, type) != 0)
+		slot++;
+
+	/* Either the matching entry or the table's NULL terminator. */
+	return (*slot);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children.  This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
+	uint64_t child_asize;
+	uint64_t i;
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		child_asize = vdev_psize_to_asize(vd->vdev_child[i], psize);
+		largest = MAX(largest, child_asize);
+	}
+
+	return (largest);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+	vdev_t *root = spa->spa_root_vdev;
+
+	/* 'vdev' is an index into the root vdev's child array. */
+	if (vdev >= root->vdev_children)
+		return (NULL);
+	return (root->vdev_child[vdev]);
+}
+
+vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	vdev_t *match = NULL;
+	int i;
+
+	/* Test this vdev's own path first (it may have none). */
+	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+		return (vd);
+
+	/* Otherwise search each subtree, stopping at the first match. */
+	for (i = 0; match == NULL && i < vd->vdev_children; i++)
+		match = vdev_lookup_by_path(vd->vdev_child[i], path);
+
+	return (match);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+	vdev_t *match = NULL;
+	int i;
+
+	/* Only vdevs without children are compared against 'guid'. */
+	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
+		return (vd);
+
+	/* Otherwise search each subtree, stopping at the first match. */
+	for (i = 0; match == NULL && i < vd->vdev_children; i++)
+		match = vdev_lookup_by_guid(vd->vdev_child[i], guid);
+
+	return (match);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+	uint64_t slot = cvd->vdev_id;
+	size_t old_bytes, new_bytes;
+	vdev_t **grown;
+	vdev_t *anc;
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+	ASSERT(cvd->vdev_parent == NULL);
+
+	/* With a NULL parent there is nothing to attach to. */
+	cvd->vdev_parent = pvd;
+	if (pvd == NULL)
+		return;
+
+	ASSERT(slot >= pvd->vdev_children || pvd->vdev_child[slot] == NULL);
+
+	/* The child array is always reallocated, growing only if needed. */
+	old_bytes = pvd->vdev_children * sizeof (vdev_t *);
+	pvd->vdev_children = MAX(pvd->vdev_children, slot + 1);
+	new_bytes = pvd->vdev_children * sizeof (vdev_t *);
+
+	grown = kmem_zalloc(new_bytes, KM_SLEEP);
+	if (pvd->vdev_child != NULL) {
+		bcopy(pvd->vdev_child, grown, old_bytes);
+		kmem_free(pvd->vdev_child, old_bytes);
+	}
+	grown[slot] = cvd;
+	pvd->vdev_child = grown;
+
+	/* Inherit the parent's top-level vdev, or become one ourselves. */
+	cvd->vdev_top = (pvd->vdev_top != NULL) ? pvd->vdev_top : cvd;
+	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+	/* Fold the child's guid sum into every ancestor's running total. */
+	for (anc = pvd; anc != NULL; anc = anc->vdev_parent)
+		anc->vdev_guid_sum += cvd->vdev_guid_sum;
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+	uint_t slot = cvd->vdev_id;
+	vdev_t *anc;
+	int i;
+
+	ASSERT(cvd->vdev_parent == pvd);
+
+	if (pvd == NULL)
+		return;
+
+	ASSERT(slot < pvd->vdev_children);
+	ASSERT(pvd->vdev_child[slot] == cvd);
+
+	/* Unlink in both directions; this leaves a hole in the array. */
+	cvd->vdev_parent = NULL;
+	pvd->vdev_child[slot] = NULL;
+
+	/* Release the child array once every slot in it is empty. */
+	i = 0;
+	while (i < pvd->vdev_children && pvd->vdev_child[i] == NULL)
+		i++;
+	if (i == pvd->vdev_children) {
+		kmem_free(pvd->vdev_child, i * sizeof (vdev_t *));
+		pvd->vdev_child = NULL;
+		pvd->vdev_children = 0;
+	}
+
+	/* Subtract the child's guid sum from every ancestor's total. */
+	for (anc = pvd; anc != NULL; anc = anc->vdev_parent)
+		anc->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+	int total = pvd->vdev_children;
+	int i, live = 0;
+	vdev_t **packed;
+
+	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
+
+	/* Count the survivors, then pack and renumber them in order. */
+	for (i = 0; i < total; i++)
+		live += (pvd->vdev_child[i] != NULL);
+	packed = kmem_alloc(live * sizeof (vdev_t *), KM_SLEEP);
+
+	for (i = live = 0; i < total; i++) {
+		if (pvd->vdev_child[i] == NULL)
+			continue;
+		packed[live] = pvd->vdev_child[i];
+		packed[live]->vdev_id = live;
+		live++;
+	}
+
+	kmem_free(pvd->vdev_child, total * sizeof (vdev_t *));
+	pvd->vdev_children = live;
+	pvd->vdev_child = packed;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+static vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+	vdev_t *vp;
+
+	/* A guid of 0 requests a random one; redraw until it is nonzero. */
+	while (guid == 0)
+		guid = spa_get_random(-1ULL);
+	vp = kmem_zalloc(sizeof (*vp), KM_SLEEP);
+
+	vp->vdev_spa = spa;
+	vp->vdev_ops = ops;
+	vp->vdev_id = id;
+	vp->vdev_guid = guid;
+	vp->vdev_guid_sum = guid;
+	vp->vdev_state = VDEV_STATE_CLOSED;
+
+	mutex_init(&vp->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vp->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vp->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vp->vdev_io_cv, NULL, CV_DEFAULT, NULL);
+	list_create(&vp->vdev_io_pending, sizeof (zio_t),
+	    offsetof(zio_t, io_pending));
+	space_map_create(&vp->vdev_dtl_map, 0, -1ULL, 0, &vp->vdev_dtl_lock);
+	space_map_create(&vp->vdev_dtl_scrub, 0, -1ULL, 0, &vp->vdev_dtl_lock);
+	txg_list_create(&vp->vdev_ms_list,
+	    offsetof(struct metaslab, ms_txg_node));
+	txg_list_create(&vp->vdev_dtl_list,
+	    offsetof(struct vdev, vdev_dtl_node));
+	vp->vdev_stat.vs_timestamp = gethrtime();
+
+	return (vp);
+}
+
+/*
+ * Free a vdev_t that has been removed from service.
+ */
+static void
+vdev_free_common(vdev_t *vd)
+{
+	if (vd->vdev_devid != NULL)
+		spa_strfree(vd->vdev_devid);
+	if (vd->vdev_path != NULL)
+		spa_strfree(vd->vdev_path);
+
+	txg_list_destroy(&vd->vdev_dtl_list);
+	txg_list_destroy(&vd->vdev_ms_list);
+	mutex_enter(&vd->vdev_dtl_lock);	/* the lock both DTL maps use */
+	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_scrub);
+	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+	space_map_destroy(&vd->vdev_dtl_map);
+	mutex_exit(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dtl_lock);
+	mutex_destroy(&vd->vdev_dirty_lock);
+	cv_destroy(&vd->vdev_io_cv);
+	list_destroy(&vd->vdev_io_pending);
+	mutex_destroy(&vd->vdev_io_lock);
+
+	kmem_free(vd, sizeof (*vd));
+}
+
+/*
+ * Allocate a new vdev.  The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+vdev_t *
+vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
+{
+	vdev_ops_t *ops;
+	char *type;
+	uint64_t guid = 0;
+	vdev_t *vd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (NULL);		/* config names no vdev type */
+
+	if ((ops = vdev_getops(type)) == NULL)
+		return (NULL);		/* type not in vdev_ops_table */
+
+	/*
+	 * If this is a load, get the vdev guid from the nvlist.
+	 * Otherwise, vdev_alloc_common() will generate one for us.
+	 */
+	if (alloctype == VDEV_ALLOC_LOAD) {
+		uint64_t label_id;
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+		    label_id != id)
+			return (NULL);	/* id absent or not the one expected */
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+			return (NULL);	/* a loaded vdev must carry a guid */
+	}
+
+	vd = vdev_alloc_common(spa, id, guid, ops);
+	/* Path and devid are optional; keep our own spa_strdup() copies. */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+		vd->vdev_path = spa_strdup(vd->vdev_path);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+		vd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+	/*
+	 * If we're a top-level vdev, try to load the allocation parameters.
+	 */
+	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    &vd->vdev_ms_array);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    &vd->vdev_ms_shift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    &vd->vdev_ashift);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    &vd->vdev_asize);
+	}
+
+	/*
+	 * If we're a leaf vdev, try to load the DTL object.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+		    &vd->vdev_dtl.smo_object);
+	}
+
+	/*
+	 * Add ourselves to the parent's list of children.
+	 */
+	vdev_add_child(parent, vd);
+
+	return (vd);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+	int c;
+
+	/*
+	 * vdev_free() implies closing the vdev first.  This is simpler than
+	 * trying to ensure complicated semantics for all callers.
+	 */
+	vdev_close(vd);
+
+	/*
+	 * It's possible to free a vdev that's been added to the dirty
+	 * list when in the middle of spa_vdev_add().  Handle that case
+	 * correctly here.
+	 */
+	if (vd->vdev_is_dirty)
+		vdev_config_clean(vd);
+
+	/*
+	 * Free all children; each unlinks itself via vdev_remove_child().
+	 */
+	for (c = 0; c < vd->vdev_children; c++)
+		vdev_free(vd->vdev_child[c]);
+
+	ASSERT(vd->vdev_child == NULL);	/* array freed with last child */
+	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+	/*
+	 * Discard allocation state.
+	 */
+	if (vd == vd->vdev_top)
+		vdev_metaslab_fini(vd);
+
+	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
+	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+
+	/*
+	 * Remove this vdev from its parent's child list.
+	 */
+	vdev_remove_child(vd->vdev_parent, vd);
+
+	ASSERT(vd->vdev_parent == NULL);
+
+	vdev_free_common(vd);
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+	spa_t *spa = svd->vdev_spa;
+	metaslab_t *msp;
+	vdev_t *vd;
+	int t;
+
+	ASSERT(tvd == tvd->vdev_top);
+	/* Move the metaslab parameters across, zeroing the source's copy. */
+	tvd->vdev_ms_array = svd->vdev_ms_array;
+	tvd->vdev_ms_shift = svd->vdev_ms_shift;
+	tvd->vdev_ms_count = svd->vdev_ms_count;
+
+	svd->vdev_ms_array = 0;
+	svd->vdev_ms_shift = 0;
+	svd->vdev_ms_count = 0;
+	/* Move the metaslab group and arrays; the group now points at tvd. */
+	tvd->vdev_mg = svd->vdev_mg;
+	tvd->vdev_mg->mg_vd = tvd;
+	tvd->vdev_ms = svd->vdev_ms;
+	tvd->vdev_smo = svd->vdev_smo;
+
+	svd->vdev_mg = NULL;
+	svd->vdev_ms = NULL;
+	svd->vdev_smo = NULL;
+
+	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+
+	svd->vdev_stat.vs_alloc = 0;
+	svd->vdev_stat.vs_space = 0;
+	/* Move the per-txg lists and dirty flags, one txg slot at a time. */
+	for (t = 0; t < TXG_SIZE; t++) {
+		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
+		svd->vdev_dirty[t] = 0;
+	}
+	/* A config-dirty source passes that status on to the target. */
+	if (svd->vdev_is_dirty) {
+		vdev_config_clean(svd);
+		vdev_config_dirty(tvd);
+	}
+
+	ASSERT(svd->vdev_io_retry == NULL);
+	ASSERT(list_is_empty(&svd->vdev_io_pending));
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+	int i;
+
+	/* A NULL vd (for instance an empty child slot) is simply skipped. */
+	if (vd != NULL) {
+		vd->vdev_top = tvd;
+
+		for (i = 0; i < vd->vdev_children; i++)
+			vdev_top_update(tvd, vd->vdev_child[i]);
+	}
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+	spa_t *spa = cvd->vdev_spa;
+	vdev_t *pvd = cvd->vdev_parent;
+	vdev_t *mvd;
+
+	ASSERT(spa_config_held(spa, RW_WRITER));
+
+	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);	/* new guid */
+	vdev_remove_child(pvd, cvd);
+	vdev_add_child(pvd, mvd);		/* mvd takes cvd's old slot */
+	cvd->vdev_id = mvd->vdev_children;
+	vdev_add_child(mvd, cvd);		/* cvd becomes mvd's child */
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+	mvd->vdev_asize = cvd->vdev_asize;
+	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_state = cvd->vdev_state;
+	/* A new top-level vdev takes over cvd's top-level state. */
+	if (mvd == mvd->vdev_top)
+		vdev_top_transfer(cvd, mvd);
+
+	return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+	vdev_t *mvd = cvd->vdev_parent;	/* the vdev being removed */
+	vdev_t *pvd = mvd->vdev_parent;	/* cvd's new parent */
+
+	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
+
+	ASSERT(mvd->vdev_children == 1);
+	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+	    mvd->vdev_ops == &vdev_replacing_ops);
+
+	vdev_remove_child(mvd, cvd);
+	vdev_remove_child(pvd, mvd);
+	cvd->vdev_id = mvd->vdev_id;
+	vdev_add_child(pvd, cvd);		/* cvd takes mvd's old slot */
+	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+	/* If cvd is now top-level, it takes over mvd's top-level state. */
+	if (cvd == cvd->vdev_top)
+		vdev_top_transfer(mvd, cvd);
+
+	ASSERT(mvd->vdev_children == 0);
+	vdev_free(mvd);
+}
+
+void
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	metaslab_class_t *mc = spa_metaslab_class_select(spa);
+	uint64_t c;
+	uint64_t oldc = vd->vdev_ms_count;
+	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+	space_map_obj_t *smo = vd->vdev_smo;
+	metaslab_t **mspp = vd->vdev_ms;
+
+	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
+	/* The metaslab count is only ever allowed to grow. */
+	ASSERT(oldc <= newc);
+	/* Allocate arrays for the new count; old entries are copied below. */
+	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
+	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+	vd->vdev_ms_count = newc;
+
+	if (vd->vdev_mg == NULL) {
+		if (txg == 0) {	/* txg 0 is what vdev_load() passes */
+			dmu_buf_t *db;
+			uint64_t *ms_array;
+
+			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
+			    KM_SLEEP);
+
+			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
+			    0, newc * sizeof (uint64_t), ms_array);
+
+			for (c = 0; c < newc; c++) {
+				if (ms_array[c] == 0)
+					continue;
+				db = dmu_bonus_hold(spa->spa_meta_objset,
+				    ms_array[c]);
+				dmu_buf_read(db);
+				ASSERT3U(db->db_size, ==, sizeof (*smo));
+				bcopy(db->db_data, &vd->vdev_smo[c],
+				    db->db_size);
+				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
+				    ms_array[c]);
+				dmu_buf_rele(db);
+			}
+			kmem_free(ms_array, newc * sizeof (uint64_t));
+		}
+		vd->vdev_mg = metaslab_group_create(mc, vd);
+	}
+	/* Carry existing metaslabs over, repointing each at its new smo. */
+	for (c = 0; c < oldc; c++) {
+		vd->vdev_smo[c] = smo[c];
+		vd->vdev_ms[c] = mspp[c];
+		mspp[c]->ms_smo = &vd->vdev_smo[c];
+	}
+	/* Create metaslabs for the newly added range [oldc, newc). */
+	for (c = oldc; c < newc; c++)
+		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
+		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+	if (oldc != 0) {
+		kmem_free(smo, oldc * sizeof (*smo));
+		kmem_free(mspp, oldc * sizeof (*mspp));
+	}
+
+}
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+	uint64_t nms = vd->vdev_ms_count;
+	uint64_t i;
+
+	/* Both arrays are sized by vdev_ms_count, which is left untouched. */
+	if (vd->vdev_ms != NULL) {
+		for (i = 0; i < nms; i++)
+			metaslab_fini(vd->vdev_ms[i]);
+		kmem_free(vd->vdev_ms, nms * sizeof (metaslab_t *));
+		vd->vdev_ms = NULL;
+	}
+	if (vd->vdev_smo != NULL) {
+		kmem_free(vd->vdev_smo, nms * sizeof (space_map_obj_t));
+		vd->vdev_smo = NULL;
+	}
+}
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+	int error;
+	vdev_knob_t *vk;
+	int c;
+	uint64_t osize = 0;
+	uint64_t asize, psize;
+	uint64_t ashift = -1ULL;
+
+	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+	    vd->vdev_state == VDEV_STATE_OFFLINE);
+
+	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
+		vd->vdev_fault_arg >>= 1;
+	else
+		vd->vdev_fault_mode = VDEV_FAULT_NONE;
+
+	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+	/* Reset every knob to its default, clamped to [vk_min, vk_max]. */
+	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
+		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
+
+		*valp = vk->vk_default;
+		*valp = MAX(*valp, vk->vk_min);
+		*valp = MIN(*valp, vk->vk_max);
+	}
+	/* Leaf vdevs get an I/O cache and queue (undone by vdev_close()). */
+	if (vd->vdev_ops->vdev_op_leaf) {
+		vdev_cache_init(vd);
+		vdev_queue_init(vd);
+		vd->vdev_cache_active = B_TRUE;
+	}
+
+	if (vd->vdev_offline) {
+		ASSERT(vd->vdev_children == 0);
+		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
+		vd->vdev_state = VDEV_STATE_OFFLINE;
+		return (ENXIO);
+	}
+
+	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+
+	dprintf("%s = %d, osize %llu, state = %d\n",
+	    vdev_description(vd), error, osize, vd->vdev_state);
+
+	if (error) {
+		dprintf("%s in %s failed to open, error %d, aux %d\n",
+		    vdev_description(vd),
+		    vdev_description(vd->vdev_parent),
+		    error,
+		    vd->vdev_stat.vs_aux);
+
+		vd->vdev_state = VDEV_STATE_CANT_OPEN;
+		return (error);
+	}
+
+	vd->vdev_state = VDEV_STATE_HEALTHY;
+	/* Any child that is not HEALTHY leaves this vdev DEGRADED. */
+	for (c = 0; c < vd->vdev_children; c++)
+		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
+			vd->vdev_state = VDEV_STATE_DEGRADED;
+
+	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+
+	if (vd->vdev_children == 0) {
+		if (osize < SPA_MINDEVSIZE) {
+			vd->vdev_state = VDEV_STATE_CANT_OPEN;
+			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+			return (EOVERFLOW);
+		}
+		psize = osize;
+		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+	} else {
+		if (osize < SPA_MINDEVSIZE -
+		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+			vd->vdev_state = VDEV_STATE_CANT_OPEN;
+			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+			return (EOVERFLOW);
+		}
+		psize = 0;
+		asize = osize;
+	}
+
+	vd->vdev_psize = psize;
+
+	if (vd->vdev_asize == 0) {
+		/*
+		 * This is the first-ever open, so use the computed values.
+		 */
+		vd->vdev_asize = asize;
+		vd->vdev_ashift = ashift;
+	} else {
+		/*
+		 * Make sure the alignment requirement hasn't increased.
+		 */
+		if (ashift > vd->vdev_ashift) {
+			dprintf("%s: ashift grew\n", vdev_description(vd));
+			vd->vdev_state = VDEV_STATE_CANT_OPEN;
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+
+		/*
+		 * Make sure the device hasn't shrunk.
+		 */
+		if (asize < vd->vdev_asize) {
+			dprintf("%s: device shrank\n", vdev_description(vd));
+			vd->vdev_state = VDEV_STATE_CANT_OPEN;
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+
+		/*
+		 * If all children are healthy and the asize has increased,
+		 * then we've experienced dynamic LUN growth.
+		 */
+		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+		    asize > vd->vdev_asize) {
+			dprintf("%s: device grew\n", vdev_description(vd));
+			vd->vdev_asize = asize;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
+
+	vd->vdev_ops->vdev_op_close(vd);
+
+	/* The cache and queue exist only if vdev_open() activated them. */
+	if (vd->vdev_cache_active) {
+		vdev_cache_fini(vd);
+		vdev_queue_fini(vd);
+		vd->vdev_cache_active = B_FALSE;
+	}
+
+	/* An offlined vdev stays OFFLINE rather than becoming CLOSED. */
+	vd->vdev_state = vd->vdev_offline ?
+	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
+}
+
+void
+vdev_reopen(vdev_t *vd, zio_t **rq)
+{
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	int c;
+
+	if (vd == rvd) {	/* root: reopen each of its children instead */
+		ASSERT(rq == NULL);
+		for (c = 0; c < rvd->vdev_children; c++)
+			vdev_reopen(rvd->vdev_child[c], NULL);
+		return;
+	}
+
+	/* only valid for top-level vdevs */
+	ASSERT3P(vd, ==, vd->vdev_top);
+
+	/*
+	 * vdev_state can change when spa_config_lock is held as writer,
+	 * or when it's held as reader and we're doing a vdev_reopen().
+	 * To handle the latter case, we grab rvd's io_lock to serialize
+	 * reopens.  This ensures that there's never more than one vdev
+	 * state changer active at a time.
+	 */
+	mutex_enter(&rvd->vdev_io_lock);
+	/* Let pending I/O drain, then cycle the vdev through close/open. */
+	mutex_enter(&vd->vdev_io_lock);
+	while (list_head(&vd->vdev_io_pending) != NULL)
+		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
+	vdev_close(vd);
+	(void) vdev_open(vd);	/* the outcome is left in vdev_state */
+	if (rq != NULL) {	/* hand the retry list to the caller */
+		*rq = vd->vdev_io_retry;
+		vd->vdev_io_retry = NULL;
+	}
+	mutex_exit(&vd->vdev_io_lock);
+
+	/*
+	 * Reassess root vdev's health.
+	 */
+	rvd->vdev_state = VDEV_STATE_HEALTHY;
+	for (c = 0; c < rvd->vdev_children; c++) {
+		uint64_t state = rvd->vdev_child[c]->vdev_state;
+		rvd->vdev_state = MIN(rvd->vdev_state, state);
+	}
+
+	mutex_exit(&rvd->vdev_io_lock);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg)
+{
+	int error;
+
+	/*
+	 * Normally, partial opens (e.g. of a mirror) are allowed.  For a
+	 * create, however, anything short of a fully HEALTHY vdev counts
+	 * as failure; ENXIO is reported when the open itself returned 0.
+	 */
+	error = vdev_open(vd);
+
+	if (error == 0 && vd->vdev_state != VDEV_STATE_HEALTHY)
+		error = ENXIO;
+
+	/*
+	 * Recursively initialize all labels, but only when the open
+	 * above fully succeeded.
+	 */
+	if (error == 0)
+		error = vdev_label_init(vd, txg);
+
+	if (error != 0)
+		vdev_close(vd);
+
+	return (error);
+}
+
+/*
+ * This is the latter half of vdev_create().  It is distinct because it
+ * involves initiating transactions in order to do metaslab creation.
+ * For creation, we want to try to create all vdevs at once and then undo it
+ * if anything fails; this is much harder if we have pending transactions.
+ */
+void
+vdev_init(vdev_t *vd, uint64_t txg)
+{
+	uint64_t shift;
+
+	/*
+	 * Size metaslabs so that there are roughly 200 per vdev, but
+	 * never use a shift smaller than SPA_MAXBLOCKSHIFT.
+	 */
+	shift = highbit(vd->vdev_asize / 200);
+	vd->vdev_ms_shift = MAX(shift, SPA_MAXBLOCKSHIFT);
+
+	vdev_metaslab_init(vd, txg);
+}
+
+void
+vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
+{
+	vdev_t *tp = vd->vdev_top;	/* flags live on the top vdev */
+	uint8_t *bits = &tp->vdev_dirty[txg & TXG_MASK];
+
+	mutex_enter(&tp->vdev_dirty_lock);
+	if ((*bits & flags) != flags) {
+		*bits |= flags;
+		(void) txg_list_add(&tp->vdev_spa->spa_vdev_txg_list, tp, txg);
+	}
+	mutex_exit(&tp->vdev_dirty_lock);
+}
+
+void
+vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+	mutex_enter(sm->sm_lock);
+	if (!space_map_contains(sm, txg, size))	/* add only if absent */
+		space_map_add(sm, txg, size);
+	mutex_exit(sm->sm_lock);
+}
+
+int
+vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+{
+	int found = 0;
+
+	/*
+	 * Peek at sm_space without the lock first: in the common case
+	 * there are no dirty time segments at all, and then the answer
+	 * is "no" without any locking.
+	 */
+	if (sm->sm_space != 0) {
+		mutex_enter(sm->sm_lock);
+		found = space_map_contains(sm, txg, size);
+		mutex_exit(sm->sm_lock);
+	}
+
+	return (found);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+{
+	int c;
+
+	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
+
+	if (vd->vdev_children == 0) {
+		mutex_enter(&vd->vdev_dtl_lock);
+		/*
+		 * We've successfully scrubbed everything up to scrub_txg.
+		 * Therefore, excise all old DTLs up to that point, then
+		 * fold in the DTLs for everything we couldn't scrub.
+		 */
+		if (scrub_txg != 0) {
+			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
+			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+		}
+		if (scrub_done)
+			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+		mutex_exit(&vd->vdev_dtl_lock);
+		if (txg != 0) {	/* queue this vdev's DTL for syncing in txg */
+			vdev_t *tvd = vd->vdev_top;
+			vdev_dirty(tvd, VDD_DTL, txg);
+			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+		}
+		return;
+	}
+	/* Has children: rebuild both maps as the union of the children's. */
+	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
+	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
+		mutex_enter(&vd->vdev_dtl_lock);
+		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
+		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
+		mutex_exit(&vd->vdev_dtl_lock);
+	}
+}
+
+static int
+vdev_dtl_load(vdev_t *vd)
+{
+	space_map_obj_t *hdr = &vd->vdev_dtl;
+	spa_t *spa = vd->vdev_spa;
+	dmu_buf_t *bonus;
+	int err;
+
+	ASSERT(vd->vdev_children == 0);
+	/* No DTL object recorded for this vdev: nothing to load. */
+	if (hdr->smo_object == 0)
+		return (0);
+	/* Refresh the in-core header from the object's bonus buffer. */
+	bonus = dmu_bonus_hold(spa->spa_meta_objset, hdr->smo_object);
+	dmu_buf_read(bonus);
+	ASSERT3U(bonus->db_size, ==, sizeof (*hdr));
+	bcopy(bonus->db_data, hdr, bonus->db_size);
+	dmu_buf_rele(bonus);
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	err = space_map_load(&vd->vdev_dtl_map, hdr, SM_ALLOC,
+	    spa->spa_meta_objset, hdr->smo_objsize, hdr->smo_alloc);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	return (err);
+}
+
+void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	space_map_obj_t *smo = &vd->vdev_dtl;
+	space_map_t *sm = &vd->vdev_dtl_map;
+	space_map_t smsync;
+	kmutex_t smlock;
+	avl_tree_t *t = &sm->sm_root;
+	space_seg_t *ss;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+
+	dprintf("%s in txg %llu pass %d\n",
+	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+
+	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+	/* A detached vdev has no further use for a DTL object: free it. */
+	if (vd->vdev_detached) {
+		if (smo->smo_object != 0) {
+			int err = dmu_object_free(spa->spa_meta_objset,
+			    smo->smo_object, tx);
+			ASSERT3U(err, ==, 0);
+			smo->smo_object = 0;
+		}
+		dmu_tx_commit(tx);
+		return;
+	}
+
+	if (smo->smo_object == 0) {
+		ASSERT(smo->smo_objsize == 0);
+		ASSERT(smo->smo_alloc == 0);
+		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
+		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
+		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
+		ASSERT(smo->smo_object != 0);
+		vdev_config_dirty(vd->vdev_top);
+	}
+	/* Discard the old on-disk contents; the map is rewritten below. */
+	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
+	    0, smo->smo_objsize, tx);
+
+	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+
+	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
+	    &smlock);
+
+	mutex_enter(&smlock);
+	/* Copy every segment of the in-core DTL into the private map. */
+	mutex_enter(&vd->vdev_dtl_lock);
+	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
+		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	smo->smo_objsize = 0;
+	smo->smo_alloc = smsync.sm_space;
+
+	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
+	space_map_destroy(&smsync);
+
+	mutex_exit(&smlock);
+	mutex_destroy(&smlock);
+	/* Store the updated header in the object's bonus buffer. */
+	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+	dmu_buf_will_dirty(db, tx);
+	ASSERT3U(db->db_size, ==, sizeof (*smo));
+	bcopy(smo, db->db_data, db->db_size);
+	dmu_buf_rele(db);
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Load the tree rooted at 'vd': verify leaf labels, initialize metaslabs
+ * on a top-level vdev, and load leaf DTLs.  Most failures just mark the
+ * vdev CANT_OPEN and return 0; EBADF means the pool is not active.
+ */
+int
+vdev_load(vdev_t *vd, int import)
+{
+	spa_t *spa = vd->vdev_spa;
+	int c, error;
+	nvlist_t *label;
+	uint64_t guid = 0, state;	/* guid is printed even if lookup fails */
+
+	dprintf("loading %s\n", vdev_description(vd));
+
+	/*
+	 * Recursively load all children.
+	 */
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
+			return (error);
+
+	/*
+	 * If this is a leaf vdev, make sure it agrees with its disk labels.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if (vdev_is_dead(vd))
+			return (0);
+
+		/*
+		 * XXX state transitions don't propagate to parent here.
+		 * Also, merely setting the state isn't sufficient because
+		 * it's not persistent; a vdev_reopen() would make us
+		 * forget all about it.
+		 */
+		if ((label = vdev_label_read_config(vd)) == NULL) {
+			dprintf("can't load label config\n");
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			return (0);
+		}
+
+		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+		    &guid) != 0 || guid != spa_guid(spa)) {
+			dprintf("bad or missing pool GUID (%llu)\n", guid);
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			nvlist_free(label);
+			return (0);
+		}
+
+		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
+		    guid != vd->vdev_guid) {
+			dprintf("bad or missing vdev guid (%llu != %llu)\n",
+			    guid, vd->vdev_guid);
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			nvlist_free(label);
+			return (0);
+		}
+
+		/*
+		 * If we find a vdev with a matching pool guid and vdev guid,
+		 * but the pool state is not active, it indicates that the user
+		 * exported or destroyed the pool without affecting the config
+		 * cache (if / was mounted readonly, for example).  In this
+		 * case, immediately return EBADF so the caller can remove it
+		 * from the config.
+		 */
+		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+		    &state)) {
+			dprintf("missing pool state\n");
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			nvlist_free(label);
+			return (0);
+		}
+
+		if (state != POOL_STATE_ACTIVE &&
+		    (!import || state != POOL_STATE_EXPORTED)) {
+			dprintf("pool state not active (%llu)\n", state);
+			nvlist_free(label);
+			return (EBADF);
+		}
+
+		nvlist_free(label);
+	}
+
+	/*
+	 * If this is a top-level vdev, make sure its allocation parameters
+	 * exist and initialize its metaslabs.
+	 */
+	if (vd == vd->vdev_top) {
+		if (vd->vdev_ms_array == 0 ||
+		    vd->vdev_ms_shift == 0 ||
+		    vd->vdev_ashift == 0 ||
+		    vd->vdev_asize == 0) {
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			return (0);
+		}
+
+		vdev_metaslab_init(vd, 0);
+	}
+
+	/* If this is a leaf vdev, load its DTL. */
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if ((error = vdev_dtl_load(vd)) != 0) {
+			dprintf("can't load DTL for %s, error %d\n",
+			    vdev_description(vd), error);
+			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			return (0);
+		}
+	}
+
+	return (0);
+}
+
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+	metaslab_t *msp;
+	/* Finish each metaslab queued on the TXG_CLEAN(txg) list. */
+	dprintf("%s txg %llu\n", vdev_description(vd), (u_longlong_t)txg);
+	while ((msp = txg_list_remove(&vd->vdev_ms_list,
+	    TXG_CLEAN(txg))) != NULL)
+		metaslab_sync_done(msp, txg);
+}
+/* Called by vdev_sync() for a top-level vdev that has VDD_ADD set. */
+void
+vdev_add_sync(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+	ASSERT(vd == vd->vdev_top);
+	/* Allocate vdev_ms_array (a DMU_OT_OBJECT_ARRAY object) if unset. */
+	if (vd->vdev_ms_array == 0)
+		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+
+	ASSERT(vd->vdev_ms_array != 0);
+
+	vdev_config_dirty(vd);
+
+	dmu_tx_commit(tx);
+}
+/* Sync vd for txg: handle VDD_ADD, then queued metaslabs and DTLs. */
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *lvd;
+	metaslab_t *msp;
+	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
+	uint8_t dirty;
+	/* Read and clear the flags in one critical section. */
+	mutex_enter(&vd->vdev_dirty_lock);
+	dirty = *dirtyp;
+	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
+	mutex_exit(&vd->vdev_dirty_lock);
+
+	dprintf("%s txg %llu pass %d\n",
+	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
+	if (dirty & VDD_ADD)
+		vdev_add_sync(vd, txg);
+
+	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
+		metaslab_sync(msp, txg);
+
+	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+		vdev_dtl_sync(lvd, txg);
+
+	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+}
+/* Return this vdev type's vdev_op_asize() result for psize. */
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+	return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+/* Hand the zio to the vdev_op_io_start routine of its vdev's type. */
+void
+vdev_io_start(zio_t *zio)
+{
+	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
+}
+/* Hand the zio to the vdev_op_io_done routine of its vdev's type. */
+void
+vdev_io_done(zio_t *zio)
+{
+	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
+}
+
+const char *
+vdev_description(vdev_t *vd)
+{
+	/* Prefer the path, then spa_name() if parentless, then the type. */
+	if (vd == NULL || vd->vdev_ops == NULL) {
+		return ("<unknown>");
+	} else if (vd->vdev_path != NULL) {
+		return (vd->vdev_path);
+	} else if (vd->vdev_parent == NULL) {
+		return (spa_name(vd->vdev_spa));
+	} else {
+		return (vd->vdev_ops->vdev_op_type);
+	}
+}
+/* Bring the vdev at `path` online; returns ENODEV if it isn't found. */
+int
+vdev_online(spa_t *spa, const char *path)
+{
+	vdev_t *vd;
+
+	spa_config_enter(spa, RW_WRITER);
+	/* Look the device up by path, starting from the root vdev. */
+	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+		spa_config_exit(spa);
+		return (ENODEV);
+	}
+
+	dprintf("ONLINE: %s\n", vdev_description(vd));
+
+	vd->vdev_offline = B_FALSE;
+
+	/*
+	 * Clear the error counts.  The idea is that you expect to see all
+	 * zeroes when everything is working, so if you've just onlined a
+	 * device, you don't want to keep hearing about errors from before.
+	 */
+	vd->vdev_stat.vs_read_errors = 0;
+	vd->vdev_stat.vs_write_errors = 0;
+	vd->vdev_stat.vs_checksum_errors = 0;
+	/* Reopen the whole top-level vdev that contains this device. */
+	vdev_reopen(vd->vdev_top, NULL);
+
+	spa_config_exit(spa);
+	/* With the config lock dropped, request a POOL_SCRUB_RESILVER. */
+	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+	return (0);
+}
+/* Offline the vdev at `path`: 0, ENODEV (not found), or EBUSY (refused). */
+int
+vdev_offline(spa_t *spa, const char *path)
+{
+	vdev_t *vd;
+
+	spa_config_enter(spa, RW_WRITER);
+	/* Look the device up by path, starting from the root vdev. */
+	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+		spa_config_exit(spa);
+		return (ENODEV);
+	}
+
+	dprintf("OFFLINE: %s\n", vdev_description(vd));
+
+	/*
+	 * If this device's top-level vdev has a non-empty DTL,
+	 * don't allow the device to be offlined.
+	 *
+	 * XXX -- we should make this more precise by allowing the offline
+	 * as long as the remaining devices don't have any DTL holes.
+	 */
+	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
+		spa_config_exit(spa);
+		return (EBUSY);
+	}
+
+	/*
+	 * Set this device to offline state and reopen its top-level vdev.
+	 * If this action results in the top-level vdev becoming unusable,
+	 * undo it and fail the request.
+	 */
+	vd->vdev_offline = B_TRUE;
+	vdev_reopen(vd->vdev_top, NULL);
+	if (vdev_is_dead(vd->vdev_top)) {
+		vd->vdev_offline = B_FALSE;
+		vdev_reopen(vd->vdev_top, NULL);
+		spa_config_exit(spa);
+		return (EBUSY);
+	}
+
+	spa_config_exit(spa);
+
+	return (0);
+}
+/* Set the fault mode, mask, and arg of the vdev at `path` (else ENODEV). */
+int
+vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
+{
+	vdev_t *vd;
+
+	spa_config_enter(spa, RW_WRITER);
+
+	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
+		spa_config_exit(spa);
+		return (ENODEV);
+	}
+	/* These three fields are what vdev_error_inject() consults. */
+	vd->vdev_fault_mode = mode;
+	vd->vdev_fault_mask = mask;
+	vd->vdev_fault_arg = arg;
+
+	spa_config_exit(spa);
+
+	return (0);
+}
+/* Nonzero when the vdev's state is VDEV_STATE_CANT_OPEN or lower. */
+int
+vdev_is_dead(vdev_t *vd)
+{
+	return (!(vd->vdev_state > VDEV_STATE_CANT_OPEN));
+}
+/* Decide whether to fail this zio artificially; returns 0 or EIO. */
+int
+vdev_error_inject(vdev_t *vd, zio_t *zio)
+{
+	int error = 0;
+
+	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
+		return (0);
+
+	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
+		return (0);
+
+	switch (vd->vdev_fault_mode) {
+	case VDEV_FAULT_RANDOM:	/* fails when spa_get_random(arg) == 0 */
+		if (spa_get_random(vd->vdev_fault_arg) == 0)
+			error = EIO;
+		break;
+
+	case VDEV_FAULT_COUNT:	/* fails; disarms once arg reaches <= 0 */
+		if ((int64_t)--vd->vdev_fault_arg <= 0)
+			vd->vdev_fault_mode = VDEV_FAULT_NONE;
+		error = EIO;
+		break;
+	}
+
+	if (error != 0) {
+		dprintf("returning %d for type %d on %s state %d offset %llx\n",
+		    error, zio->io_type, vdev_description(vd),
+		    vd->vdev_state, (u_longlong_t)zio->io_offset);
+	}
+
+	return (error);
+}
+
+/*
+ * Copy vd's stats into *vs; for the root vdev, add in each child's counts.
+ */
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	int c, t;
+	/* Snapshot this vdev's own counters under its stat lock. */
+	mutex_enter(&vd->vdev_stat_lock);
+	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+	vs->vs_state = vd->vdev_state;
+	mutex_exit(&vd->vdev_stat_lock);
+
+	/*
+	 * For the root vdev, aggregate the counts over all top-level vdevs
+	 * (its direct children), holding each child's own stat lock.
+	 */
+	if (vd == rvd) {
+		for (c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *cvd = rvd->vdev_child[c];
+			vdev_stat_t *cvs = &cvd->vdev_stat;
+
+			mutex_enter(&cvd->vdev_stat_lock);
+			for (t = 0; t < ZIO_TYPES; t++) {
+				vs->vs_ops[t] += cvs->vs_ops[t];
+				vs->vs_bytes[t] += cvs->vs_bytes[t];
+			}
+			vs->vs_read_errors += cvs->vs_read_errors;
+			vs->vs_write_errors += cvs->vs_write_errors;
+			vs->vs_checksum_errors += cvs->vs_checksum_errors;
+			vs->vs_scrub_examined += cvs->vs_scrub_examined;
+			vs->vs_scrub_errors += cvs->vs_scrub_errors;
+			mutex_exit(&cvd->vdev_stat_lock);
+		}
+	}
+}
+/* Fold a zio's result into its vdev's stats and, on write failure, DTLs. */
+void
+vdev_stat_update(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *pvd;
+	uint64_t txg = zio->io_txg;
+	vdev_stat_t *vs = &vd->vdev_stat;
+	zio_type_t type = zio->io_type;
+	int flags = zio->io_flags;
+	/* Success: account ops/bytes and any repaired bytes, then return. */
+	if (zio->io_error == 0) {
+		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
+			mutex_enter(&vd->vdev_stat_lock);
+			vs->vs_ops[type]++;
+			vs->vs_bytes[type] += zio->io_size;
+			mutex_exit(&vd->vdev_stat_lock);
+		}
+		if ((flags & ZIO_FLAG_IO_REPAIR) &&
+		    zio->io_delegate_list == NULL) {
+			mutex_enter(&vd->vdev_stat_lock);
+			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
+				vs->vs_scrub_repaired += zio->io_size;
+			else
+				vs->vs_self_healed += zio->io_size;
+			mutex_exit(&vd->vdev_stat_lock);
+		}
+		return;
+	}
+	/* Errors on speculative I/O are not counted at all. */
+	if (flags & ZIO_FLAG_SPECULATIVE)
+		return;
+	/* Error counters are bumped only while the vdev is not dead. */
+	if (!vdev_is_dead(vd)) {
+		mutex_enter(&vd->vdev_stat_lock);
+		if (type == ZIO_TYPE_READ) {
+			if (zio->io_error == ECKSUM)
+				vs->vs_checksum_errors++;
+			else
+				vs->vs_read_errors++;
+		}
+		if (type == ZIO_TYPE_WRITE)
+			vs->vs_write_errors++;
+		mutex_exit(&vd->vdev_stat_lock);
+	}
+	/* Failed writes to a childless vdev with a nonzero txg dirty DTLs. */
+	if (type == ZIO_TYPE_WRITE) {
+		if (txg == 0 || vd->vdev_children != 0)
+			return;
+		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+		}
+		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
+			vdev_t *tvd = vd->vdev_top;
+			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
+				return;
+			vdev_dirty(tvd, VDD_DTL, txg);
+			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
+			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
+		}
+	}
+}
+
+void
+vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
+{
+	vdev_stat_t *stats = &vd->vdev_stat;
+	int i;
+
+	/* Recurse first so every descendant is updated before this vdev. */
+	for (i = 0; i < vd->vdev_children; i++)
+		vdev_scrub_stat_update(vd->vdev_child[i], type, complete);
+
+	mutex_enter(&vd->vdev_stat_lock);
+	if (type != POOL_SCRUB_NONE) {
+		/* Any other type: record it and reset the scrub counters. */
+		stats->vs_scrub_type = type;
+		stats->vs_scrub_complete = 0;
+		stats->vs_scrub_examined = 0;
+		stats->vs_scrub_repaired = 0;
+		stats->vs_scrub_errors = 0;
+		stats->vs_scrub_start = gethrestime_sec();
+		stats->vs_scrub_end = 0;
+	} else {
+		/*
+		 * Record completion and end time only, preserving the rest
+		 * so the previous scrub's results can still be reported.
+		 */
+		stats->vs_scrub_complete = complete;
+		stats->vs_scrub_end = gethrestime_sec();
+	}
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Report checksum errors that a vdev didn't realize it made.
+ * This can happen, for example, when RAID-Z combinatorial reconstruction
+ * infers that one of its components returned bad data.
+ */
+void
+vdev_checksum_error(zio_t *zio, vdev_t *vd)
+{
+	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+	    vdev_description(vd));
+	/* Speculative I/O is logged but not charged with the error. */
+	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+		return;
+	mutex_enter(&vd->vdev_stat_lock);
+	vd->vdev_stat.vs_checksum_errors += 1;
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+/* Add the deltas to vs_space/vs_alloc on this vdev and every ancestor. */
+void
+vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
+{
+	vdev_t *cur = vd;
+
+	ASSERT(vd == vd->vdev_top);
+	do {
+		mutex_enter(&cur->vdev_stat_lock);
+		cur->vdev_stat.vs_space += space_delta;
+		cur->vdev_stat.vs_alloc += alloc_delta;
+		mutex_exit(&cur->vdev_stat_lock);
+		cur = cur->vdev_parent;
+	} while (cur != NULL);
+}
+
+/*
+ * Various knobs to tune a vdev (field roles below assumed; see vdev_knob_t).
+ */
+static vdev_knob_t vdev_knob[] = {
+	{
+		"cache_size",			/* presumably: knob name */
+		"size of the read-ahead cache",	/* presumably: description */
+		0,				/* presumably: minimum */
+		1ULL << 30,			/* presumably: maximum */
+		10ULL << 20,			/* presumably: default */
+		offsetof(struct vdev, vdev_cache.vc_size)
+	},
+	{
+		"cache_bshift",
+		"log2 of cache blocksize",
+		SPA_MINBLOCKSHIFT,
+		SPA_MAXBLOCKSHIFT,
+		16,
+		offsetof(struct vdev, vdev_cache.vc_bshift)
+	},
+	{
+		"cache_max",
+		"largest block size to cache",
+		0,
+		SPA_MAXBLOCKSIZE,
+		1ULL << 14,
+		offsetof(struct vdev, vdev_cache.vc_max)
+	},
+	{
+		"min_pending",
+		"minimum pending I/Os to the disk",
+		1,
+		10000,
+		2,
+		offsetof(struct vdev, vdev_queue.vq_min_pending)
+	},
+	{
+		"max_pending",
+		"maximum pending I/Os to the disk",
+		1,
+		10000,
+		35,
+		offsetof(struct vdev, vdev_queue.vq_max_pending)
+	},
+	{
+		"agg_limit",
+		"maximum size of aggregated I/Os",
+		0,
+		SPA_MAXBLOCKSIZE,
+		SPA_MAXBLOCKSIZE,
+		offsetof(struct vdev, vdev_queue.vq_agg_limit)
+	},
+	{
+		"time_shift",
+		"deadline = pri + (lbolt >> time_shift)",
+		0,
+		63,
+		4,
+		offsetof(struct vdev, vdev_queue.vq_time_shift)
+	},
+	{
+		"ramp_rate",
+		"exponential I/O issue ramp-up rate",
+		1,
+		10000,
+		2,
+		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
+	},
+};
+
+vdev_knob_t *
+vdev_knob_next(vdev_knob_t *vk)
+{
+	size_t nknobs = sizeof (vdev_knob) / sizeof (vdev_knob[0]);
+
+	/* NULL starts the walk; stepping past the last knob ends it. */
+	if (vk == NULL)
+		return (vdev_knob);
+	vk++;
+	return (vk == vdev_knob + nknobs ? NULL : vk);
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *root = spa->spa_root_vdev;
+	int i;
+
+	if (vd != root) {
+		ASSERT(vd == vd->vdev_top);
+		if (vd->vdev_is_dirty)
+			return;
+		list_insert_head(&spa->spa_dirty_list, vd);
+		vd->vdev_is_dirty = B_TRUE;
+		return;
+	}
+	/* Root vdev: dirty each of its children in turn. */
+	for (i = 0; i < root->vdev_children; i++)
+		vdev_config_dirty(root->vdev_child[i]);
+}
+/* Take vd off the spa's dirty list and clear its dirty flag. */
+void
+vdev_config_clean(vdev_t *vd)
+{
+	ASSERT(vd->vdev_is_dirty);
+
+	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
+	vd->vdev_is_dirty = B_FALSE;
+}
+
+/*
+ * Set a vdev's state, updating any parent's state as well.
+ */
+void
+vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
+{
+	vdev_t *pvd = vd->vdev_parent;
+	int nfaulted = 0, ndegraded = 0;
+	int i;
+
+	if (vd->vdev_state == state)
+		return;
+
+	vd->vdev_state = state;
+	vd->vdev_stat.vs_aux = aux;
+
+	if (pvd == NULL)
+		return;
+
+	/* Tally the parent's children by state, then notify the parent. */
+	for (i = 0; i < pvd->vdev_children; i++) {
+		vdev_t *cvd = pvd->vdev_child[i];
+
+		if (cvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+			nfaulted++;
+		else if (cvd->vdev_state == VDEV_STATE_DEGRADED)
+			ndegraded++;
+	}
+	pvd->vdev_ops->vdev_op_state_change(pvd, nfaulted, ndegraded);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
new file mode 100644
index 000000000000..e1e7c1a36fcb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache.  When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result.  In the best case, this can turn 256 back-to-back 512-byte
+ * reads into a single 128k read followed by 255 cache hits; this reduces
+ * latency dramatically.  In the worst case, it can turn an isolated 512-byte
+ * read into a 128k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth.  A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region.  It could also
+ * take advantage of semantic information about the I/O.  And it could use
+ * something faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate.  This reserves a cache entry for the specified region.
+ *     We separate the allocate and fill operations so that multiple threads
+ *     don't generate I/O for the same cache miss.
+ *
+ * (2) Fill.  When the I/O for a cache miss completes, the fill routine
+ *     places the data in the previously allocated cache entry.
+ *
+ * (3) Read.  Read data from the cache.
+ *
+ * (4) Write.  Update cache contents after write completion.
+ *
+ * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
+ *     if the total cache size exceeds vc_size.
+ */
+
+static int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *lhs = a1;
+	const vdev_cache_entry_t *rhs = a2;
+
+	/* Three-way compare on ve_offset: yields -1, 0, or 1. */
+	if (lhs->ve_offset == rhs->ve_offset)
+		return (0);
+
+	return (lhs->ve_offset < rhs->ve_offset ? -1 : 1);
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+	const vdev_cache_entry_t *lhs = a1;
+	const vdev_cache_entry_t *rhs = a2;
+
+	/*
+	 * Order primarily by ve_lastused.  Entries that are equally old
+	 * fall back to the offset ordering, which ensures uniqueness.
+	 */
+	if (lhs->ve_lastused != rhs->ve_lastused)
+		return (lhs->ve_lastused < rhs->ve_lastused ? -1 : 1);
+
+
+	return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);		/* no fill outstanding */
+	ASSERT(ve->ve_data != NULL);
+
+	dprintf("evicting %p, off %llx, LRU %llu, age %lu, hits %u, stale %u\n",
+	    vc, ve->ve_offset, ve->ve_lastused, lbolt - ve->ve_lastused,
+	    ve->ve_hits, ve->ve_missed_update);
+	/* Unlink from both AVL trees, then free the buffer and the entry. */
+	avl_remove(&vc->vc_lastused_tree, ve);
+	avl_remove(&vc->vc_offset_tree, ve);
+	zio_buf_free(ve->ve_data, vc->vc_blocksize);
+	kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache.  At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	vdev_cache_entry_t *ve;
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	/* With vc_size == 0 no entry is ever allocated. */
+	if (vc->vc_size == 0)
+		return (NULL);
+
+	/*
+	 * If the cache already holds more than vc_size bytes, evict the
+	 * oldest entry (LRU) -- or give up if that entry is still filling.
+	 */
+	if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) >
+	    vc->vc_size) {
+		ve = avl_first(&vc->vc_lastused_tree);
+		if (ve->ve_fill_io != NULL) {
+			dprintf("can't evict in %p, still filling\n", vc);
+			return (NULL);
+		}
+		ASSERT(ve->ve_hits != 0);
+		vdev_cache_evict(vc, ve);
+	}
+	/* Build the placeholder entry and insert it into both trees. */
+	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+	ve->ve_offset = offset;
+	ve->ve_lastused = lbolt;
+	ve->ve_data = zio_buf_alloc(vc->vc_blocksize);
+
+	avl_add(&vc->vc_offset_tree, ve);
+	avl_add(&vc->vc_lastused_tree, ve);
+
+	return (ve);
+}
+/* Copy zio's bytes out of entry ve and refresh the entry's LRU position. */
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+
+	ASSERT(MUTEX_HELD(&vc->vc_lock));
+	ASSERT(ve->ve_fill_io == NULL);
+	/* ve_lastused keys the LRU tree, so re-insert when it changes. */
+	if (ve->ve_lastused != lbolt) {
+		avl_remove(&vc->vc_lastused_tree, ve);
+		ve->ve_lastused = lbolt;
+		avl_add(&vc->vc_lastused_tree, ve);
+	}
+
+	ve->ve_hits++;
+	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve = zio->io_private;
+	zio_t *dio;
+
+	ASSERT(zio->io_size == vc->vc_blocksize);
+
+	/*
+	 * Add data to the cache.
+	 */
+	mutex_enter(&vc->vc_lock);
+
+	ASSERT(ve->ve_fill_io == zio);
+	ASSERT(ve->ve_offset == zio->io_offset);
+	ASSERT(ve->ve_data == zio->io_data);
+
+	ve->ve_fill_io = NULL;
+
+	/*
+	 * Even if this cache line was invalidated by a missed write update,
+	 * any reads that were queued up before the missed update are still
+	 * valid, so we can satisfy them from this line before we evict it.
+	 */
+	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
+		vdev_cache_hit(vc, ve, dio);
+	/* Drop the line if the fill failed or a write arrived mid-fill. */
+	if (zio->io_error || ve->ve_missed_update)
+		vdev_cache_evict(vc, ve);
+
+	mutex_exit(&vc->vc_lock);
+	/* Lock dropped: give each delegate the fill's error and advance it. */
+	while ((dio = zio->io_delegate_list) != NULL) {
+		zio->io_delegate_list = dio->io_delegate_next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = zio->io_error;
+		zio_next_stage(dio);
+	}
+}
+
+/*
+ * Read via the cache.  Returns 0 if the cache took over the zio, else errno.
+ */
+int
+vdev_cache_read(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+	zio_t *fio;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+		return (EINVAL);
+
+	if (zio->io_size > vc->vc_max)
+		return (EOVERFLOW);
+
+	/*
+	 * If the I/O straddles two or more cache blocks, don't cache it.
+	 */
+	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1,
+	    vc->vc_blocksize))
+		return (EXDEV);
+
+	ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize);
+
+	mutex_enter(&vc->vc_lock);
+
+	ve_search.ve_offset = cache_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
+	/* Existing entry: stale, still being filled, or a straight hit. */
+	if (ve != NULL) {
+		if (ve->ve_missed_update) {
+			mutex_exit(&vc->vc_lock);
+			return (ESTALE);
+		}
+
+		if ((fio = ve->ve_fill_io) != NULL) {
+			zio->io_delegate_next = fio->io_delegate_list;
+			fio->io_delegate_list = zio;
+			zio_vdev_io_bypass(zio);
+			mutex_exit(&vc->vc_lock);
+			return (0);
+		}
+
+		vdev_cache_hit(vc, ve, zio);
+		zio_vdev_io_bypass(zio);
+
+		mutex_exit(&vc->vc_lock);
+		zio_next_stage(zio);
+		return (0);
+	}
+	/* Miss: reserve an entry, then issue a block-sized fill read. */
+	ve = vdev_cache_allocate(zio);
+
+	if (ve == NULL) {
+		mutex_exit(&vc->vc_lock);
+		return (ENOMEM);
+	}
+
+	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
+	    ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
+	    ZIO_PRIORITY_CACHE_FILL,
+	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vdev_cache_fill, ve);
+
+	ve->ve_fill_io = fio;
+	fio->io_delegate_list = zio;
+	zio_vdev_io_bypass(zio);
+
+	mutex_exit(&vc->vc_lock);
+	zio_nowait(fio);
+
+	return (0);
+}
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+	vdev_cache_entry_t *ve, ve_search;
+	uint64_t io_start = zio->io_offset;
+	uint64_t io_end = io_start + zio->io_size;
+	uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize);
+	uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize);
+	avl_index_t where;
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+	mutex_enter(&vc->vc_lock);
+
+	ve_search.ve_offset = min_offset;
+	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+	/* No entry at min_offset: start from the nearest one after it. */
+	if (ve == NULL)
+		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+	while (ve != NULL && ve->ve_offset < max_offset) {
+		uint64_t start = MAX(ve->ve_offset, io_start);
+		uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end);
+		/* A line still being filled is flagged stale, not patched. */
+		if (ve->ve_fill_io != NULL) {
+			ve->ve_missed_update = 1;
+		} else {
+			bcopy((char *)zio->io_data + start - io_start,
+			    ve->ve_data + start - ve->ve_offset, end - start);
+		}
+		ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+	}
+	mutex_exit(&vc->vc_lock);
+}
+/* Initialize the cache lock, both AVL trees, and vc_blocksize. */
+void
+vdev_cache_init(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+
+	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_offset_node));
+
+	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+	    sizeof (vdev_cache_entry_t),
+	    offsetof(struct vdev_cache_entry, ve_lastused_node));
+	/* vc_bshift is read here, so it must already hold its value. */
+	vc->vc_blocksize = 1ULL << vc->vc_bshift;
+}
+/* Evict every cache entry, then destroy both trees and the lock. */
+void
+vdev_cache_fini(vdev_t *vd)
+{
+	vdev_cache_t *vc = &vd->vdev_cache;
+	vdev_cache_entry_t *ve;
+
+	mutex_enter(&vc->vc_lock);
+	while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+		vdev_cache_evict(vc, ve);
+	mutex_exit(&vc->vc_lock);
+
+	avl_destroy(&vc->vc_offset_tree);
+	avl_destroy(&vc->vc_lastused_tree);
+
+	mutex_destroy(&vc->vc_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
new file mode 100644
index 000000000000..9255ecf03ef7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/sunddi.h>
+
+/*
+ * Virtual device vector for disks.
+ */
+
+extern ldi_ident_t zfs_li;
+
+typedef struct vdev_disk_buf {
+	buf_t	vdb_buf;	/* must be first: io_intr casts buf_t * back */
+	zio_t	*vdb_io;	/* zio this buf was issued for */
+} vdev_disk_buf_t;
+/* Open the disk behind vd; sets *psize and *ashift, returns 0 or errno. */
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_disk_t *dvd;
+	int error;
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+	/*
+	 * When opening a disk device, we want to preserve the user's original
+	 * intent.  We always want to open the device by the path the user gave
+	 * us, even if it is one of multiple paths to the same device.  But we
+	 * also want to be able to survive disks being removed/recabled.
+	 * Therefore the sequence of opening devices is:
+	 *
+	 * 1. Try opening the device by path.
+	 *
+	 * 	a. First append "s0" to see if this is a whole disk
+	 * 	b. Fall back to path otherwise
+	 *
+	 * 2. If the devid of the device matches the stored value, return
+	 *    success.
+	 *
+	 * 3. Otherwise, the device may have moved.  Try opening the device
+	 *    by the devid instead.
+	 *
+	 */
+	if (vd->vdev_devid != NULL) {
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (EINVAL);
+		}
+	}
+
+	error = EINVAL;		/* presume failure */
+
+	if (vd->vdev_path != NULL) {
+		size_t len = strlen(vd->vdev_path) + 3;	/* "s0" + NUL */
+		char *buf = kmem_alloc(len, KM_SLEEP);
+		ddi_devid_t devid;
+
+		(void) snprintf(buf, len, "%ss0", vd->vdev_path);
+
+		/*
+		 * Try whole disk first, then slice name.
+		 */
+		if ((error = ldi_open_by_name(buf, spa_mode, kcred,
+		    &dvd->vd_lh, zfs_li)) != 0)
+			error = ldi_open_by_name(vd->vdev_path,
+			    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+		kmem_free(buf, len);
+
+		/*
+		 * Compare the devid to the stored value.
+		 */
+		if (error == 0 && vd->vdev_devid != NULL &&
+		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
+			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
+				error = EINVAL;
+				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				dvd->vd_lh = NULL;
+			}
+			ddi_devid_free(devid);
+		}
+	}
+
+	/*
+	 * If we were unable to open by path, or the devid check fails, open by
+	 * devid instead.
+	 */
+	if (error != 0 && vd->vdev_devid != NULL)
+		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
+		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	/*
+	 * Determine the actual size of the device.
+	 */
+	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (EINVAL);
+	}
+	/* Disks always report SPA_MINBLOCKSHIFT as their ashift. */
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+/* Release everything vdev_disk_open() attached to vd->vdev_tsd. */
+static void
+vdev_disk_close(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	if (dvd == NULL)
+		return;
+
+	dprintf("removing disk %s, devid %s\n",
+	    vd->vdev_path ? vd->vdev_path : "<none>",
+	    vd->vdev_devid ? vd->vdev_devid : "<none>");
+
+	if (dvd->vd_minor != NULL)
+		ddi_devid_str_free(dvd->vd_minor);
+
+	if (dvd->vd_devid != NULL)
+		ddi_devid_free(dvd->vd_devid);
+
+	if (dvd->vd_lh != NULL)
+		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+
+	kmem_free(dvd, sizeof (vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+/* buf b_iodone handler: record the result on the zio and free the buf. */
+static void
+vdev_disk_io_intr(buf_t *bp)
+{
+	vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
+	zio_t *zio = vdb->vdb_io;
+	/* A short transfer (b_resid != 0) with no error becomes EIO. */
+	if ((zio->io_error = geterror(bp)) == 0 && bp->b_resid != 0)
+		zio->io_error = EIO;
+
+	kmem_free(vdb, sizeof (vdev_disk_buf_t));
+
+	zio_next_stage_async(zio);
+}
+/* dkc_callback for the flush ioctl: store the error, advance the zio. */
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = error;
+
+	zio_next_stage_async(zio);
+}
+/* Start a zio on a disk vdev: ioctls, cache/queue handling, then the buf. */
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	vdev_disk_buf_t *vdb;
+	buf_t *bp;
+	int flags, error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+			zio->io_dk_callback.dkc_cookie = zio;
+
+			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)&zio->io_dk_callback,
+			    FKIOCTL, kcred, NULL);
+
+			if (error == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+				 */
+				return;
+			}
+			zio->io_error = error;
+			break;
+
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		zio_next_stage_async(zio);
+		return;
+	}
+	/* Reads are offered to the cache first; 0 means it took the zio. */
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+	/* The queue may keep the zio (NULL) or hand back one to issue now. */
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
+	flags |= B_BUSY | B_NOCACHE;
+	if (zio->io_flags & ZIO_FLAG_FAILFAST)
+		flags |= B_FAILFAST;
+	/* Wrap the zio in a buf so vdev_disk_io_intr() can recover it. */
+	vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+
+	vdb->vdb_io = zio;
+	bp = &vdb->vdb_buf;
+
+	bioinit(bp);
+	bp->b_flags = flags;
+	bp->b_bcount = zio->io_size;
+	bp->b_un.b_addr = zio->io_data;
+	bp->b_lblkno = lbtodb(zio->io_offset);
+	bp->b_bufsize = zio->io_size;
+	bp->b_iodone = (int (*)())vdev_disk_io_intr;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		zio->io_error = error;
+		bioerror(bp, error);
+		bp->b_resid = bp->b_bcount;
+		bp->b_iodone(bp);
+		return;
+	}
+
+	error = ldi_strategy(dvd->vd_lh, bp);
+	/* ldi_strategy() will return non-zero only on programming errors */
+	ASSERT(error == 0);
+}
+/* Completion: notify the queue, update the cache on writes, move on. */
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_disk_open,
+	vdev_disk_close,
+	vdev_default_asize,
+	vdev_disk_io_start,
+	vdev_disk_io_done,
+	NULL,			/* presumably vdev_op_state_change; confirm */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
new file mode 100644
index 000000000000..a789008e1745
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+/*
+ * Open the file backing a file vdev.  On success the vnode is kept in a
+ * new vdev_file_t (vd->vdev_tsd), *psize gets the file size and *ashift
+ * gets SPA_MINBLOCKSHIFT.  Returns 0 or an errno; every failure sets
+ * vs_aux.  State allocated here is not freed on failure (vdev_file_close()
+ * frees it).
+ */
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_file_t *fstate;
+	vnode_t *vnp;
+	vattr_t va;
+	int err;
+
+	/* A pathname is mandatory and has to be absolute. */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	fstate = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+	vd->vdev_tsd = fstate;
+
+#ifdef _KERNEL
+	/*
+	 * In the kernel the filesystem underneath already caches this
+	 * file's data, so zero the vdev cache size rather than cache twice.
+	 */
+	vd->vdev_cache.vc_size = 0;
+#endif
+
+	/*
+	 * Files are always opened relative to the global zone's root, even
+	 * from a local zone: once the administrator has made the pool
+	 * available to a zone, its backing devices are meant to be as well.
+	 * Hence the leading '/' is skipped and rootdir is passed to the open.
+	 */
+	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+	err = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode | FOFFMAX,
+	    0, &vnp, 0, 0, rootdir);
+	if (err != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (err);
+	}
+	fstate->vf_vnode = vnp;
+
+#ifdef _KERNEL
+	/* Anything other than a regular file is rejected. */
+	if (vnp->v_type != VREG) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (ENODEV);
+	}
+#endif
+
+	/* Ask the vnode for its size; that is the vdev's physical size. */
+	va.va_mask = AT_SIZE;
+	err = VOP_GETATTR(vnp, &va, 0, kcred);
+	if (err != 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (err);
+	}
+
+	*psize = va.va_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/* Undo vdev_file_open(): drop the vnode (if any) and free the state. */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *fstate = vd->vdev_tsd;
+	vnode_t *vnp;
+
+	if (fstate == NULL)
+		return;
+	if ((vnp = fstate->vf_vnode) != NULL) {
+		(void) VOP_PUTPAGE(vnp, 0, 0, B_INVAL, kcred);
+		(void) VOP_CLOSE(vnp, spa_mode, 1, 0, kcred);
+		VN_RELE(vnp);
+	}
+	kmem_free(fstate, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Start an I/O on a file vdev: ioctls complete inline, reads/writes go
+ * through vn_rdwr() synchronously.  The result lands in zio->io_error.
+ */
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+	ssize_t resid = 0;	/* defined even if vn_rdwr() fails early */
+	int error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+		/* XXPOLICY */
+		if (vdev_is_dead(vd)) {
+			zio->io_error = ENXIO;
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		switch (zio->io_cmd) {
+		case DKIOCFLUSHWRITECACHE:
+			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			    kcred);
+			dprintf("fsync(%s) = %d\n", vdev_description(vd),
+			    zio->io_error);
+			break;
+		default:
+			zio->io_error = ENOTSUP;
+		}
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return;
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return;
+
+	/* XXPOLICY */
+	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
+	if (error) {
+		zio->io_error = error;
+		zio_next_stage_async(zio);
+		return;
+	}
+
+	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
+	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+	/* A short transfer with no error of its own is reported as ENOSPC. */
+	if (zio->io_error == 0 && resid != 0)
+		zio->io_error = ENOSPC;
+	zio_next_stage_async(zio);
+}
+
+/* io_done hook for file vdevs: queue first, then cache, then advance. */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+	vdev_queue_io_done(zio);
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		vdev_cache_write(zio);	/* reads skip the cache update */
+	}
+	zio_next_stage(zio);
+}
+
+vdev_ops_t vdev_file_ops = {	/* positional; order follows vdev_ops_t */
+	vdev_file_open,		/* open routine */
+	vdev_file_close,	/* close routine */
+	vdev_default_asize,	/* shared default asize routine */
+	vdev_file_io_start,	/* I/O start routine */
+	vdev_file_io_done,	/* I/O done routine */
+	NULL,			/* NOTE(review): unused slot; see vdev_ops_t */
+	VDEV_TYPE_FILE,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {	/* userland only: disk type, file routines */
+	vdev_file_open,		/* same open routine as vdev_file_ops */
+	vdev_file_close,	/* same close routine as vdev_file_ops */
+	vdev_default_asize,	/* shared default asize routine */
+	vdev_file_io_start,	/* same I/O start routine as vdev_file_ops */
+	vdev_file_io_done,	/* same I/O done routine as vdev_file_ops */
+	NULL,			/* NOTE(review): unused slot; see vdev_ops_t */
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
new file mode 100644
index 000000000000..6671a68fa946
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ *	1. Uniquely identify this device as part of a ZFS pool and confirm its
+ *	   identity within the pool.
+ *
+ * 	2. Verify that all the devices given in a configuration are present
+ *         within the pool.
+ *
+ * 	3. Determine the uberblock for the pool.
+ *
+ * 	4. In case of an import operation, determine the configuration of the
+ *         toplevel vdev of which it is a part.
+ *
+ * 	5. If an import operation cannot find all the devices in the pool,
+ *         provide enough information to the administrator to determine which
+ *         devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases.  The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point.  To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced.  Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ *              L1          UB          L2
+ *           +------+    +------+    +------+
+ *           |      |    |      |    |      |
+ *           | t10  |    | t10  |    | t10  |
+ *           |      |    |      |    |      |
+ *           +------+    +------+    +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10).  Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 	1. For each vdev, update 'L1' to the new label
+ * 	2. Update the uberblock
+ * 	3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group.  If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid.  If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced.  If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool.  This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure.  The label includes 8k of padding to permit legacy
+ * VTOC disk labels, but is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information.  It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated.  When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * 	version		ZFS on-disk version
+ * 	name		Pool name
+ * 	state		Pool state
+ * 	txg		Transaction group in which this label was written
+ * 	pool_guid	Unique identifier for this pool
+ * 	vdev_tree	An nvlist describing vdev tree.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * 	top_guid	Unique ID for top-level vdev in which this is contained
+ * 	guid		Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/* Basic label read/write routines, used throughout the rest of this file. */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+	size_t lsz = sizeof (vdev_label_t);
+
+	/* Labels in the upper half are addressed back from psize. */
+	return (offset + l * lsz +
+	    (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * lsz));
+}
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	uint64_t pos = vdev_label_offset(vd->vdev_psize, l, offset);
+
+	/* Leaf vdevs only; the read is queued under zio, not waited for. */
+	ASSERT(vd->vdev_children == 0);
+	zio_nowait(zio_read_phys(zio, vd, pos, size, buf, ZIO_CHECKSUM_LABEL,
+	    done, private, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+static void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+	uint64_t size, zio_done_func_t *done, void *private)
+{
+	uint64_t pos = vdev_label_offset(vd->vdev_psize, l, offset);
+
+	/* Leaf vdevs only; the write is queued under zio, not waited for. */
+	ASSERT(vd->vdev_children == 0);
+	zio_nowait(zio_write_phys(zio, vd, pos, size, buf,
+	    ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+}
+
+/*
+ * Generate the nvlist representing this vdev's config: type, id, guid,
+ * optional path/devid/DTL object, extra sizing fields for a top-level
+ * vdev, optional stats, and one recursively built entry per child of a
+ * non-leaf vdev.  Any nvlist failure trips VERIFY.
+ */
+nvlist_t *
+vdev_config_generate(vdev_t *vd, int getstats)
+{
+	nvlist_t *nv = NULL;
+	nvlist_t **kids;
+	size_t kidsz;
+	int i;
+
+	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+
+	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+	    vd->vdev_ops->vdev_op_type) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0);
+	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+
+	if (vd->vdev_path != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
+		    vd->vdev_path) == 0);
+	if (vd->vdev_devid != NULL)
+		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
+		    vd->vdev_devid) == 0);
+
+	/* These four values are recorded for top-level vdevs only. */
+	if (vd == vd->vdev_top) {
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    vd->vdev_ms_array) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    vd->vdev_ms_shift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    vd->vdev_ashift) == 0);
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    vd->vdev_asize) == 0);
+	}
+
+	if (vd->vdev_dtl.smo_object != 0)
+		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+		    vd->vdev_dtl.smo_object) == 0);
+
+	if (getstats) {
+		vdev_stat_t vs;
+		vdev_get_stats(vd, &vs);
+		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+	}
+
+	/* Leaves are done; interior vdevs also describe their children. */
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (nv);
+
+	kidsz = vd->vdev_children * sizeof (nvlist_t *);
+	kids = kmem_alloc(kidsz, KM_SLEEP);
+	for (i = 0; i < vd->vdev_children; i++)
+		kids[i] = vdev_config_generate(vd->vdev_child[i], getstats);
+	VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    kids, vd->vdev_children) == 0);
+	/* Release the per-child lists and the scratch array. */
+	for (i = 0; i < vd->vdev_children; i++)
+		nvlist_free(kids[i]);
+	kmem_free(kids, kidsz);
+	return (nv);
+}
+
+/* First label config that unpacks with UBERBLOCK_VERSION, else NULL. */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd)
+{
+	nvlist_t *cfg = NULL;
+	vdev_phys_t *phys;
+	uint64_t version;
+	zio_t *rio;
+	int l, err;
+
+	if (vdev_is_dead(vd))
+		return (NULL);
+
+	phys = zio_buf_alloc(sizeof (vdev_phys_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		rio = zio_root(vd->vdev_spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD);
+		vdev_label_read(rio, vd, l, phys,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t), NULL, NULL);
+
+		/* Each step runs only if every earlier one succeeded. */
+		if ((err = zio_wait(rio)) == 0)
+			err = nvlist_unpack(phys->vp_nvlist,
+			    sizeof (phys->vp_nvlist), &cfg, 0);
+		if (err == 0)
+			err = nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_VERSION,
+			    &version);
+		if (err == 0 && version == UBERBLOCK_VERSION)
+			break;
+
+		if (cfg != NULL) {	/* unusable label: drop its nvlist */
+			nvlist_free(cfg);
+			cfg = NULL;
+		}
+	}
+
+	zio_buf_free(phys, sizeof (vdev_phys_t));
+	return (cfg);
+}
+
+/*
+ * Initialize the labels of every leaf vdev at or below vd.
+ *
+ * crtxg is recorded in each new label as ZPOOL_CONFIG_CREATE_TXG.  When
+ * it is non-zero, a leaf whose current label passes the in-use test below
+ * is refused with EBUSY; crtxg == 0 (device removal) skips that test.
+ *
+ * Returns 0 on success, EIO for a dead leaf, EBUSY for a leaf in use,
+ * EINVAL if the label cannot be packed, or the error from the writes.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg)
+{
+	spa_t *spa = vd->vdev_spa;
+	nvlist_t *label;
+	vdev_phys_t *vp;
+	vdev_boot_header_t *vb;
+	uberblock_phys_t *ubphys;
+	zio_t *zio;
+	int l, c, n;
+	char *buf;
+	size_t buflen;
+	int error;
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((error = vdev_label_init(vd->vdev_child[c], crtxg)) != 0)
+			return (error);
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (0);
+
+	/*
+	 * Make sure each leaf device is writable, and zero its initial content.
+	 * Along the way, also make sure that no leaf is already in use.
+	 * Note that it's important to do this sequentially, not in parallel,
+	 * so that we catch cases of multiple use of the same leaf vdev in
+	 * the vdev we're creating -- e.g. mirroring a disk with itself.
+	 */
+	if (vdev_is_dead(vd))
+		return (EIO);
+
+	/*
+	 * Check whether this device is already in use.
+	 * Ignore the check if crtxg == 0, which we use for device removal.
+	 */
+	if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) {
+		uint64_t version, state, pool_guid, device_guid, txg;
+		uint64_t mycrtxg = 0;
+
+		(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+		    &mycrtxg);
+
+		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
+		    &version) == 0 && version == UBERBLOCK_VERSION &&
+		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+		    &state) == 0 && state == POOL_STATE_ACTIVE &&
+		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+		    &pool_guid) == 0 &&
+		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+		    &device_guid) == 0 &&
+		    spa_guid_exists(pool_guid, device_guid) &&
+		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) == 0 && (txg != 0 || mycrtxg == crtxg)) {
+			dprintf("vdev %s in use, pool_state %d\n",
+			    vdev_description(vd), (int)state); /* %d: int */
+			nvlist_free(label);
+			return (EBUSY);
+		}
+		nvlist_free(label);
+	}
+
+	/* The device isn't in use, so initialize its label. */
+	vp = zio_buf_alloc(sizeof (vdev_phys_t));
+	bzero(vp, sizeof (vdev_phys_t));
+
+	/*
+	 * Generate a label describing the pool and our top-level vdev.
+	 * We mark it as being from txg 0 to indicate that it's not
+	 * really part of an active pool just yet.  The labels will
+	 * be written again with a meaningful txg by spa_sync().
+	 */
+	label = spa_config_generate(spa, vd, 0ULL, 0);
+
+	/*
+	 * Add our creation time.  This allows us to detect multiple vdev
+	 * uses as described above, and automatically expires if we fail.
+	 */
+	VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, crtxg) == 0);
+
+	buf = vp->vp_nvlist;
+	buflen = sizeof (vp->vp_nvlist);
+
+	if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
+		nvlist_free(label);
+		zio_buf_free(vp, sizeof (vdev_phys_t));
+		return (EINVAL);
+	}
+
+	/* Initialize boot block header. */
+	vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
+	bzero(vb, sizeof (vdev_boot_header_t));
+	vb->vb_magic = VDEV_BOOT_MAGIC;
+	vb->vb_version = VDEV_BOOT_VERSION;
+	vb->vb_offset = VDEV_BOOT_OFFSET;
+	vb->vb_size = VDEV_BOOT_SIZE;
+
+	/* Initialize uberblock template (a copy of the pool's, with txg 0). */
+	ubphys = zio_buf_alloc(sizeof (uberblock_phys_t));
+	bzero(ubphys, sizeof (uberblock_phys_t));
+	ubphys->ubp_uberblock = spa->spa_uberblock;
+	ubphys->ubp_uberblock.ub_txg = 0;
+
+	/* Write everything in parallel under one root zio. */
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+
+		vdev_label_write(zio, vd, l, vp,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t), NULL, NULL);
+
+		vdev_label_write(zio, vd, l, vb,
+		    offsetof(vdev_label_t, vl_boot_header),
+		    sizeof (vdev_boot_header_t), NULL, NULL);
+
+		for (n = 0; n < VDEV_UBERBLOCKS; n++) {
+			vdev_label_write(zio, vd, l, ubphys,
+			    offsetof(vdev_label_t, vl_uberblock[n]),
+			    sizeof (uberblock_phys_t), NULL, NULL);
+		}
+	}
+
+	error = zio_wait(zio);
+
+	nvlist_free(label);
+	zio_buf_free(ubphys, sizeof (uberblock_phys_t));
+	zio_buf_free(vb, sizeof (vdev_boot_header_t));
+	zio_buf_free(vp, sizeof (vdev_phys_t));
+
+	return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk.  We've
+ * written the first uberblock for txg + 1, and then we lose power.  When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline.  If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg.  The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+/*
+ * Three-way compare: txg first, timestamp as tie-breaker (-1, 0 or 1).
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+	if (ub1->ub_txg != ub2->ub_txg)
+		return (ub1->ub_txg < ub2->ub_txg ? -1 : 1);
+
+	/* Same txg: the later timestamp ranks higher. */
+	if (ub1->ub_timestamp != ub2->ub_timestamp)
+		return (ub1->ub_timestamp < ub2->ub_timestamp ? -1 : 1);
+
+	return (0);
+}
+
+/* Read callback: keep the best verified uberblock, then free the buffer. */
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+	uberblock_phys_t *raw = zio->io_data;
+	uberblock_t *best = zio->io_private;
+	uberblock_t *cand = &raw->ubp_uberblock;
+	spa_t *spa = zio->io_spa;
+
+	ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
+
+	if (uberblock_verify(cand) == 0) {
+		mutex_enter(&spa->spa_uberblock_lock);
+		if (vdev_uberblock_compare(cand, best) > 0)
+			*best = *cand;
+		mutex_exit(&spa->spa_uberblock_lock);
+	}
+	zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Queue reads of every uberblock copy on each live leaf under vd; the
+ * done callback folds them into *ubbest and frees the buffers.
+ */
+void
+vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+{
+	int label, child, slot;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		vdev_uberblock_load(zio, vd->vdev_child[child], ubbest);
+
+	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
+		return;
+
+	for (label = 0; label < VDEV_LABELS; label++) {
+		for (slot = 0; slot < VDEV_UBERBLOCKS; slot++)
+			vdev_label_read(zio, vd, label,
+			    zio_buf_alloc(sizeof (uberblock_phys_t)),
+			    offsetof(vdev_label_t, vl_uberblock[slot]),
+			    sizeof (uberblock_phys_t),
+			    vdev_uberblock_load_done, ubbest);
+	}
+}
+
+/*
+ * Write the uberblock to both labels of all leaves of the specified vdev.
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+	/* Successful writes bump the counter hung off the root zio. */
+	if (zio->io_error != 0)
+		return;
+	atomic_add_64((uint64_t *)zio->io_root->io_private, 1);
+}
+
+/*
+ * Queue a write of ubphys to uberblock slot txg & (VDEV_UBERBLOCKS - 1)
+ * in every label of each live leaf under vd.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uberblock_phys_t *ubphys, vdev_t *vd,
+	uint64_t txg)
+{
+	int label, child, slot;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		vdev_uberblock_sync(zio, ubphys, vd->vdev_child[child], txg);
+
+	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
+		return;
+
+	ASSERT(ubphys->ubp_uberblock.ub_txg == txg);
+	slot = txg & (VDEV_UBERBLOCKS - 1);
+	for (label = 0; label < VDEV_LABELS; label++) {
+		vdev_label_write(zio, vd, label, ubphys,
+		    offsetof(vdev_label_t, vl_uberblock[slot]),
+		    sizeof (uberblock_phys_t), vdev_uberblock_sync_done, NULL);
+	}
+
+	dprintf("vdev %s in txg %llu\n", vdev_description(vd), txg);
+}
+
+/*
+ * Write *ub to the leaves under uvd and wait.  Returns 0 if at least one
+ * write succeeded; EIO if none did yet zio_wait() reported no error;
+ * otherwise the zio_wait() error.
+ */
+static int
+vdev_uberblock_sync_tree(spa_t *spa, uberblock_t *ub, vdev_t *uvd, uint64_t txg)
+{
+	uberblock_phys_t *ubbuf;
+	uint64_t *nwritten;
+	zio_t *rio;
+	int err;
+
+	ubbuf = zio_buf_alloc(sizeof (uberblock_phys_t));
+	bzero(ubbuf, sizeof (uberblock_phys_t));
+	ubbuf->ubp_uberblock = *ub;
+
+	nwritten = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	rio = zio_root(spa, NULL, nwritten,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+	vdev_uberblock_sync(rio, ubbuf, uvd, txg);
+	err = zio_wait(rio);
+
+	if (*nwritten != 0) {
+		if (err != 0) {
+			dprintf("partial success: good_writes = %llu\n",
+			    *nwritten);
+			err = 0;
+		}
+	} else if (err == 0) {
+		/* Possible when every vdev is in the CANT_OPEN state. */
+		err = EIO;
+	}
+
+	kmem_free(nwritten, sizeof (uint64_t));
+	zio_buf_free(ubbuf, sizeof (uberblock_phys_t));
+	return (err);
+}
+
+/*
+ * Sync out an individual vdev.
+ */
+static void
+vdev_sync_label_done(zio_t *zio)
+{
+	/* Successful writes bump the counter hung off the root zio. */
+	if (zio->io_error != 0)
+		return;
+	atomic_add_64((uint64_t *)zio->io_root->io_private, 1);
+}
+
+/*
+ * Queue a write of label l, regenerated for txg, on each live leaf at or
+ * below vd.  A label that fails to pack is skipped without an error.
+ */
+static void
+vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
+{
+	nvlist_t *cfg;
+	vdev_phys_t *phys;
+	char *packed;
+	size_t packedlen;
+	int child;
+
+	for (child = 0; child < vd->vdev_children; child++)
+		vdev_sync_label(zio, vd->vdev_child[child], l, txg);
+
+	if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd))
+		return;
+
+	/* Describe the top-level config to which this leaf belongs. */
+	cfg = spa_config_generate(vd->vdev_spa, vd, txg, 0);
+
+	/* Pack it into the vp_nvlist area of a zeroed vdev_phys_t. */
+	phys = zio_buf_alloc(sizeof (vdev_phys_t));
+	bzero(phys, sizeof (vdev_phys_t));
+	packed = phys->vp_nvlist;
+	packedlen = sizeof (phys->vp_nvlist);
+	if (nvlist_pack(cfg, &packed, &packedlen, NV_ENCODE_XDR, 0) == 0) {
+		vdev_label_write(zio, vd, l, phys,
+		    offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+		    vdev_sync_label_done, NULL);
+	}
+
+	/* NOTE(review): phys is freed with the write only queued -- confirm. */
+	zio_buf_free(phys, sizeof (vdev_phys_t));
+	nvlist_free(cfg);
+
+	dprintf("%s label %d txg %llu\n", vdev_description(vd), l, txg);
+}
+
+/*
+ * Sync label l across vd's subtree; 0 if any write landed, else an errno.
+ */
+static int
+vdev_sync_labels(vdev_t *vd, int l, uint64_t txg)
+{
+	uint64_t *nwritten;
+	zio_t *rio;
+	int err;
+
+	ASSERT(vd == vd->vdev_top);
+
+	nwritten = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	rio = zio_root(vd->vdev_spa, NULL, nwritten,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+
+	/* Fan the label writes out over the whole subtree, then wait. */
+	vdev_sync_label(rio, vd, l, txg);
+	err = zio_wait(rio);
+
+	if (*nwritten != 0) {
+		if (err != 0) {
+			dprintf("partial success: good_writes = %llu\n",
+			    *nwritten);
+			err = 0;
+		}
+	} else if (err == 0) {
+		err = ENODEV;
+	}
+
+	kmem_free(nwritten, sizeof (uint64_t));
+	return (err);
+}
+
+/*
+ * Sync the entire vdev configuration for txg.  Returns 0 on success or
+ * when there is nothing to do, else the first label/uberblock error.
+ *
+ * The order of operations is carefully crafted to ensure that if the
+ * system panics or loses power at any time, the state on disk is still
+ * transactionally consistent.  The in-line comments below describe the
+ * failure semantics at each stage.  It is also designed to be idempotent:
+ * if spa_sync_labels() fails at any time, you can just call it again,
+ * and it will resume its work (error returns leave the dirty list as is).
+ */
+int
+spa_sync_labels(spa_t *spa, uint64_t txg)
+{
+	uberblock_t *ub = &spa->spa_uberblock;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *vd, *uvd;
+	zio_t *zio;
+	int c, l, error;	/* c: DEBUG-only check below */
+
+	ASSERT(ub->ub_txg <= txg);
+
+	/*
+	 * If this isn't a resync due to I/O errors, and nothing changed
+	 * in this transaction group, and the vdev configuration hasn't changed,
+	 * and this isn't an explicit sync-all, then there's nothing to do.
+	 */
+	if (ub->ub_txg < txg && uberblock_update(ub, rvd, txg) == B_FALSE &&
+	    list_is_empty(&spa->spa_dirty_list)) {
+		dprintf("nothing to sync in %s in txg %llu\n",
+		    spa_name(spa), txg);
+		return (0);
+	}
+	/* Txgs beyond spa_freeze_txg() are not synced. */
+	if (txg > spa_freeze_txg(spa))
+		return (0);
+
+	dprintf("syncing %s txg %llu\n", spa_name(spa), txg);
+
+	/*
+	 * Flush the write cache of every disk that's been written to
+	 * in this transaction group.  This ensures that all blocks
+	 * written in this txg will be committed to stable storage
+	 * before any uberblock that references them.
+	 */
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+	for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
+	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) {
+		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+		    NULL, NULL, ZIO_PRIORITY_NOW,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+	}
+	(void) zio_wait(zio);	/* flush result deliberately ignored */
+
+	/*
+	 * Sync out the even labels (L0, L2) for every dirty vdev.  If the
+	 * system dies in the middle of this process, that's OK: all of the
+	 * even labels that made it to disk will be newer than any uberblock,
+	 * and will therefore be considered invalid.  The odd labels (L1, L3),
+	 * which have not yet been touched, will still be valid.
+	 */
+	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+	    vd = list_next(&spa->spa_dirty_list, vd)) {
+		for (l = 0; l < VDEV_LABELS; l++) {
+			if (l & 1)
+				continue;
+			if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+				return (error);
+		}
+	}
+
+	/*
+	 * Flush the new labels to disk.  This ensures that all even-label
+	 * updates are committed to stable storage before the uberblock update.
+	 */
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+	    vd = list_next(&spa->spa_dirty_list, vd)) {
+		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+		    NULL, NULL, ZIO_PRIORITY_NOW,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+	}
+	(void) zio_wait(zio);
+
+	/*
+	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
+	 * Otherwise, pick one top-level vdev at random.
+	 */
+	if (!list_is_empty(&spa->spa_dirty_list))
+		uvd = rvd;
+	else
+		uvd = rvd->vdev_child[spa_get_random(rvd->vdev_children)];
+
+	/*
+	 * Sync the uberblocks.  If the system dies in the middle of this
+	 * step, there are two cases to consider, and the on-disk state
+	 * is consistent either way:
+	 *
+	 * (1)	If none of the new uberblocks made it to disk, then the
+	 *	previous uberblock will be the newest, and the odd labels
+	 *	(which had not yet been touched) will be valid with respect
+	 *	to that uberblock.
+	 *
+	 * (2)	If one or more new uberblocks made it to disk, then they
+	 *	will be the newest, and the even labels (which had all
+	 *	been successfully committed) will be valid with respect
+	 *	to the new uberblocks.
+	 */
+	if ((error = vdev_uberblock_sync_tree(spa, ub, uvd, txg)) != 0)
+		return (error);
+
+	/*
+	 * Flush the uberblocks to disk.  This ensures that the odd labels
+	 * are no longer needed (because the new uberblocks and the even
+	 * labels are safely on disk), so it is safe to overwrite them.
+	 */
+	(void) zio_wait(zio_ioctl(NULL, spa, uvd, DKIOCFLUSHWRITECACHE,
+	    NULL, NULL, ZIO_PRIORITY_NOW,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+
+	/*
+	 * Sync out odd labels for every dirty vdev.  If the system dies
+	 * in the middle of this process, the even labels and the new
+	 * uberblocks will suffice to open the pool.  The next time
+	 * the pool is opened, the first thing we'll do -- before any
+	 * user data is modified -- is mark every vdev dirty so that
+	 * all labels will be brought up to date.
+	 */
+	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+	    vd = list_next(&spa->spa_dirty_list, vd)) {
+		for (l = 0; l < VDEV_LABELS; l++) {
+			if ((l & 1) == 0)
+				continue;
+			if ((error = vdev_sync_labels(vd, l, txg)) != 0)
+				return (error);
+		}
+	}
+
+	/*
+	 * Flush the new labels to disk.  This ensures that all odd-label
+	 * updates are committed to stable storage before the next
+	 * transaction group begins.
+	 */
+	zio = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL);
+	for (vd = list_head(&spa->spa_dirty_list); vd != NULL;
+	    vd = list_next(&spa->spa_dirty_list, vd)) {
+		zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+		    NULL, NULL, ZIO_PRIORITY_NOW,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+	}
+	(void) zio_wait(zio);
+
+	/*
+	 * Clear the dirty list.
+	 */
+	while (!list_is_empty(&spa->spa_dirty_list))
+		vdev_config_clean(list_head(&spa->spa_dirty_list));
+
+#ifdef DEBUG
+	for (c = 0; c < rvd->vdev_children; c++) {
+		ASSERT(rvd->vdev_child[c]->vdev_is_dirty == 0);
+	}
+#endif
+
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..45eb7ce78b80
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -0,0 +1,414 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+/*
+ * Per-child bookkeeping for one mirror I/O.  vdev_mirror_map_alloc()
+ * creates a zeroed array of these, one entry per child of the zio's vdev,
+ * and stores it in zio->io_vsd.
+ */
+typedef struct mirror_map {
+	int	mm_error;	/* error recorded for this child */
+	short	mm_tried;	/* child I/O completed, or child is dead */
+	short	mm_skipped;	/* child was passed over (dead or in DTL) */
+} mirror_map_t;
+
+/*
+ * Allocate a zero-filled mirror_map_t array with one slot per child of the
+ * zio's vdev, remember it in zio->io_vsd, and hand it back to the caller.
+ */
+static mirror_map_t *
+vdev_mirror_map_alloc(zio_t *zio)
+{
+	size_t mapsize = zio->io_vd->vdev_children * sizeof (mirror_map_t);
+	mirror_map_t *map = kmem_zalloc(mapsize, KM_SLEEP);
+
+	zio->io_vsd = map;
+	return (map);
+}
+
+/*
+ * Release the per-child map hanging off zio->io_vsd and clear the pointer.
+ * The size is recomputed from the vdev's child count, mirroring the
+ * computation in vdev_mirror_map_alloc().
+ */
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+	size_t mapsize = zio->io_vd->vdev_children * sizeof (mirror_map_t);
+
+	kmem_free(zio->io_vsd, mapsize);
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Open every child of a mirror (or replacing) vdev.
+ *
+ * For each child that opens, *asize is lowered to that child's vdev_asize
+ * if it is smaller, and *ashift is set to that child's vdev_ashift.  The
+ * "- 1 ... + 1" arithmetic is unsigned, so an incoming *asize of 0 wraps to
+ * the maximum value and never wins the MIN.
+ *
+ * Returns EINVAL if there are no children, the last child error if every
+ * child failed to open, and 0 otherwise.
+ */
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	uint64_t idx;
+	int failed = 0;
+	int lasterror = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (idx = 0; idx < vd->vdev_children; idx++) {
+		vdev_t *child = vd->vdev_child[idx];
+		int err = vdev_open(child);
+
+		if (err == 0) {
+			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
+			*ashift = child->vdev_ashift;
+		} else {
+			lasterror = err;
+			failed++;
+		}
+	}
+
+	/* At least one child opened: the mirror as a whole is usable. */
+	if (failed != vd->vdev_children)
+		return (0);
+
+	vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+	return (lasterror);
+}
+
+/*
+ * Close each child of the mirror in index order.
+ */
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+	uint64_t idx = 0;
+
+	while (idx < vd->vdev_children) {
+		vdev_close(vd->vdev_child[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Completion callback for an ordinary child I/O.  zio->io_private points at
+ * the child's mirror_map_t slot; mark it as tried (and not skipped) and
+ * record the child's error.
+ */
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+	mirror_map_t *slot = zio->io_private;
+
+	slot->mm_skipped = 0;
+	slot->mm_tried = 1;
+	slot->mm_error = zio->io_error;
+}
+
+/*
+ * Completion callback for a per-child scrub read.  Each such read has its
+ * own buffer (allocated in vdev_mirror_io_start()); a successful child
+ * copies its data into the parent's buffer under the parent's io_lock.
+ * The private buffer is always freed, and the child's map slot is updated
+ * the same way vdev_mirror_child_done() does.
+ */
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+	mirror_map_t *slot = zio->io_private;
+
+	if (zio->io_error == 0) {
+		zio_t *parent = zio->io_parent;
+
+		mutex_enter(&parent->io_lock);
+		bcopy(zio->io_data, parent->io_data, parent->io_size);
+		mutex_exit(&parent->io_lock);
+	}
+
+	zio_buf_free(zio->io_data, zio->io_size);
+
+	slot->mm_skipped = 0;
+	slot->mm_tried = 1;
+	slot->mm_error = zio->io_error;
+}
+
+/*
+ * Choose which child of a mirror a read should be sent to.
+ *
+ * Try to find a child whose DTL doesn't contain the block we want to read.
+ * If we can't, try the read on any vdev we haven't already tried.
+ *
+ * Side effects: children that are dead or whose DTL contains the txg are
+ * marked in the zio's mirror map (mm_error, mm_skipped, and for dead
+ * children mm_tried) so later calls pass over them.
+ *
+ * Returns the index of the chosen child, or -1 if there is no child left
+ * to read from.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	uint64_t txg = zio->io_txg;
+	int i, c;
+
+	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+
+	/*
+	 * Select the child we'd like to read from absent any errors.
+	 * The current policy is to alternate sides at 8M granularity.
+	 * XXX -- investigate other policies for read distribution.
+	 */
+	c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
+
+	/*
+	 * If this is a replacing vdev, always try child 0 (the source) first.
+	 */
+	if (vd->vdev_ops == &vdev_replacing_ops)
+		c = 0;
+
+	/*
+	 * Try to find a child whose DTL doesn't contain the block to read.
+	 * If a child is known to be completely inaccessible (indicated by
+	 * vdev_is_dead() returning B_TRUE), don't even try.
+	 *
+	 * i counts the children examined; c is the child index, starting at
+	 * the preferred child and wrapping back to 0 past the last one.
+	 */
+	for (i = 0; i < vd->vdev_children; i++, c++) {
+		if (c >= vd->vdev_children)
+			c = 0;
+		if (mm[c].mm_tried || mm[c].mm_skipped)
+			continue;
+		cvd = vd->vdev_child[c];
+		if (vdev_is_dead(cvd)) {
+			mm[c].mm_error = ENXIO;
+			mm[c].mm_tried = 1;	/* don't even try */
+			mm[c].mm_skipped = 1;
+			continue;
+		}
+		if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+			return (c);
+		/* Stale child: skipped for now but left untried. */
+		mm[c].mm_error = ESTALE;
+		mm[c].mm_skipped = 1;
+	}
+
+	/*
+	 * Every device is either missing or has this txg in its DTL.
+	 * If we don't have any sibling replicas to consult, look for
+	 * any child we haven't already tried before giving up.
+	 */
+	if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
+		for (c = 0; c < vd->vdev_children; c++) {
+			if (!mm[c].mm_tried)
+				return (c);
+		}
+	}
+
+	/*
+	 * Every child failed.  There's no place left to look.
+	 */
+	return (-1);
+}
+
+/*
+ * Start a read or write on a mirror (or replacing) vdev by issuing child
+ * I/Os and then waiting for them via zio_wait_children_done().
+ *
+ *  - Scrub reads go to every child, each with its own buffer.
+ *  - Normal reads go to the single child picked by
+ *    vdev_mirror_child_select(), or to no child at all if it returns -1.
+ *  - Writes go to all children, except for the resilver-to-replacing case
+ *    described below, which writes only the last child.
+ *
+ * A fresh mirror map is allocated for every call; it is freed in
+ * vdev_mirror_io_done().
+ */
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	mirror_map_t *mm;
+	int c, children;
+
+	mm = vdev_mirror_map_alloc(zio);
+
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (zio->io_flags & ZIO_FLAG_SCRUB) {
+			/*
+			 * For scrubbing reads we need to allocate a read
+			 * buffer for each child and issue reads to all
+			 * children.  If any child succeeds, it will copy its
+			 * data into zio->io_data in vdev_mirror_scrub_done.
+			 */
+			for (c = 0; c < vd->vdev_children; c++) {
+				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+				    vd->vdev_child[c], zio->io_offset,
+				    zio_buf_alloc(zio->io_size), zio->io_size,
+				    zio->io_type, zio->io_priority,
+				    ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
+				    &mm[c]));
+			}
+			zio_wait_children_done(zio);
+			return;
+		}
+		/*
+		 * For normal reads just pick one child.
+		 * If none is available (c == -1), children is 0 and the
+		 * issue loop below does nothing.
+		 */
+		c = vdev_mirror_child_select(zio);
+		children = (c >= 0);
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+		/*
+		 * If this is a resilvering I/O to a replacing vdev,
+		 * only the last child should be written -- unless the
+		 * first child happens to have a DTL entry here as well.
+		 * All other writes go to all children.
+		 */
+		if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
+		    vd->vdev_ops == &vdev_replacing_ops &&
+		    !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+		    zio->io_txg, 1)) {
+			c = vd->vdev_children - 1;
+			children = 1;
+		} else {
+			c = 0;
+			children = vd->vdev_children;
+		}
+	}
+
+	/* Issue 'children' consecutive child I/Os starting at child c. */
+	while (children--) {
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    vd->vdev_child[c], zio->io_offset, zio->io_data,
+		    zio->io_size, zio->io_type, zio->io_priority,
+		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+		c++;
+	}
+
+	zio_wait_children_done(zio);
+}
+
+/*
+ * Completion stage for a mirror (or replacing) vdev I/O.
+ *
+ * The per-child results in the mirror map are folded into zio->io_error
+ * and zio->io_numerrors.  Then:
+ *
+ *  - Writes succeed if at least one child succeeded.
+ *  - Reads with no good copy are retried on the next child chosen by
+ *    vdev_mirror_child_select(); this function runs again when that
+ *    child completes.
+ *  - Reads with a good copy may rewrite the children that failed or are
+ *    stale, using the good data (only if the pool is writable and there
+ *    were unexpected errors or this is a resilver).
+ *
+ * The mirror map is freed on every path except the retry path.
+ */
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	mirror_map_t *mm = zio->io_vsd;
+	int c;
+	int good_copies = 0;
+	int unexpected_errors = 0;
+
+	ASSERT(mm != NULL);
+
+	zio->io_error = 0;
+	zio->io_numerrors = 0;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		if (mm[c].mm_tried && mm[c].mm_error == 0) {
+			good_copies++;
+			continue;
+		}
+
+		/*
+		 * We preserve any EIOs because those may be worth retrying;
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+		 */
+		if (mm[c].mm_error) {
+			if (zio->io_error != EIO)
+				zio->io_error = mm[c].mm_error;
+			/* Errors on children we chose to skip are expected. */
+			if (!mm[c].mm_skipped)
+				unexpected_errors++;
+			zio->io_numerrors++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * XXX -- for now, treat partial writes as success.
+		 */
+		/* XXPOLICY */
+		if (good_copies != 0)
+			zio->io_error = 0;
+		ASSERT(mm != NULL);
+		vdev_mirror_map_free(zio);
+		zio_next_stage(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * If we don't have a good copy yet, keep trying other children.
+	 */
+	/* XXPOLICY */
+	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+		ASSERT(c >= 0 && c < vd->vdev_children);
+		cvd = vd->vdev_child[c];
+		dprintf("%s: retrying i/o (err=%d) on child %s\n",
+		    vdev_description(zio->io_vd), zio->io_error,
+		    vdev_description(cvd));
+		zio->io_error = 0;
+		zio_vdev_io_redone(zio);
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+		    zio->io_offset, zio->io_data, zio->io_size,
+		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
+		    vdev_mirror_child_done, &mm[c]));
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	/* XXPOLICY */
+	if (good_copies)
+		zio->io_error = 0;
+	else
+		ASSERT(zio->io_error != 0);
+
+	if (good_copies && (spa_mode & FWRITE) &&
+	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		/*
+		 * Use the good data we have in hand to repair damaged children.
+		 */
+		for (c = 0; c < vd->vdev_children; c++) {
+			/*
+			 * Don't rewrite known good children.
+			 * Not only is it unnecessary, it could
+			 * actually be harmful: if the system lost
+			 * power while rewriting the only good copy,
+			 * there would be no good copies left!
+			 */
+			cvd = vd->vdev_child[c];
+
+			/*
+			 * An error-free child is rewritten only if it was
+			 * never read and its DTL contains this txg.
+			 */
+			if (mm[c].mm_error == 0) {
+				if (mm[c].mm_tried)
+					continue;
+				if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+				    zio->io_txg, 1))
+					continue;
+				mm[c].mm_error = ESTALE;
+			}
+
+			dprintf("%s resilvered %s @ 0x%llx error %d\n",
+			    vdev_description(vd),
+			    vdev_description(cvd),
+			    zio->io_offset, mm[c].mm_error);
+
+			/* Repair write: no done callback, no private data. */
+			zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
+			    zio->io_offset, zio->io_data, zio->io_size,
+			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
+		}
+	}
+
+	vdev_mirror_map_free(zio);
+	zio_next_stage(zio);
+}
+
+/*
+ * Derive the mirror's state from its children's fault counts:
+ * every child faulted -> CANT_OPEN (no replicas); any child faulted or
+ * degraded -> DEGRADED; otherwise HEALTHY.
+ */
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted == vd->vdev_children) {
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded + faulted != 0) {
+		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * Operations vector for mirror vdevs, built from the functions defined
+ * above plus vdev_default_asize.
+ * NOTE(review): the meaning of each positional slot is inferred from the
+ * function names; confirm against the vdev_ops_t declaration.
+ */
+vdev_ops_t vdev_mirror_ops = {
+	vdev_mirror_open,
+	vdev_mirror_close,
+	vdev_default_asize,
+	vdev_mirror_io_start,
+	vdev_mirror_io_done,
+	vdev_mirror_state_change,
+	VDEV_TYPE_MIRROR,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
+
+/*
+ * Operations vector for replacing vdevs.  It uses exactly the same
+ * functions as vdev_mirror_ops and differs only in the type name; the
+ * shared mirror code tells the two apart by comparing vd->vdev_ops with
+ * &vdev_replacing_ops.
+ */
+vdev_ops_t vdev_replacing_ops = {
+	vdev_mirror_open,
+	vdev_mirror_close,
+	vdev_default_asize,
+	vdev_mirror_io_start,
+	vdev_mirror_io_done,
+	vdev_mirror_state_change,
+	VDEV_TYPE_REPLACING,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c
new file mode 100644
index 000000000000..b35f4a5bcd03
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_missing.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import.  It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing.  We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/*
+ * "Open" a missing vdev.  Nothing is actually opened: the minimum device
+ * size and minimum block shift are reported and success is returned.
+ */
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	/*
+	 * Failing here would be the honest answer, but it would leave the
+	 * root vdev faulted with VDEV_AUX_NO_REPLICAS, whereas the state we
+	 * want reported is VDEV_AUX_BAD_GUID_SUM.  Claiming success is safe
+	 * because the GUID sum check fails before the pool is ever opened.
+	 */
+	*ashift = SPA_MINBLOCKSHIFT;
+	*psize = SPA_MINDEVSIZE;
+
+	return (0);
+}
+
+/*
+ * Close a missing vdev.  vdev_missing_open() acquires nothing, so there is
+ * nothing to release here.
+ */
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/*
+ * Start an I/O on a missing vdev.  No I/O is issued: the zio is marked
+ * with ENOTSUP and passed on via zio_next_stage_async().
+ */
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+	zio->io_error = ENOTSUP;
+	zio_next_stage_async(zio);
+}
+
+/*
+ * Completion stage for a missing vdev: nothing to post-process, so simply
+ * advance the zio to its next stage.
+ */
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+	zio_next_stage(zio);
+}
+
+/*
+ * Operations vector for the 'missing' placeholder vdev.  The sixth slot
+ * is NULL; NOTE(review): in vdev_mirror_ops that position holds the
+ * state-change handler -- confirm against the vdev_ops_t declaration.
+ */
+vdev_ops_t vdev_missing_ops = {
+	vdev_missing_open,
+	vdev_missing_close,
+	vdev_default_asize,
+	vdev_missing_io_start,
+	vdev_missing_io_done,
+	NULL,
+	VDEV_TYPE_MISSING,	/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
new file mode 100644
index 000000000000..09831e150459
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -0,0 +1,286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+/*
+ * Virtual device vector for disk I/O scheduling.
+ */
+/*
+ * AVL comparator ordering zios by deadline, then by offset, and finally
+ * by address so that two distinct zios never compare equal.
+ * Returns -1, 0 or 1.
+ */
+int
+vdev_queue_deadline_compare(const void *x1, const void *x2)
+{
+	const zio_t *lhs = x1;
+	const zio_t *rhs = x2;
+
+	if (lhs->io_deadline != rhs->io_deadline)
+		return (lhs->io_deadline < rhs->io_deadline ? -1 : 1);
+
+	if (lhs->io_offset != rhs->io_offset)
+		return (lhs->io_offset < rhs->io_offset ? -1 : 1);
+
+	if (lhs != rhs)
+		return (lhs < rhs ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * AVL comparator ordering zios by offset, with the zio's address as the
+ * tie-breaker so that two distinct zios never compare equal.
+ * Returns -1, 0 or 1.
+ */
+int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+	const zio_t *lhs = x1;
+	const zio_t *rhs = x2;
+
+	if (lhs->io_offset != rhs->io_offset)
+		return (lhs->io_offset < rhs->io_offset ? -1 : 1);
+
+	if (lhs != rhs)
+		return (lhs < rhs ? -1 : 1);
+
+	return (0);
+}
+
+/*
+ * Initialize a vdev's I/O queue: its lock, the deadline-ordered tree, and
+ * the three offset-ordered trees (reads, writes, pending).  The three
+ * offset trees all link zios through the same io_offset_node field.
+ */
+void
+vdev_queue_init(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	size_t deadline_link = offsetof(struct zio, io_deadline_node);
+	size_t offset_link = offsetof(struct zio, io_offset_node);
+
+	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
+	    sizeof (zio_t), deadline_link);
+
+	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+
+	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+
+	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offset_link);
+}
+
+/*
+ * Tear down a vdev's I/O queue: destroy the four AVL trees created by
+ * vdev_queue_init(), then the queue lock.
+ */
+void
+vdev_queue_fini(vdev_t *vd)
+{
+	vdev_queue_t *queue = &vd->vdev_queue;
+
+	avl_destroy(&queue->vq_deadline_tree);
+	avl_destroy(&queue->vq_read_tree);
+	avl_destroy(&queue->vq_write_tree);
+	avl_destroy(&queue->vq_pending_tree);
+
+	mutex_destroy(&queue->vq_lock);
+}
+
+/*
+ * Completion callback for an aggregated I/O.  Walks the delegate list in
+ * order; for reads, each delegate receives its slice of the aggregate
+ * buffer.  Every delegate inherits the aggregate's error, is unlinked
+ * from the list, and is advanced to its next stage.  Finally the
+ * aggregate buffer is freed.
+ */
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+	uint64_t offset = 0;
+	zio_t *dio;
+
+	for (dio = aio->io_delegate_list; dio != NULL;
+	    dio = aio->io_delegate_list) {
+		zio_t *next = dio->io_delegate_next;
+
+		if (aio->io_type == ZIO_TYPE_READ) {
+			bcopy((char *)aio->io_data + offset, dio->io_data,
+			    dio->io_size);
+		}
+		offset += dio->io_size;
+
+		/* Unlink before handing the delegate onward. */
+		aio->io_delegate_list = next;
+		dio->io_delegate_next = NULL;
+		dio->io_error = aio->io_error;
+		zio_next_stage(dio);
+	}
+	ASSERT3U(offset, ==, aio->io_size);
+
+	zio_buf_free(aio->io_data, aio->io_size);
+}
+
+/*
+ * True if 'nio' begins exactly where 'io' ends, i.e. the two I/Os cover
+ * contiguous offsets with 'io' first.
+ */
+#define	IS_ADJACENT(io, nio) \
+	((io)->io_offset + (io)->io_size == (nio)->io_offset)
+
+/*
+ * Signature of the function used to issue a zio selected by
+ * vdev_queue_io_to_issue() (zio_nowait or zio_next_stage).
+ */
+typedef void zio_issue_func_t(zio_t *);
+
+/*
+ * Pick the next I/O to issue from the queue.  Caller must hold vq_lock.
+ *
+ * Returns NULL (with *funcp set to NULL) if the pending tree already
+ * holds at least pending_limit I/Os or nothing is queued.  Otherwise the
+ * I/O with the earliest deadline is chosen, and neighbours in its
+ * per-type offset tree that are exactly adjacent are gathered around it
+ * as long as the combined size stays within vq_agg_limit.
+ *
+ * If more than one I/O was gathered, a new aggregate child zio covering
+ * the whole range is created and returned, with *funcp = zio_nowait; the
+ * gathered I/Os are removed from the queue trees and chained on the
+ * aggregate's delegate list.  Otherwise the single I/O is returned with
+ * *funcp = zio_next_stage.  In both cases the returned zio has been
+ * added to the pending tree.
+ */
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
+	zio_issue_func_t **funcp)
+{
+	zio_t *fio, *lio, *aio, *dio;
+	avl_tree_t *tree;
+	uint64_t size;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	*funcp = NULL;
+
+	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
+	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+		return (NULL);
+
+	/* fio/lio track the first and last I/O of the run being built. */
+	fio = lio = avl_first(&vq->vq_deadline_tree);
+
+	tree = fio->io_vdev_tree;
+	size = fio->io_size;
+
+	/* Extend the run towards lower offsets. */
+	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		dio->io_delegate_next = fio;
+		fio = dio;
+		size += dio->io_size;
+	}
+
+	/* Extend the run towards higher offsets. */
+	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
+	    size + dio->io_size <= vq->vq_agg_limit) {
+		lio->io_delegate_next = dio;
+		lio = dio;
+		size += dio->io_size;
+	}
+
+	if (fio != lio) {
+		char *buf = zio_buf_alloc(size);
+		uint64_t offset = 0;
+		int nagg = 0;
+
+		ASSERT(size <= vq->vq_agg_limit);
+
+		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
+		    fio->io_offset, buf, size, fio->io_type,
+		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE,
+		    vdev_queue_agg_io_done, NULL);
+
+		aio->io_delegate_list = fio;
+
+		/*
+		 * Walk the run: gather write data into the aggregate
+		 * buffer and pull each delegate out of the queue trees.
+		 */
+		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+			ASSERT(dio->io_type == aio->io_type);
+			if (dio->io_type == ZIO_TYPE_WRITE)
+				bcopy(dio->io_data, buf + offset, dio->io_size);
+			offset += dio->io_size;
+			avl_remove(&vq->vq_deadline_tree, dio);
+			avl_remove(tree, dio);
+			zio_vdev_io_bypass(dio);
+			nagg++;
+		}
+
+		ASSERT(offset == size);
+
+		dprintf("%5s  T=%llu  off=%8llx  agg=%3d  "
+		    "old=%5llx  new=%5llx\n",
+		    zio_type_name[fio->io_type],
+		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);
+
+		avl_add(&vq->vq_pending_tree, aio);
+
+		*funcp = zio_nowait;
+		return (aio);
+	}
+
+	/* Nothing to aggregate: issue the single I/O as-is. */
+	avl_remove(&vq->vq_deadline_tree, fio);
+	avl_remove(tree, fio);
+
+	avl_add(&vq->vq_pending_tree, fio);
+
+	*funcp = zio_next_stage;
+
+	return (fio);
+}
+
+/*
+ * Enqueue a read or write on its vdev's queue and see whether anything
+ * can be issued right away.
+ *
+ * A zio already flagged ZIO_FLAG_DONT_QUEUE is returned untouched.
+ * Otherwise the zio is flagged DONT_CACHE | DONT_QUEUE, given a deadline,
+ * inserted into the deadline tree and the read or write tree, and
+ * vdev_queue_io_to_issue() is consulted with the vq_min_pending limit.
+ *
+ * Returns the zio the caller should carry on with, or NULL if there is
+ * none -- either nothing is issuable, or an aggregate was built and has
+ * already been issued here.
+ */
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_issue_func_t *issue;
+	zio_t *next;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+		return (zio);
+
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+	zio->io_vdev_tree = (zio->io_type == ZIO_TYPE_READ) ?
+	    &vq->vq_read_tree : &vq->vq_write_tree;
+
+	mutex_enter(&vq->vq_lock);
+
+	zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
+	    zio->io_priority;
+
+	avl_add(&vq->vq_deadline_tree, zio);
+	avl_add(zio->io_vdev_tree, zio);
+
+	next = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &issue);
+
+	mutex_exit(&vq->vq_lock);
+
+	/* Aggregates are issued here; anything else goes to the caller. */
+	if (next != NULL && issue == zio_nowait) {
+		issue(next);
+		return (NULL);
+	}
+
+	return (next);
+}
+
+/*
+ * Called when a queued I/O completes.  Removes the zio from the pending
+ * tree, then issues up to vq_ramp_rate further I/Os chosen by
+ * vdev_queue_io_to_issue() with the vq_max_pending limit.
+ */
+void
+vdev_queue_io_done(zio_t *zio)
+{
+	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
+	zio_issue_func_t *func;
+	int i;
+
+	mutex_enter(&vq->vq_lock);
+
+	avl_remove(&vq->vq_pending_tree, zio);
+
+	for (i = 0; i < vq->vq_ramp_rate; i++) {
+		nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func);
+		if (nio == NULL)
+			break;
+		/* Issue with the queue lock dropped; retake it afterwards. */
+		mutex_exit(&vq->vq_lock);
+		/* Non-aggregated I/Os are reissued before being advanced. */
+		if (func == zio_next_stage)
+			zio_vdev_io_reissue(nio);
+		func(nio);
+		mutex_enter(&vq->vq_lock);
+	}
+
+	mutex_exit(&vq->vq_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..54547a3c9704
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -0,0 +1,599 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for RAID-Z.
+ */
+
+/*
+ * We currently allow up to two-way replication (i.e. single-fault
+ * reconstruction) models in RAID-Z vdevs.  The blocks in such vdevs
+ * must all be multiples of two times the leaf vdev blocksize.
+ */
+#define	VDEV_RAIDZ_ALIGN	2ULL
+
+/*
+ * One column of a RAID-Z I/O: the piece of the block that lives on a
+ * single child vdev.
+ */
+typedef struct raidz_col {
+	uint64_t	rc_col;		/* index into vd->vdev_child[] */
+	uint64_t	rc_offset;	/* offset of the child I/O */
+	uint64_t	rc_size;	/* size of the child I/O in bytes */
+	void		*rc_data;	/* buffer for this column */
+	int		rc_error;	/* error recorded for this column */
+	short		rc_tried;	/* child I/O completed, or child dead */
+	short		rc_skipped;	/* child passed over (dead or in DTL) */
+} raidz_col_t;
+
+/*
+ * Column map for one RAID-Z I/O, built by vdev_raidz_map_alloc() and kept
+ * in zio->io_vsd.  rm_col[] is declared with one element but allocated
+ * with rm_cols elements (see the offsetof() sizing in map_alloc/map_free).
+ */
+typedef struct raidz_map {
+	uint64_t	rm_cols;	/* number of entries in rm_col[] */
+	uint64_t	rm_bigcols;	/* columns carrying one extra unit */
+	uint64_t	rm_asize;	/* sum of column sizes, rounded up */
+	int		rm_missing_child; /* skipped column index, or -1 */
+	int		rm_type;	/* RAIDZ_SINGLE or RAIDZ_PARITY */
+	int		rm_firstdatacol; /* index of first data column */
+	raidz_col_t	rm_col[1];	/* really rm_cols entries */
+} raidz_map_t;
+
+/*
+ * Layout types accepted by vdev_raidz_map_alloc(): RAIDZ_SINGLE has no
+ * parity column (first data column is 0); RAIDZ_PARITY places one parity
+ * column ahead of the data (first data column is 1).
+ */
+#define	RAIDZ_SINGLE	0
+#define	RAIDZ_PARITY	1
+
+/*
+ * Build the column map describing how a zio is laid out across the
+ * children of a RAID-Z vdev, store it in zio->io_vsd, and return it.
+ *
+ *	zio		the I/O being mapped (io_offset, io_size, io_data)
+ *	unit_shift	log2 of the allocation unit; offsets and sizes are
+ *			converted to units with this shift
+ *	dcols		number of columns (children) in the vdev
+ *	raid_type	RAIDZ_SINGLE or RAIDZ_PARITY
+ *
+ * Columns before rm_firstdatacol get freshly allocated buffers (released
+ * by vdev_raidz_map_free()); the data columns point consecutively into
+ * zio->io_data.
+ *
+ * An unrecognized raid_type trips an assertion on DEBUG kernels and is
+ * otherwise treated as RAIDZ_PARITY, so the layout variables are never
+ * used uninitialized.
+ */
+static raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
+	int raid_type)
+{
+	raidz_map_t *rm;
+	uint64_t b = zio->io_offset >> unit_shift;
+	uint64_t s = zio->io_size >> unit_shift;
+	uint64_t f = b % dcols;
+	uint64_t o = (b / dcols) << unit_shift;
+	uint64_t q, r, c, bc, col, acols, coff;
+	int firstdatacol;
+
+	ASSERT(raid_type == RAIDZ_SINGLE || raid_type == RAIDZ_PARITY);
+
+	/*
+	 * q is the number of full rows, r the units left over, and bc the
+	 * number of "big" columns that carry one unit more than the rest.
+	 */
+	switch (raid_type) {
+	case RAIDZ_SINGLE:
+		q = s / dcols;
+		r = s - q * dcols;
+		bc = r;
+		firstdatacol = 0;
+		break;
+	default:
+		/* Unknown layout: fall back to the parity layout. */
+		raid_type = RAIDZ_PARITY;
+		/* FALLTHROUGH */
+	case RAIDZ_PARITY:
+		/* One column per row is parity; dcols - 1 must be nonzero. */
+		ASSERT(dcols > 1);
+		q = s / (dcols - 1);
+		r = s - q * (dcols - 1);
+		bc = r + !!r;
+		firstdatacol = 1;
+		break;
+	}
+
+	acols = (q == 0 ? bc : dcols);
+
+	/* rm_col[firstdatacol] is written below, so it must exist. */
+	ASSERT3U(acols, >, firstdatacol);
+
+	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
+
+	rm->rm_cols = acols;
+	rm->rm_bigcols = bc;
+	rm->rm_asize = 0;
+	rm->rm_missing_child = -1;
+	rm->rm_type = raid_type;
+	rm->rm_firstdatacol = firstdatacol;
+
+	for (c = 0; c < acols; c++) {
+		col = f + c;
+		coff = o;
+		/* Wrap past the last child onto the next row. */
+		if (col >= dcols) {
+			col -= dcols;
+			coff += 1ULL << unit_shift;
+		}
+		rm->rm_col[c].rc_col = col;
+		rm->rm_col[c].rc_offset = coff;
+		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
+		rm->rm_col[c].rc_data = NULL;
+		rm->rm_col[c].rc_error = 0;
+		rm->rm_col[c].rc_tried = 0;
+		rm->rm_col[c].rc_skipped = 0;
+		rm->rm_asize += rm->rm_col[c].rc_size;
+	}
+
+	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
+
+	/* Parity columns get their own buffers ... */
+	for (c = 0; c < rm->rm_firstdatacol; c++)
+		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+
+	/* ... while data columns are consecutive slices of zio->io_data. */
+	rm->rm_col[c].rc_data = zio->io_data;
+
+	for (c = c + 1; c < acols; c++)
+		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
+		    rm->rm_col[c - 1].rc_size;
+
+	if (raid_type == RAIDZ_PARITY) {
+		/*
+		 * To prevent hot parity disks, switch the parity and data
+		 * columns every 1MB.
+		 */
+		ASSERT(rm->rm_cols >= 2);
+		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+
+		if (zio->io_offset & (1ULL << 20)) {
+			col = rm->rm_col[0].rc_col;
+			o = rm->rm_col[0].rc_offset;
+			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
+			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+			rm->rm_col[1].rc_col = col;
+			rm->rm_col[1].rc_offset = o;
+		}
+	}
+
+	zio->io_vsd = rm;
+	return (rm);
+}
+
+/*
+ * Free the RAID-Z map attached to the zio: first the buffers owned by the
+ * columns ahead of rm_firstdatacol, then the map itself, and clear
+ * zio->io_vsd.  Data-column buffers belong to the zio and are left alone.
+ */
+static void
+vdev_raidz_map_free(zio_t *zio)
+{
+	raidz_map_t *map = zio->io_vsd;
+	int col = 0;
+
+	while (col < map->rm_firstdatacol) {
+		zio_buf_free(map->rm_col[col].rc_data,
+		    map->rm_col[col].rc_size);
+		col++;
+	}
+
+	kmem_free(map, offsetof(raidz_map_t, rm_col[map->rm_cols]));
+	zio->io_vsd = NULL;
+}
+
+/*
+ * Recompute column x of the map as the XOR of all the other columns,
+ * overwriting rm_col[x].rc_data.  The same routine generates parity
+ * (x = 0 on write) and rebuilds a column on read.
+ *
+ * The first column visited other than x (column 0, or column 1 when
+ * x == 0) is copied into x; every later column is XORed in.  Data is
+ * processed in 64-bit words, and for each source column only
+ * MIN(source size, x size) bytes are used.
+ */
+static void
+vdev_raidz_reconstruct(raidz_map_t *rm, int x)
+{
+	uint64_t *dst, *src, count, xsize, csize;
+	int i, c;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		if (c == x)
+			continue;
+		/* dst restarts at the beginning of column x each pass. */
+		src = rm->rm_col[c].rc_data;
+		dst = rm->rm_col[x].rc_data;
+		csize = rm->rm_col[c].rc_size;
+		xsize = rm->rm_col[x].rc_size;
+		count = MIN(csize, xsize) / sizeof (uint64_t);
+		/* !x is 1 when x == 0 and 0 otherwise. */
+		if (c == !x) {
+			/*
+			 * The initial copy happens at either c == 0 or c == 1.
+			 * Both of these columns are 'big' columns, so we'll
+			 * definitely initialize all of column x.
+			 */
+			ASSERT3U(xsize, <=, csize);
+			for (i = 0; i < count; i++)
+				*dst++ = *src++;
+		} else {
+			for (i = 0; i < count; i++)
+				*dst++ ^= *src++;
+		}
+	}
+}
+
+/*
+ * Open every child of a RAID-Z vdev.
+ *
+ * For each child that opens, *asize is lowered to that child's vdev_asize
+ * if it is smaller and *ashift is set to that child's vdev_ashift; the
+ * resulting *asize is then multiplied by the number of children.
+ *
+ * Returns EINVAL if there are fewer than two children, the last child
+ * error if more than one child failed to open, and 0 otherwise.
+ */
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	int failures = 0;
+	int lasterror = 0;
+	int c;
+
+	/*
+	 * XXX -- minimum children should be raid-type-specific
+	 */
+	if (vd->vdev_children < 2) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *child = vd->vdev_child[c];
+		int error = vdev_open(child);
+
+		if (error == 0) {
+			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
+			*ashift = child->vdev_ashift;
+		} else {
+			lasterror = error;
+			failures++;
+		}
+	}
+
+	*asize *= vd->vdev_children;
+
+	/* A single failed child can be tolerated. */
+	if (failures <= 1)
+		return (0);
+
+	vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+	return (lasterror);
+}
+
+/*
+ * Close each child of the RAID-Z vdev in index order.
+ */
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+	int idx = 0;
+
+	while (idx < vd->vdev_children) {
+		vdev_close(vd->vdev_child[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Convert a physical size into the space it occupies on a RAID-Z vdev.
+ * Working in units of (1 << vdev_ashift): the data units, plus one parity
+ * unit for every (children - 1) data units (rounded up), rounded up to a
+ * multiple of VDEV_RAIDZ_ALIGN, then converted back to bytes.
+ *
+ * These calculations assume RAIDZ_PARITY.
+ */
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+	uint64_t ncols = vd->vdev_children;
+	uint64_t units = psize >> vd->vdev_ashift;
+
+	units += (units + ncols - 2) / (ncols - 1);
+
+	return (P2ROUNDUP(units, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift);
+}
+
+/*
+ * Completion callback for a RAID-Z child I/O.  zio->io_private points at
+ * the column's raidz_col_t; mark it as tried (and not skipped) and record
+ * the child's error.
+ */
+static void
+vdev_raidz_child_done(zio_t *zio)
+{
+	raidz_col_t *column = zio->io_private;
+
+	column->rc_skipped = 0;
+	column->rc_tried = 1;
+	column->rc_error = zio->io_error;
+}
+
+/*
+ * Completion callback that frees the zio's data buffer.
+ * NOTE(review): by its name this is meant for repair writes that own a
+ * private buffer; no caller is visible in this part of the file.
+ */
+static void
+vdev_raidz_repair_done(zio_t *zio)
+{
+	zio_buf_free(zio->io_data, zio->io_size);
+}
+
+/*
+ * Start a read or write on a RAID-Z vdev.
+ *
+ * A column map is built for the zio (always with RAIDZ_PARITY).  Writes
+ * compute parity into column 0 and then issue a child write for every
+ * column.  Reads walk the columns from last to first, skipping children
+ * that are dead or whose DTL contains the block's birth txg; data
+ * columns are always read, while a parity column is read only if a
+ * higher-numbered column was skipped or this is a scrub.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	blkptr_t *bp = zio->io_bp;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c;
+
+	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
+	    RAIDZ_PARITY);
+
+	/* Sanity-check the map's size against the block's DVA. */
+	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
+		ASSERT3U(rm->rm_asize, ==,
+		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
+		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	} else {
+		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
+		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+
+		/*
+		 * Generate RAID parity in virtual column 0.
+		 */
+		vdev_raidz_reconstruct(rm, 0);
+
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * Walk the columns backwards so that by the time the parity column
+	 * is reached, rm_missing_child already reflects any skipped column
+	 * after it.
+	 */
+	for (c = rm->rm_cols - 1; c >= 0; c--) {
+		rc = &rm->rm_col[c];
+		cvd = vd->vdev_child[rc->rc_col];
+		if (vdev_is_dead(cvd)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ENXIO;
+			rc->rc_tried = 1;	/* don't even try */
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+			rm->rm_missing_child = c;
+			rc->rc_error = ESTALE;
+			rc->rc_skipped = 1;
+			continue;
+		}
+		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
+		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+	}
+
+	zio_wait_children_done(zio);
+}
+
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *cvd;
+	raidz_map_t *rm = zio->io_vsd;
+	raidz_col_t *rc;
+	blkptr_t *bp = zio->io_bp;
+	int unexpected_errors = 0;
+	int c;
+
+	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
+
+	zio->io_error = 0;
+	zio->io_numerrors = 0;
+
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+
+		/*
+		 * We preserve any EIOs because those may be worth retrying;
+		 * whereas ECKSUM and ENXIO are more likely to be persistent.
+		 */
+		if (rc->rc_error) {
+			if (zio->io_error != EIO)
+				zio->io_error = rc->rc_error;
+			if (!rc->rc_skipped)
+				unexpected_errors++;
+			zio->io_numerrors++;
+		}
+	}
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		/*
+		 * If this is not a failfast write, and we were able to
+		 * write enough columns to reconstruct the data, good enough.
+		 */
+		/* XXPOLICY */
+		if (zio->io_numerrors <= rm->rm_firstdatacol &&
+		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
+			zio->io_error = 0;
+
+		vdev_raidz_map_free(zio);
+		zio_next_stage(zio);
+		return;
+	}
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+	/*
+	 * If there were no I/O errors, and the data checksums correctly,
+	 * the read is complete.
+	 */
+	/* XXPOLICY */
+	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
+		ASSERT(unexpected_errors == 0);
+		ASSERT(zio->io_error == 0);
+
+		/*
+		 * We know the data's good.  If we read the parity,
+		 * verify that it's good as well.  If not, fix it.
+		 */
+		for (c = 0; c < rm->rm_firstdatacol; c++) {
+			void *orig;
+			rc = &rm->rm_col[c];
+			if (!rc->rc_tried)
+				continue;
+			orig = zio_buf_alloc(rc->rc_size);
+			bcopy(rc->rc_data, orig, rc->rc_size);
+			vdev_raidz_reconstruct(rm, c);
+			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+				rc->rc_error = ECKSUM;
+				unexpected_errors++;
+			}
+			zio_buf_free(orig, rc->rc_size);
+		}
+		goto done;
+	}
+
+	/*
+	 * If there was exactly one I/O error, it's the one we expected,
+	 * and the reconstructed data checksums, the read is complete.
+	 * This happens when one child is offline and vdev_fault_assess()
+	 * knows it, or when one child has stale data and the DTL knows it.
+	 */
+	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
+		rc = &rm->rm_col[c];
+		ASSERT(unexpected_errors == 0);
+		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
+		vdev_raidz_reconstruct(rm, c);
+		if (zio_checksum_error(zio) == 0) {
+			zio->io_error = 0;
+			goto done;
+		}
+	}
+
+	/*
+	 * This isn't a typical error -- either we got a read error or
+	 * more than one child claimed a problem.  Read every block we
+	 * haven't already so we can try combinatorial reconstruction.
+	 */
+	unexpected_errors = 1;
+	rm->rm_missing_child = -1;
+
+	for (c = 0; c < rm->rm_cols; c++)
+		if (!rm->rm_col[c].rc_tried)
+			break;
+
+	if (c != rm->rm_cols) {
+		zio->io_error = 0;
+		zio_vdev_io_redone(zio);
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			if (rc->rc_tried)
+				continue;
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[rc->rc_col],
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
+			    vdev_raidz_child_done, rc));
+		}
+		zio_wait_children_done(zio);
+		return;
+	}
+
+	/*
+	 * If there were more errors than parity disks, give up.
+	 */
+	if (zio->io_numerrors > rm->rm_firstdatacol) {
+		ASSERT(zio->io_error != 0);
+		goto done;
+	}
+
+	/*
+	 * The number of I/O errors is correctable.  Correct them here.
+	 */
+	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
+	for (c = 0; c < rm->rm_cols; c++) {
+		rc = &rm->rm_col[c];
+		ASSERT(rc->rc_tried);
+		if (rc->rc_error) {
+			vdev_raidz_reconstruct(rm, c);
+			if (zio_checksum_error(zio) == 0)
+				zio->io_error = 0;
+			else
+				zio->io_error = rc->rc_error;
+			goto done;
+		}
+	}
+
+	/*
+	 * There were no I/O errors, but the data doesn't checksum.
+	 * Try all permutations to see if we can find one that does.
+	 */
+	ASSERT(zio->io_numerrors == 0);
+	for (c = 0; c < rm->rm_cols; c++) {
+		void *orig;
+		rc = &rm->rm_col[c];
+
+		orig = zio_buf_alloc(rc->rc_size);
+		bcopy(rc->rc_data, orig, rc->rc_size);
+		vdev_raidz_reconstruct(rm, c);
+
+		if (zio_checksum_error(zio) == 0) {
+			zio_buf_free(orig, rc->rc_size);
+			zio->io_error = 0;
+			/*
+			 * If this child didn't know that it returned bad data,
+			 * inform it.
+			 */
+			if (rc->rc_tried && rc->rc_error == 0)
+				vdev_checksum_error(zio,
+				    vd->vdev_child[rc->rc_col]);
+			rc->rc_error = ECKSUM;
+			goto done;
+		}
+
+		bcopy(orig, rc->rc_data, rc->rc_size);
+		zio_buf_free(orig, rc->rc_size);
+	}
+
+	/*
+	 * All combinations failed to checksum.
+	 */
+	zio->io_error = ECKSUM;
+
+done:
+	zio_checksum_verified(zio);
+
+	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		/*
+		 * Use the good data we have in hand to repair damaged children.
+		 */
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_col];
+
+			if (rc->rc_error) {
+				/*
+				 * Make a copy of the data because we're
+				 * going to free the RAID-Z map below.
+				 */
+				void *data = zio_buf_alloc(rc->rc_size);
+				bcopy(rc->rc_data, data, rc->rc_size);
+
+				dprintf("%s resilvered %s @ 0x%llx error %d\n",
+				    vdev_description(vd),
+				    vdev_description(cvd),
+				    zio->io_offset, rc->rc_error);
+
+				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+				    rc->rc_offset, data, rc->rc_size,
+				    ZIO_TYPE_WRITE, zio->io_priority,
+				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
+				    ZIO_FLAG_DONT_PROPAGATE,
+				    vdev_raidz_repair_done, NULL));
+			}
+		}
+	}
+
+	vdev_raidz_map_free(zio);
+	zio_next_stage(zio);
+}
+
+/*
+ * Set this vdev's state from the faulted/degraded counts supplied by the
+ * caller: more than one faulted -> CANT_OPEN (no replicas); nothing
+ * faulted or degraded -> HEALTHY; anything in between -> DEGRADED.
+ */
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 1) {
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded + faulted == 0) {
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+	vdev_raidz_open,		/* open */
+	vdev_raidz_close,		/* close */
+	vdev_raidz_asize,		/* asize */
+	vdev_raidz_io_start,		/* io_start */
+	vdev_raidz_io_done,		/* io_done */
+	vdev_raidz_state_change,	/* state change */
+	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
new file mode 100644
index 000000000000..4e44b5bb051b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+/*
+ * Open the root vdev by opening each of its children.
+ *
+ * Every child that opens successfully adds its vdev_asize to *asize and
+ * raises *ashift to at least its vdev_ashift; both outputs are
+ * accumulated into, not initialized, here.  A root with no children
+ * fails with EINVAL (aux state BAD_LABEL).  If any child fails to open,
+ * the aux state becomes NO_REPLICAS and the error of the last failing
+ * child is returned; otherwise the result is 0.
+ */
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+{
+	int child;
+	int rv = 0;
+
+	if (vd->vdev_children == 0) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	for (child = 0; child < vd->vdev_children; child++) {
+		vdev_t *kid = vd->vdev_child[child];
+		int kid_err = vdev_open(kid);
+
+		if (kid_err == 0) {
+			*asize += kid->vdev_asize;
+			*ashift = MAX(*ashift, kid->vdev_ashift);
+		} else {
+			/* remember the failure but keep opening the rest */
+			rv = kid_err;
+		}
+	}
+
+	if (rv != 0)
+		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+
+	return (rv);
+}
+
+/*
+ * Close the root vdev by closing each child, in index order.
+ */
+static void
+vdev_root_close(vdev_t *vd)
+{
+	int child = 0;
+
+	while (child < vd->vdev_children) {
+		vdev_close(vd->vdev_child[child]);
+		child++;
+	}
+}
+
+/*
+ * Set the root vdev's state from the faulted/degraded counts supplied by
+ * the caller: anything faulted -> CANT_OPEN (no replicas); otherwise
+ * anything degraded -> DEGRADED; otherwise HEALTHY.
+ */
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	if (faulted > 0) {
+		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded == 0) {
+		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+}
+
+vdev_ops_t vdev_root_ops = {
+	vdev_root_open,		/* open */
+	vdev_root_close,	/* close */
+	vdev_default_asize,	/* asize - uses the default implementation */
+	NULL,			/* io_start - not applicable to the root */
+	NULL,			/* io_done - not applicable to the root */
+	vdev_root_state_change,	/* state change */
+	VDEV_TYPE_ROOT,		/* name of this vdev type */
+	B_FALSE			/* not a leaf vdev */
+};
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
new file mode 100644
index 000000000000..1eddb9c250cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -0,0 +1,1010 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power-of-2 number of pointers
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix equal to i.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+#define	MIN_FREE (ZAP_LEAF_NUMCHUNKS*9/10)
+
+static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
+static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
+static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
+    dmu_tx_t *tx, krw_t lt);
+static void zap_put_leaf(zap_leaf_t *l);
+static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+
+
+/*
+ * Byteswap one zap block in place.  The first 64-bit word of the block
+ * is its type: leaf blocks (in either byte order) are handed to
+ * zap_leaf_byteswap(); everything else (header or pointer-table block)
+ * is swapped as a plain array of 64-bit words.
+ */
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t type;
+
+	ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+	type = *(uint64_t *)vbuf;
+
+	if (type == ZBT_LEAF || type == BSWAP_64(ZBT_LEAF)) {
+		zap_leaf_byteswap(vbuf);
+	} else {
+		/* header block, or untyped data: it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, 1<<ZAP_BLOCK_SHIFT);
+	}
+}
+
+/*
+ * Turn the zap into a (non-micro) zap: clear zap_ismicro, rewrite the
+ * header block from scratch, point every entry of the embedded pointer
+ * table at block 1, and run zap_leaf_init() on block 1 so that it becomes
+ * the first leaf.  The caller must hold zap_rwlock as writer.
+ */
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	zap_leaf_t *l;
+	int i;
+	zap_phys_t *zp;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	zap->zap_ismicro = FALSE;
+
+	/* re-register the dbuf user so that zap_f.zap_phys tracks the data */
+	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
+	    &zap->zap_f.zap_phys, zap_pageout);
+
+	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+
+	zp = zap->zap_f.zap_phys;
+	/*
+	 * explicitly zero it since it might be coming from an
+	 * initialized microzap
+	 */
+	ASSERT3U(sizeof (zap_phys_t), ==, zap->zap_dbuf->db_size);
+	bzero(zp, sizeof (zap_phys_t));
+	zp->zap_block_type = ZBT_HEADER;
+	zp->zap_magic = ZAP_MAGIC;
+
+	zp->zap_ptrtbl.zt_shift = ZAP_PTRTBL_MIN_SHIFT;
+
+	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
+	zp->zap_num_leafs = 1;
+	zp->zap_num_entries = 0;
+	zp->zap_salt = zap->zap_salt;
+
+	for (i = 0; i < (1<<ZAP_PTRTBL_MIN_SHIFT); i++)
+		zp->zap_leafs[i] = 1;	/* block 1 will be the first leaf */
+
+	/*
+	 * set up block 1 - the first leaf
+	 */
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    1<<ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+
+	/*
+	 * A throw-away zap_leaf_t is all zap_leaf_init() needs to reach
+	 * the buffer; it is freed again right after the call.
+	 */
+	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l->l_dbuf = db;
+	l->l_phys = db->db_data;
+
+	zap_leaf_init(l);
+
+	kmem_free(l, sizeof (zap_leaf_t));
+	dmu_buf_rele(db);
+}
+
+/*
+ * Try to end up holding zap_rwlock as writer without blocking.  Returns 1
+ * if the lock is (now) write-held, 0 if the upgrade attempt failed.  When
+ * the lock is upgraded here, the header buffer is also dirtied in tx.
+ */
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
+		if (!rw_tryupgrade(&zap->zap_rwlock))
+			return (0);
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+	}
+	return (1);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+/*
+ * Perform one step of doubling the on-disk table described by tbl.
+ *
+ * On the first step of a grow cycle (zt_nextblk == 0) space for a table
+ * twice the current size is allocated and the old blocks are prefetched.
+ * Every step then copies one old block (number zt_blks_copied) into two
+ * new blocks, using transfer_func(src, dst, n) to expand n old entries.
+ * Once the last old block has been copied, the old range is freed and
+ * tbl is switched over to the new location with zt_shift incremented.
+ *
+ * The caller must hold zap_rwlock as writer, and the table must already
+ * live in its own blocks (zt_blk != 0, zt_numblks > 0).
+ */
+static void
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t b, newblk;
+	dmu_buf_t *db_old, *db_new;
+	int hepb = 1<<(ZAP_BLOCK_SHIFT-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		/* a grow is already in progress; continue into its blocks */
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2, tx);
+		tbl->zt_nextblk = newblk;
+		ASSERT3U(tbl->zt_blks_copied, ==, 0);
+		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT, tbl->zt_numblks <<
+		    ZAP_BLOCK_SHIFT);
+	}
+
+	/*
+	 * Copy one block of the table from the old to the new location;
+	 * transfer_func decides how each old entry fills its two new slots.
+	 */
+
+	b = tbl->zt_blks_copied;
+	db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + b) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db_old);
+
+	/* first half of entries in old[b] go to new[2*b+0] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+0) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func(db_old->db_data, db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	/* second half of entries in old[b] go to new[2*b+1] */
+	db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (newblk + 2*b+1) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db_new, tx);
+	transfer_func((uint64_t *)db_old->db_data + hepb,
+	    db_new->db_data, hepb);
+	dmu_buf_rele(db_new);
+
+	dmu_buf_rele(db_old);
+
+	tbl->zt_blks_copied++;
+
+	dprintf("copied block %llu of %llu\n",
+	    tbl->zt_blks_copied, tbl->zt_numblks);
+
+	if (tbl->zt_blks_copied == tbl->zt_numblks) {
+		/* every old block has been copied: retire the old table */
+		dmu_free_range(zap->zap_objset, zap->zap_object,
+		    tbl->zt_blk << ZAP_BLOCK_SHIFT,
+		    tbl->zt_numblks << ZAP_BLOCK_SHIFT, tx);
+
+		tbl->zt_blk = newblk;
+		tbl->zt_numblks *= 2;
+		tbl->zt_shift++;
+		tbl->zt_nextblk = 0;
+		tbl->zt_blks_copied = 0;
+
+		/*
+		 * The entry count is shifted as a 64-bit value so that the
+		 * argument matches its %llu conversion.
+		 */
+		dprintf("finished; numblocks now %llu (%lluk entries)\n",
+		    tbl->zt_numblks, 1ULL<<(tbl->zt_shift-10));
+	}
+}
+
+/*
+ * Store val at index idx of the table described by tbl and return the
+ * value that was stored there before.  The table must live in its own
+ * blocks (zt_blk != 0) and the caller must hold zap_rwlock.
+ *
+ * While a grow is in progress (zt_nextblk != 0) the value is also written
+ * to slots 2*idx and 2*idx+1 of the new table -- presumably so that the
+ * new copy does not miss the update.
+ */
+static uint64_t
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+    dmu_tx_t *tx)
+{
+	uint64_t blk, off, oldval;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+
+	dprintf("storing %llx at index %llx\n", val, idx);
+
+	/* a block holds 1<<(ZAP_BLOCK_SHIFT-3) 8-byte entries */
+	blk = idx >> (ZAP_BLOCK_SHIFT-3);
+	off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blk) << ZAP_BLOCK_SHIFT);
+	dmu_buf_will_dirty(db, tx);
+	oldval = ((uint64_t *)db->db_data)[off];
+	((uint64_t *)db->db_data)[off] = val;
+	dmu_buf_rele(db);
+
+	if (tbl->zt_nextblk != 0) {
+		/* 2*idx is even, so off and off+1 fall in the same block */
+		idx *= 2;
+		blk = idx >> (ZAP_BLOCK_SHIFT-3);
+		off = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+
+		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    (tbl->zt_nextblk + blk) << ZAP_BLOCK_SHIFT);
+		dmu_buf_will_dirty(db, tx);
+		((uint64_t *)db->db_data)[off] = val;
+		((uint64_t *)db->db_data)[off+1] = val;
+		dmu_buf_rele(db);
+	}
+
+	return (oldval);
+}
+
+/*
+ * Return the entry at index idx of the table described by tbl.  The
+ * block holding the entry is held and read for the duration of the
+ * lookup only.  The caller must hold zap_rwlock.
+ */
+static uint64_t
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
+{
+	/* a block holds 1<<(ZAP_BLOCK_SHIFT-3) 8-byte entries */
+	uint64_t blkoff = idx >> (ZAP_BLOCK_SHIFT-3);
+	uint64_t slot = idx & ((1<<(ZAP_BLOCK_SHIFT-3))-1);
+	uint64_t result;
+	dmu_buf_t *db;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    (tbl->zt_blk + blkoff) << ZAP_BLOCK_SHIFT);
+	dmu_buf_read(db);
+	result = ((uint64_t *)db->db_data)[slot];
+	dmu_buf_rele(db);
+
+	return (result);
+}
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+/*
+ * Expand n pointer-table entries from src into 2*n entries at dst:
+ * src[i] is written to both dst[2*i] and dst[2*i+1].
+ */
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++, dst += 2) {
+		uint64_t blk = src[i];
+
+		dst[0] = blk;
+		dst[1] = blk;
+	}
+}
+
+/*
+ * Grow the pointer table by one step.  Nothing is done once zt_shift has
+ * reached 32.  A table that still lives in the header block
+ * (zt_numblks == 0) is moved into a newly allocated block of its own;
+ * otherwise zap_table_grow() performs one incremental doubling step.
+ */
+static void
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == 32)
+		return;
+
+	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+		/*
+		 * The ptrtbl can no longer be contained in the
+		 * header block.  Give it its own entire block, which
+		 * will double the size of the ptrtbl (each embedded
+		 * entry is copied into two slots and zt_shift goes up
+		 * by one).
+		 */
+		uint64_t newblk;
+		dmu_buf_t *db_new;
+
+		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+		    ZAP_PTRTBL_MIN_SHIFT);
+		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+
+		newblk = zap_allocate_blocks(zap, 1, tx);
+		db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    newblk << ZAP_BLOCK_SHIFT);
+
+		dmu_buf_will_dirty(db_new, tx);
+		zap_ptrtbl_transfer(zap->zap_f.zap_phys->zap_leafs,
+		    db_new->db_data, 1 << ZAP_PTRTBL_MIN_SHIFT);
+		dmu_buf_rele(db_new);
+
+		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
+		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
+		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+
+		/* the doubled table must exactly fill the one new block */
+		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+		    (ZAP_BLOCK_SHIFT-3));
+	} else {
+		zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		    zap_ptrtbl_transfer, tx);
+	}
+}
+
+/*
+ * Add delta (which may be negative) to the entry count kept in the zap
+ * header.  The header buffer is dirtied in tx first, and the count is
+ * updated under zap_num_entries_mtx.
+ */
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+	dmu_buf_will_dirty(zap->zap_dbuf, tx);
+	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+
+	/* a decrement must not take the count below zero */
+	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
+
+	zap->zap_f.zap_phys->zap_num_entries += delta;
+
+	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+/*
+ * Reserve nblocks consecutive block numbers by atomically advancing
+ * zap_freeblk, and return the first one.  If the caller does not hold
+ * zap_rwlock as writer, the header buffer is dirtied in tx here.
+ */
+uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks, dmu_tx_t *tx)
+{
+	uint64_t after;
+
+	ASSERT(tx != NULL);
+	if (!RW_WRITE_HELD(&zap->zap_rwlock))
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+	/* the result of the add is the new zap_freeblk; step back by nblocks */
+	after = atomic_add_64_nv(&zap->zap_f.zap_phys->zap_freeblk, nblocks);
+	return (after - nblocks);
+}
+
+
+/*
+ * Allocate a new block for a leaf and initialize it with zap_leaf_init().
+ *
+ * This function doesn't increment zap_num_leafs because it's used to
+ * allocate a leaf chain, which doesn't count against zap_num_leafs.
+ * The directory must be held exclusively for this tx.
+ *
+ * The leaf is returned with its l_rwlock held as writer and with a hold
+ * on its (dirtied) dbuf.
+ */
+zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+	void *winner;
+	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	/* hence we already dirtied zap->zap_dbuf */
+
+	rw_init(&l->l_rwlock, 0, 0, 0);
+	rw_enter(&l->l_rwlock, RW_WRITER);
+	l->l_blkid = zap_allocate_blocks(zap, 1, tx);
+	l->l_next = NULL;
+	l->l_dbuf = NULL;
+	l->l_phys = NULL;
+
+	l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    l->l_blkid << ZAP_BLOCK_SHIFT);
+	/* the block was just allocated, so no other user may be attached */
+	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+	ASSERT(winner == NULL);
+	dmu_buf_will_dirty(l->l_dbuf, tx);
+
+	zap_leaf_init(l);
+
+	return (l);
+}
+
+/*
+ * Let go of a leaf: drop its l_rwlock and the hold on its dbuf.  The
+ * block itself is not freed (see the XXX note below), so zap and tx are
+ * currently unused.
+ */
+/* ARGSUSED */
+void
+zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
+{
+	/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
+	rw_exit(&l->l_rwlock);
+	dmu_buf_rele(l->l_dbuf);
+	/* XXX there are still holds on this block, so we can't free it? */
+	/* dmu_free_range(zap->zap_objset, zap->zap_object, */
+	    /* offset,  1<<ZAP_BLOCK_SHIFT, tx); */
+}
+
+/*
+ * Report the number of entries recorded in the zap header through
+ * *count.  Always returns 0.
+ */
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+	uint64_t entries;
+
+	ASSERT(!zap->zap_ismicro);
+
+	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+	entries = zap->zap_f.zap_phys->zap_num_entries;
+	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+
+	*count = entries;
+	return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+/*
+ * Release a leaf obtained from this file's lookup routines: drop the
+ * lock and dbuf hold of every leaf chained behind l (following l_next),
+ * then those of l itself.
+ */
+static void
+zap_put_leaf(zap_leaf_t *l)
+{
+	zap_leaf_t *cur, *following;
+
+	for (cur = l->l_next; cur != NULL; cur = following) {
+		/* fetch the link before letting go of this leaf */
+		following = cur->l_next;
+		rw_exit(&cur->l_rwlock);
+		dmu_buf_rele(cur->l_dbuf);
+	}
+
+	rw_exit(&l->l_rwlock);
+	dmu_buf_rele(l->l_dbuf);
+}
+
+/*
+ * Tear down an in-core leaf: destroy its lock and free the zap_leaf_t.
+ * This is the function handed to dmu_buf_set_user() for leaf buffers; it
+ * is also called directly (with a NULL db) by zap_open_leaf() when
+ * another thread attached its leaf first.  db is unused.
+ */
+_NOTE(ARGSUSED(0))
+static void
+zap_leaf_pageout(dmu_buf_t *db, void *vl)
+{
+	zap_leaf_t *l = vl;
+
+	rw_destroy(&l->l_rwlock);
+	kmem_free(l, sizeof (zap_leaf_t));
+}
+
+/*
+ * Create the in-core zap_leaf_t for block blkid and attach it to db as
+ * the buffer's user data.  If some other leaf is already attached, the
+ * new one is discarded and the existing one returned instead.  The
+ * returned leaf is not locked.
+ */
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+	zap_leaf_t *l, *winner;
+
+	ASSERT(blkid != 0);
+
+	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+	rw_init(&l->l_rwlock, 0, 0, 0);
+	/* hold the leaf as writer while it is being filled in and attached */
+	rw_enter(&l->l_rwlock, RW_WRITER);
+	l->l_blkid = blkid;
+	l->l_next = NULL;
+	l->l_dbuf = db;
+	l->l_phys = NULL;
+
+	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+
+	rw_exit(&l->l_rwlock);
+	if (winner != NULL) {
+		/* someone else set it first */
+		zap_leaf_pageout(NULL, l);
+		l = winner;
+	}
+
+	return (l);
+}
+
+/*
+ * Return the single leaf stored in block blkid, locked as lt, with a
+ * hold on its dbuf.  The in-core leaf is taken from the dbuf's user data
+ * or created by zap_open_leaf() if there is none.  For RW_WRITER the
+ * buffer is also dirtied in tx.  Chained leaves are not followed here.
+ * The caller must hold zap_rwlock.
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+	dmu_buf_t *db;
+	zap_leaf_t *l;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    blkid << ZAP_BLOCK_SHIFT);
+
+	ASSERT3U(db->db_object, ==, zap->zap_object);
+	ASSERT3U(db->db_offset, ==, blkid << ZAP_BLOCK_SHIFT);
+	ASSERT3U(db->db_size, ==, 1 << ZAP_BLOCK_SHIFT);
+	ASSERT(blkid != 0);
+
+	dmu_buf_read(db);
+	l = dmu_buf_get_user(db);
+
+	if (l == NULL)
+		l = zap_open_leaf(blkid, db);
+
+	rw_enter(&l->l_rwlock, lt);
+	/*
+	 * Must lock before dirtying, otherwise l->l_phys could change,
+	 * causing ASSERT below to fail.
+	 */
+	if (lt == RW_WRITER)
+		dmu_buf_will_dirty(db, tx);
+	ASSERT3U(l->l_blkid, ==, blkid);
+	ASSERT3P(l->l_dbuf, ==, db);
+	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
+	ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
+	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+	return (l);
+}
+
+/*
+ * Return the leaf in block blkid together with every leaf chained behind
+ * it: each leaf whose lh_next is nonzero has that block fetched as well
+ * and linked in through l_next.  All leaves are locked as lt; release
+ * the whole chain with zap_put_leaf().
+ */
+static zap_leaf_t *
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+{
+	zap_leaf_t *head, *tail;
+
+	head = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
+
+	for (tail = head; tail->lh_next != 0; tail = tail->l_next) {
+		tail->l_next =
+		    zap_get_leaf_byblk_impl(zap, tail->lh_next, tx, lt);
+	}
+
+	return (head);
+}
+
+/*
+ * Translate pointer-table index idx into the block number of its leaf.
+ * The entry comes from the table embedded in the header while
+ * zt_numblks is 0, and from the external table otherwise.  The caller
+ * must hold zap_rwlock.
+ */
+static uint64_t
+zap_idx_to_blk(zap_t *zap, uint64_t idx)
+{
+	zap_phys_t *zp = zap->zap_f.zap_phys;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	if (zp->zap_ptrtbl.zt_numblks != 0)
+		return (zap_table_load(zap, &zp->zap_ptrtbl, idx));
+
+	ASSERT3U(idx, <, (1ULL << zp->zap_ptrtbl.zt_shift));
+	return (zp->zap_leafs[idx]);
+}
+
+/*
+ * Point pointer-table index idx at leaf block blk.  The entry is written
+ * to the table embedded in the header while zt_blk is 0, and through
+ * zap_table_store() otherwise.  The caller must hold zap_rwlock as
+ * writer.
+ */
+static void
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+	zap_phys_t *zp = zap->zap_f.zap_phys;
+
+	ASSERT(tx != NULL);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (zp->zap_ptrtbl.zt_blk != 0) {
+		(void) zap_table_store(zap, &zp->zap_ptrtbl, idx, blk, tx);
+		return;
+	}
+
+	zp->zap_leafs[idx] = blk;
+}
+
+/*
+ * Find the leaf (with its chain) responsible for hash value h: the top
+ * zt_shift bits of h index the pointer table, and the block found there
+ * is fetched with zap_get_leaf_byblk().  The leaf is returned locked as
+ * lt; release it with zap_put_leaf().
+ */
+static zap_leaf_t *
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
+{
+	uint64_t idx;
+	zap_leaf_t *l;
+
+	ASSERT(zap->zap_dbuf == NULL ||
+	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
+	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
+	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
+
+	/* the leaf's own prefix must match the corresponding bits of h */
+	ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
+
+	return (l);
+}
+
+
+/*
+ * Make more room for entries hashing to `hash', whose current leaf is l.
+ *
+ * The zap lock is needed as writer.  If it cannot be upgraded in place,
+ * the leaf and the zap are released, the zap is re-locked as writer and
+ * the leaf is looked up again; if its prefix length changed in the
+ * meantime, that leaf is returned without further work.
+ *
+ * With the write lock held: if the leaf's prefix is already as long as
+ * the pointer-table shift, a freshly created leaf block is chained onto
+ * it.  Otherwise the leaf is split and the pointer-table entries of the
+ * sibling half are redirected to the new leaf.
+ *
+ * Returns the leaf (still locked) that should now be used for `hash'.
+ * Note that the local `zap' may be replaced by zap_lockdir().
+ */
+static zap_leaf_t *
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
+{
+	zap_leaf_t *nl;
+	int prefix_diff, i, err;
+	uint64_t sibling;
+
+	ASSERT3U(l->lh_prefix_len, <=,
+	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+	if (zap_tryupgradedir(zap, tx) == 0) {
+		/* failed to upgrade */
+		int old_prefix_len = l->lh_prefix_len;
+		objset_t *os = zap->zap_objset;
+		uint64_t object = zap->zap_object;
+
+		/* drop everything, then come back in as writer */
+		zap_put_leaf(l);
+		zap_unlockdir(zap);
+		err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
+		ASSERT3U(err, ==, 0);
+		ASSERT(!zap->zap_ismicro);
+		l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+
+		if (l->lh_prefix_len != old_prefix_len)
+			/* it split while our locks were down */
+			return (l);
+	}
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	if (l->lh_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+		/* There's only one pointer to us. Chain on another leaf blk. */
+		(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
+		dprintf("chaining leaf %x/%d\n", l->lh_prefix,
+		    l->lh_prefix_len);
+		return (l);
+	}
+
+	ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
+
+	/* There's more than one pointer to us. Split this leaf. */
+	nl = zap_leaf_split(zap, l, tx);
+
+	/*
+	 * set sibling pointers
+	 *
+	 * NOTE(review): the arithmetic below appears to rely on
+	 * zap_leaf_split() having already lengthened l's prefix by one
+	 * bit -- confirm against zap_leaf.c.
+	 */
+	prefix_diff =
+	    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+	sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
+	for (i = 0; i < (1ULL<<prefix_diff); i++) {
+		ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
+		zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+		/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
+	}
+
+	zap->zap_f.zap_phys->zap_num_leafs++;
+
+	/* keep whichever of the two leaves covers `hash'; release the other */
+	if (hash & (1ULL << (64 - l->lh_prefix_len))) {
+		/* we want the sibling */
+		zap_put_leaf(l);
+		l = nl;
+	} else {
+		zap_put_leaf(nl);
+	}
+
+	return (l);
+}
+
+/*
+ * Release leaf l and, if it looks like it will soon force the pointer
+ * table to grow, grow the table by one step now.  That is the case when
+ * the leaf's prefix is as long as the pointer-table shift and the leaf
+ * is either chained or has lh_nfree below MIN_FREE.
+ *
+ * Growing needs the zap lock as writer.  If it cannot be upgraded in
+ * place, everything is released, the zap is re-locked as writer, the
+ * leaf is fetched again by block id and the check is repeated.
+ */
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_t *zap,
+    zap_leaf_t *l, dmu_tx_t *tx)
+{
+	int shift, err;
+
+again:
+	shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+
+	if (l->lh_prefix_len == shift &&
+	    (l->l_next != NULL || l->lh_nfree < MIN_FREE)) {
+		/* this leaf will soon make us grow the pointer table */
+
+		if (zap_tryupgradedir(zap, tx) == 0) {
+			/* remember how to find everything again */
+			objset_t *os = zap->zap_objset;
+			uint64_t zapobj = zap->zap_object;
+			uint64_t blkid = l->l_blkid;
+
+			zap_put_leaf(l);
+			zap_unlockdir(zap);
+			err = zap_lockdir(os, zapobj, tx,
+			    RW_WRITER, FALSE, &zap);
+			ASSERT3U(err, ==, 0);
+			l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
+			goto again;
+		}
+
+		zap_put_leaf(l);
+		zap_grow_ptrtbl(zap, tx);
+	} else {
+		zap_put_leaf(l);
+	}
+}
+
+
+/*
+ * Validate the shape of an attribute value: integer_size must be 1, 2,
+ * 4 or 8, and the total size integer_size * num_integers must not exceed
+ * DMU_MAX_ACCESS.  Returns 0 if acceptable, EINVAL otherwise.
+ */
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+	/* Only integer sizes supported by C */
+	switch (integer_size) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * Bound the total by dividing instead of multiplying: the product
+	 * could wrap around 2^64, and comparing a wrapped product against
+	 * one of its operands does not catch every such case.
+	 * integer_size is known to be nonzero here.
+	 */
+	if (num_integers > DMU_MAX_ACCESS / integer_size)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+/*
+ * Look up attribute `name' and read its value into buf as num_integers
+ * integers of integer_size bytes each.  Returns EINVAL for an
+ * unsupported size, otherwise the result of the leaf lookup or of
+ * reading the entry.
+ */
+int
+fzap_lookup(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_entry_handle_t zeh;
+	zap_leaf_t *leaf;
+	uint64_t hash;
+	int rv;
+
+	rv = fzap_checksize(integer_size, num_integers);
+	if (rv != 0)
+		return (rv);
+
+	hash = zap_hash(zap, name);
+	leaf = zap_deref_leaf(zap, hash, NULL, RW_READER);
+
+	rv = zap_leaf_lookup(leaf, name, hash, &zeh);
+	if (rv == 0)
+		rv = zap_entry_read(&zeh, integer_size, num_integers, buf);
+
+	zap_put_leaf(leaf);
+	return (rv);
+}
+
+/*
+ * Add attribute `name' with the given value, passing cd through to
+ * zap_entry_create().  Returns EEXIST if the name is already present.
+ * When the leaf reports EAGAIN, it is expanded with zap_expand_leaf()
+ * and the insert is retried.
+ *
+ * If lp is non-NULL, the leaf is handed back through *lp still locked
+ * and the caller must release it; otherwise it is released here.  The
+ * sizes must already have been validated (see fzap_checksize()).
+ */
+int
+fzap_add_cd(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+	zap_leaf_t *l;
+	uint64_t hash;
+	int err;
+	zap_entry_handle_t zeh;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT(!zap->zap_ismicro);
+	ASSERT(fzap_checksize(integer_size, num_integers) == 0);
+
+	hash = zap_hash(zap, name);
+	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+	err = zap_leaf_lookup(l, name, hash, &zeh);
+	if (err == 0) {
+		err = EEXIST;
+		goto out;
+	}
+	ASSERT(err == ENOENT);
+
+	/* XXX If this leaf is chained, split it if we can. */
+	err = zap_entry_create(l, name, hash, cd,
+	    integer_size, num_integers, val, &zeh);
+
+	if (err == 0) {
+		zap_increment_num_entries(zap, 1, tx);
+	} else if (err == EAGAIN) {
+		/* no room: expand, then redo the lookup on the new leaf */
+		l = zap_expand_leaf(zap, l, hash, tx);
+		goto retry;
+	}
+
+out:
+	if (lp)
+		*lp = l;
+	else
+		zap_put_leaf(l);
+	return (err);
+}
+
+/*
+ * Add attribute `name' with the given value.  Returns EINVAL for an
+ * unsupported size, otherwise the result of fzap_add_cd() (called with
+ * ZAP_MAXCD).  The leaf is released afterwards, growing the pointer
+ * table if that seems due.
+ */
+int
+fzap_add(zap_t *zap, const char *name,
+    uint64_t integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_leaf_t *leaf;
+	int rv = fzap_checksize(integer_size, num_integers);
+
+	if (rv != 0)
+		return (rv);
+
+	rv = fzap_add_cd(zap, name, integer_size, num_integers,
+	    val, ZAP_MAXCD, tx, &leaf);
+	zap_put_leaf_maybe_grow_ptrtbl(zap, leaf, tx);
+
+	return (rv);
+}
+
+/*
+ * Set attribute `name' to the given value, creating the entry if it
+ * does not exist yet (with ZAP_MAXCD) and updating it in place
+ * otherwise.  When the leaf reports EAGAIN, it is expanded and the whole
+ * lookup is retried.  Returns EINVAL for an unsupported size, otherwise
+ * the result of the create/update.
+ */
+int
+fzap_update(zap_t *zap, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_leaf_t *l;
+	uint64_t hash;
+	int err, create;
+	zap_entry_handle_t zeh;
+
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	err = fzap_checksize(integer_size, num_integers);
+	if (err != 0)
+		return (err);
+
+	hash = zap_hash(zap, name);
+	l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+retry:
+	err = zap_leaf_lookup(l, name, hash, &zeh);
+	create = (err == ENOENT);
+	ASSERT(err == 0 || err == ENOENT);
+
+	/* XXX If this leaf is chained, split it if we can. */
+
+	if (create) {
+		err = zap_entry_create(l, name, hash, ZAP_MAXCD,
+		    integer_size, num_integers, val, &zeh);
+		if (err == 0)
+			zap_increment_num_entries(zap, 1, tx);
+	} else {
+		err = zap_entry_update(&zeh, integer_size, num_integers, val);
+	}
+
+	if (err == EAGAIN) {
+		/* no room: expand, then redo the lookup on the new leaf */
+		l = zap_expand_leaf(zap, l, hash, tx);
+		goto retry;
+	}
+
+	zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
+	return (err);
+}
+
+/*
+ * Look up attribute `name' and report the size and number of its
+ * integers.  Either output pointer may be NULL if the caller is not
+ * interested.  Returns the result of the leaf lookup; the outputs are
+ * only written when that is 0.
+ */
+int
+fzap_length(zap_t *zap, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_entry_handle_t zeh;
+	zap_leaf_t *leaf;
+	uint64_t hash = zap_hash(zap, name);
+	int rv;
+
+	leaf = zap_deref_leaf(zap, hash, NULL, RW_READER);
+	rv = zap_leaf_lookup(leaf, name, hash, &zeh);
+
+	if (rv == 0) {
+		if (integer_size != NULL)
+			*integer_size = zeh.zeh_integer_size;
+		if (num_integers != NULL)
+			*num_integers = zeh.zeh_num_integers;
+	}
+
+	zap_put_leaf(leaf);
+	return (rv);
+}
+
+/*
+ * Remove attribute `name'.  If the lookup succeeds, the entry is removed
+ * and the zap's entry count decremented.  Returns the result of the leaf
+ * lookup.
+ */
+int
+fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
+{
+	zap_entry_handle_t zeh;
+	zap_leaf_t *leaf;
+	uint64_t hash = zap_hash(zap, name);
+	int rv;
+
+	leaf = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+	rv = zap_leaf_lookup(leaf, name, hash, &zeh);
+	if (rv == 0) {
+		zap_entry_remove(&zeh);
+		zap_increment_num_entries(zap, -1, tx);
+	}
+	zap_put_leaf(leaf);
+
+	dprintf("fzap_remove: ds=%p obj=%llu name=%s err=%d\n",
+	    zap->zap_objset, zap->zap_object, name, rv);
+	return (rv);
+}
+
+/*
+ * Walk the attributes of zapobj and find the first one whose first
+ * integer equals `value'; its name is copied into `name' with strcpy(),
+ * so the caller's buffer must be large enough for any attribute name.
+ * Returns 0 if a match was found, otherwise the error from
+ * zap_cursor_retrieve() that ended the walk.
+ */
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name)
+{
+	zap_attribute_t *attr;
+	zap_cursor_t cursor;
+	int rv;
+
+	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	zap_cursor_init(&cursor, os, zapobj);
+	while ((rv = zap_cursor_retrieve(&cursor, attr)) == 0) {
+		if (attr->za_first_integer == value) {
+			(void) strcpy(name, attr->za_name);
+			break;
+		}
+		zap_cursor_advance(&cursor);
+	}
+
+	kmem_free(attr, sizeof (zap_attribute_t));
+	return (rv);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+/*
+ * Retrieve the next entry at or after the cursor position
+ * (zc_hash/zc_cd) into za, and move the cursor onto that entry.
+ *
+ * If the current leaf has nothing at or after the cursor, the cursor is
+ * advanced to the first hash value beyond that leaf's range and the
+ * search repeats there.  When the hash space is exhausted, zc_hash is
+ * set to -1ULL and ENOENT is returned.
+ *
+ * On success za's integer length, integer count, first integer (0 if
+ * the entry has no integers) and name are filled in, and the result of
+ * reading the name is returned.
+ */
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+	int err = ENOENT;
+	zap_entry_handle_t zeh;
+	zap_leaf_t *l;
+
+	/* retrieve the next entry at or after zc_hash/zc_cd */
+	/* if no entry, return ENOENT */
+
+again:
+	l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+	if (err == ENOENT) {
+		if (l->lh_prefix_len == 0) {
+			/*
+			 * This leaf covers the whole hash space, so there
+			 * is nowhere left to look.  Handled before the
+			 * mask computation below, which would otherwise
+			 * shift a 64-bit value by 64 (undefined in C).
+			 */
+			zc->zc_cd = 0;
+			zc->zc_hash = -1ULL;
+		} else {
+			uint64_t nocare;
+
+			/* skip to the first hash past this leaf's range */
+			nocare = (1ULL << (64 - l->lh_prefix_len)) - 1;
+			zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+			zc->zc_cd = 0;
+			if (zc->zc_hash == 0) {
+				/* wrapped around: this was the last leaf */
+				zc->zc_hash = -1ULL;
+			} else {
+				zap_put_leaf(l);
+				goto again;
+			}
+		}
+	}
+
+	if (err == 0) {
+		zc->zc_hash = zeh.zeh_hash;
+		zc->zc_cd = zeh.zeh_cd;
+		za->za_integer_length = zeh.zeh_integer_size;
+		za->za_num_integers = zeh.zeh_num_integers;
+		if (zeh.zeh_num_integers == 0) {
+			za->za_first_integer = 0;
+		} else {
+			/* the result is only checked by the ASSERT */
+			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+			ASSERT(err == 0 || err == EOVERFLOW);
+		}
+		err = zap_entry_read_name(&zeh,
+		    sizeof (za->za_name), za->za_name);
+		ASSERT(err == 0);
+	}
+	zap_put_leaf(l);
+	return (err);
+}
+
+
+/*
+ * Feed the leaves referenced by the len pointer-table entries in tbl to
+ * zap_stats_leaf().  Runs of consecutive entries naming the same block
+ * are visited only once.
+ */
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+	uint64_t prev = 0;
+	int slot;
+
+	/*
+	 * NB: if a leaf has more pointers than an entire ptrtbl block
+	 * can hold, then it'll be accounted for more than once, since
+	 * we won't have lastblk.
+	 */
+	for (slot = 0; slot < len; slot++) {
+		zap_leaf_t *leaf;
+
+		if (tbl[slot] != prev) {
+			prev = tbl[slot];
+
+			leaf = zap_get_leaf_byblk(zap, tbl[slot], NULL,
+			    RW_READER);
+			zap_stats_leaf(zap, leaf, zs);
+			zap_put_leaf(leaf);
+		}
+	}
+}
+
+/*
+ * Fill in zs for this zap: the header-level numbers are copied from the
+ * zap header, then every pointer-table block (or the table embedded in
+ * the header, if it has not moved out yet) is passed to
+ * zap_stats_ptrtbl() to account for the leaves.
+ */
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+	zap_phys_t *zp = zap->zap_f.zap_phys;
+	zap_table_phys_t *pt = &zp->zap_ptrtbl;
+	int b;
+
+	zs->zs_ptrtbl_len = 1ULL << pt->zt_shift;
+	zs->zs_blocksize = 1ULL << ZAP_BLOCK_SHIFT;
+	zs->zs_num_leafs = zp->zap_num_leafs;
+	zs->zs_num_entries = zp->zap_num_entries;
+	zs->zs_num_blocks = zp->zap_freeblk;
+
+	if (pt->zt_numblks == 0) {
+		/* the ptrtbl is entirely in the header block. */
+		zap_stats_ptrtbl(zap, zp->zap_leafs,
+		    1 << ZAP_PTRTBL_MIN_SHIFT, zs);
+		return;
+	}
+
+	dmu_prefetch(zap->zap_objset, zap->zap_object,
+	    pt->zt_blk << ZAP_BLOCK_SHIFT,
+	    pt->zt_numblks << ZAP_BLOCK_SHIFT);
+
+	for (b = 0; b < pt->zt_numblks; b++) {
+		dmu_buf_t *db;
+
+		db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+		    (pt->zt_blk + b) << ZAP_BLOCK_SHIFT);
+		dmu_buf_read(db);
+		zap_stats_ptrtbl(zap, db->db_data,
+		    1<<(ZAP_BLOCK_SHIFT-3), zs);
+		dmu_buf_rele(db);
+	}
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_leaf.c b/usr/src/uts/common/fs/zfs/zap_leaf.c
new file mode 100644
index 000000000000..82b786d05a4b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c
@@ -0,0 +1,883 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * The 512-byte leaf is broken into 32 16-byte chunks.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+
+/* Chunk index used as the terminator of every chunk/hash chain. */
+#define	CHAIN_END 0xffff /* end of the chunk chain */
+
+/* somewhat arbitrary, could go up to around 100k ... */
+#define	MAX_ARRAY_BYTES (8<<10)
+
+/* Number of array chunks needed to hold 'bytes' bytes, rounded up. */
+#define	NCHUNKS(bytes) (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES)
+
+/*
+ * Index into l_hash[] for hash value h in leaf l: the
+ * ZAP_LEAF_HASH_SHIFT bits of h that follow the leaf's lh_prefix_len
+ * prefix bits (counting from the most significant bit).
+ *
+ * XXX This will >> by a negative number when
+ * lh_prefix_len > 64-ZAP_LEAF_HASH_SHIFT.
+ */
+#define	LEAF_HASH(l, h) \
+	((ZAP_LEAF_HASH_NUMENTRIES-1) & \
+		((h) >> (64 - ZAP_LEAF_HASH_SHIFT-(l)->lh_prefix_len)))
+
+/* Address of the l_hash[] slot (head of the hash chain) for hash h. */
+#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+
+/* #define	MEMCHECK */
+
+
+/*
+ * Set each of the n bytes starting at 'a' to the value c (converted
+ * to char).  Nothing is written when n is zero.
+ */
+static void
+zap_memset(void *a, int c, size_t n)
+{
+	char *dst = a;
+	size_t left;
+
+	for (left = n; left != 0; left--)
+		*dst++ = c;
+}
+
+/*
+ * Store 'value' at 'addr' as an unsigned integer that is 'len' bytes
+ * wide (1, 2, 4 or 8), truncating it to that width.  Any other width
+ * stores nothing and trips the assertion.
+ */
+static void
+stv(int len, void *addr, uint64_t value)
+{
+	if (len == 1) {
+		*(uint8_t *)addr = value;
+	} else if (len == 2) {
+		*(uint16_t *)addr = value;
+	} else if (len == 4) {
+		*(uint32_t *)addr = value;
+	} else if (len == 8) {
+		*(uint64_t *)addr = value;
+	} else {
+		ASSERT(!"bad int len");
+	}
+}
+
+/*
+ * Load an unsigned integer that is 'len' bytes wide (1, 2, 4 or 8)
+ * from 'addr' and return it zero-extended to 64 bits.  Any other width
+ * trips the assertion and yields a recognizable poison value.
+ */
+static uint64_t
+ldv(int len, const void *addr)
+{
+	uint64_t result;
+
+	if (len == 1) {
+		result = *(uint8_t *)addr;
+	} else if (len == 2) {
+		result = *(uint16_t *)addr;
+	} else if (len == 4) {
+		result = *(uint32_t *)addr;
+	} else if (len == 8) {
+		result = *(uint64_t *)addr;
+	} else {
+		ASSERT(!"bad int len");
+		result = 0xFEEDFACEDEADBEEF;
+	}
+	return (result);
+}
+
+/*
+ * Byteswap a leaf block in place: first the header fields, then every
+ * hash table slot, then each chunk according to its type tag.  The
+ * byte array payload of array chunks is left untouched.
+ */
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf)
+{
+	int i;
+
+	/* header */
+	buf->l_hdr.lhr_block_type = BSWAP_64(buf->l_hdr.lhr_block_type);
+	buf->l_hdr.lhr_next = BSWAP_64(buf->l_hdr.lhr_next);
+	buf->l_hdr.lhr_prefix = BSWAP_64(buf->l_hdr.lhr_prefix);
+	buf->l_hdr.lhr_magic = BSWAP_32(buf->l_hdr.lhr_magic);
+	buf->l_hdr.lhr_nfree = BSWAP_16(buf->l_hdr.lhr_nfree);
+	buf->l_hdr.lhr_nentries = BSWAP_16(buf->l_hdr.lhr_nentries);
+	buf->l_hdr.lhr_prefix_len = BSWAP_16(buf->l_hdr.lhr_prefix_len);
+	buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+	/* hash table */
+	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++)
+		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+	/* chunks, dispatched on the type tag */
+	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+		struct zap_leaf_free *lf = &buf->l_chunk[i].l_free;
+		struct zap_leaf_array *la;
+		struct zap_leaf_entry *le;
+
+		switch (lf->lf_type) {
+		case ZAP_LEAF_ENTRY:
+			le = &buf->l_chunk[i].l_entry;
+
+			le->le_type = BSWAP_8(le->le_type);
+			le->le_int_size = BSWAP_8(le->le_int_size);
+			le->le_next = BSWAP_16(le->le_next);
+			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+			le->le_name_length = BSWAP_16(le->le_name_length);
+			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+			le->le_value_length = BSWAP_16(le->le_value_length);
+			le->le_cd = BSWAP_32(le->le_cd);
+			le->le_hash = BSWAP_64(le->le_hash);
+			break;
+		case ZAP_LEAF_FREE:
+			lf->lf_type = BSWAP_8(lf->lf_type);
+			lf->lf_next = BSWAP_16(lf->lf_next);
+			break;
+		case ZAP_LEAF_ARRAY:
+			la = &buf->l_chunk[i].l_array;
+
+			la->la_type = BSWAP_8(la->la_type);
+			la->la_next = BSWAP_16(la->la_next);
+			/* la_array is a byte array: nothing to swap */
+			break;
+		default:
+			ASSERT(!"bad leaf type");
+		}
+	}
+}
+
+/*
+ * Initialize a leaf block to the empty state: zeroed header, every
+ * hash slot set to the chain terminator, and all chunks marked free
+ * and linked in index order into one list ending in CHAIN_END.
+ */
+void
+zap_leaf_init(zap_leaf_t *l)
+{
+	int chunk;
+
+	ASSERT3U(sizeof (zap_leaf_phys_t), ==, l->l_dbuf->db_size);
+
+	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
+	zap_memset(&l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+	for (chunk = 0; chunk < ZAP_LEAF_NUMCHUNKS; chunk++) {
+		struct zap_leaf_free *zlf =
+		    &l->l_phys->l_chunk[chunk].l_free;
+
+		zlf->lf_type = ZAP_LEAF_FREE;
+		if (chunk == ZAP_LEAF_NUMCHUNKS-1)
+			zlf->lf_next = CHAIN_END;
+		else
+			zlf->lf_next = chunk+1;
+	}
+
+	l->lh_nfree = ZAP_LEAF_NUMCHUNKS;
+	l->lh_magic = ZAP_LEAF_MAGIC;
+	l->lh_block_type = ZBT_LEAF;
+}
+
+/*
+ * Insert leaf 'nl' into the chain immediately after 'l'.  nl takes
+ * over l's prefix and prefix length, and both the pointer link
+ * (l_next) and the block-number link (lh_next) are spliced.
+ * Returns nl.
+ */
+zap_leaf_t *
+zap_leaf_chainmore(zap_leaf_t *l, zap_leaf_t *nl)
+{
+	/* inherit the hash prefix of the chain being joined */
+	nl->lh_prefix_len = l->lh_prefix_len;
+	nl->lh_prefix = l->lh_prefix;
+
+	/* splice the block-number links */
+	nl->lh_next = l->lh_next;
+	l->lh_next = nl->l_blkid;
+
+	/* splice the pointer links */
+	nl->l_next = l->l_next;
+	l->l_next = nl;
+
+	return (nl);
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+/*
+ * Take the first chunk off the leaf's free list and return its index.
+ * The caller must ensure that at least one chunk is free (asserted).
+ * The chunk's contents are left as they were unless MEMCHECK is
+ * defined, in which case they are filled with 0xa1.
+ */
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+	int chunk;
+
+	ASSERT(l->lh_nfree > 0);
+
+	chunk = l->l_phys->l_hdr.lh_freelist;
+	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+	ASSERT3U(l->l_phys->l_chunk[chunk].l_free.lf_type, ==, ZAP_LEAF_FREE);
+
+	/* unlink the head of the free list */
+	l->l_phys->l_hdr.lh_freelist = l->l_phys->l_chunk[chunk].l_free.lf_next;
+
+#ifdef MEMCHECK
+	zap_memset(&l->l_phys->l_chunk[chunk], 0xa1,
+	    sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+	l->lh_nfree--;
+
+	return (chunk);
+}
+
+/*
+ * Return chunk number 'chunk' to the head of the leaf's free list.
+ * The chunk must not already be free (asserted).
+ */
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+	struct zap_leaf_free *zlf = &l->l_phys->l_chunk[chunk].l_free;
+	ASSERT3U(l->lh_nfree, <, ZAP_LEAF_NUMCHUNKS);
+	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+	ASSERT(zlf->lf_type != ZAP_LEAF_FREE);
+
+#ifdef MEMCHECK
+	zap_memset(&l->l_phys->l_chunk[chunk], 0xf4,
+	    sizeof (l->l_phys->l_chunk[chunk]));
+#endif
+
+	/* retag the chunk and push it onto the front of the free list */
+	zlf->lf_type = ZAP_LEAF_FREE;
+	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+	l->l_phys->l_hdr.lh_freelist = chunk;
+
+	l->lh_nfree++;
+}
+
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+/*
+ * Copy num_integers integers, each integer_size bytes wide (1, 2, 4 or
+ * 8), from buf into a newly allocated chain of array chunks in
+ * zeh->zeh_found_leaf.  Each integer is written most significant byte
+ * first.  Returns the index of the first chunk of the chain, or
+ * CHAIN_END when num_integers is not positive.
+ *
+ * The caller must have made sure the leaf has enough free chunks;
+ * zap_leaf_chunk_alloc() only asserts this.  Bytes of the final chunk
+ * beyond the last integer are not written.
+ */
+static uint16_t
+zap_leaf_array_create(const zap_entry_handle_t *zeh, const char *buf,
+	int integer_size, int num_integers)
+{
+	uint16_t chunk_head;
+	uint16_t *chunkp = &chunk_head;
+	int byten = 0;
+	uint64_t value = 0;
+	int shift = (integer_size-1)*8;
+	int len = num_integers;
+	zap_leaf_t *l = zeh->zeh_found_leaf;
+
+	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
+
+	while (len > 0) {
+		uint16_t chunk = zap_leaf_chunk_alloc(l);
+		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+		int i;
+
+		la->la_type = ZAP_LEAF_ARRAY;
+		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+			if (byten == 0)
+				value = ldv(integer_size, buf);
+			/*
+			 * Emit the current top byte of the integer.  The
+			 * shift is applied to the 64-bit value itself: the
+			 * old "0xff << shift" shifted an int by up to 56
+			 * bits, which is undefined.
+			 */
+			la->la_array[i] = (value >> shift) & 0xff;
+			value <<= 8;
+			if (++byten == integer_size) {
+				byten = 0;
+				buf += integer_size;
+				if (--len == 0)
+					break;
+			}
+		}
+
+		/* link this chunk onto the end of the chain */
+		*chunkp = chunk;
+		chunkp = &la->la_next;
+	}
+	*chunkp = CHAIN_END;
+
+	return (chunk_head);
+}
+
+/*
+ * Free every chunk of the array chain whose head index is stored in
+ * *chunkp, and reset *chunkp to CHAIN_END.
+ */
+static void
+zap_leaf_array_free(zap_entry_handle_t *zeh, uint16_t *chunkp)
+{
+	zap_leaf_t *l = zeh->zeh_found_leaf;
+	uint16_t chunk = *chunkp;
+	uint16_t next;
+
+	*chunkp = CHAIN_END;
+
+	for (; chunk != CHAIN_END; chunk = next) {
+		/* read the link before the chunk is recycled */
+		next = l->l_phys->l_chunk[chunk].l_array.la_next;
+		ASSERT3U(l->l_phys->l_chunk[chunk].l_array.la_type, ==,
+		    ZAP_LEAF_ARRAY);
+		zap_leaf_chunk_free(l, chunk);
+	}
+}
+
+/*
+ * Copy integers out of the array chain starting at 'chunk' into buf.
+ *
+ * array_int_len is the width in bytes of each stored integer and
+ * buf_int_len the width of each integer written to buf; the stored
+ * width must not exceed the buffer width (asserted).  array_len and
+ * buf_len are in integers, not bytes; MIN(array_len, buf_len) integers
+ * are copied.  Stored integers are read most significant byte first.
+ */
+static void
+zap_leaf_array_read(const zap_entry_handle_t *zeh, uint16_t chunk,
+    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+    char *buf)
+{
+	int len = MIN(array_len, buf_len);
+	int byten = 0;
+	uint64_t value = 0;
+	zap_leaf_t *l = zeh->zeh_found_leaf;
+
+	ASSERT3U(array_int_len, <=, buf_int_len);
+
+	while (len > 0) {
+		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+		int i;
+
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+			value = (value << 8) | la->la_array[i];
+			byten++;
+			if (byten == array_int_len) {
+				stv(buf_int_len, buf, value);
+				/*
+				 * Start the next integer from zero.  Without
+				 * this, a widening read (array_int_len <
+				 * buf_int_len) would carry the bytes of the
+				 * previous integer into the next one.
+				 */
+				value = 0;
+				byten = 0;
+				len--;
+				if (len == 0)
+					return;
+				buf += buf_int_len;
+			}
+		}
+		chunk = la->la_next;
+	}
+}
+
+/*
+ * Compare the byte array stored in the chain starting at 'chunk' with
+ * the null-terminated string buf.  Only to be used on 8-bit arrays.
+ * array_len is the actual length in bytes (not an encoded
+ * le_value_length).  Returns nonzero if all array_len bytes match.
+ */
+static int
+zap_leaf_array_equal(const zap_entry_handle_t *zeh, int chunk,
+    int array_len, const char *buf)
+{
+	zap_leaf_t *leaf = zeh->zeh_found_leaf;
+	int matched = 0;
+
+	for (;;) {
+		struct zap_leaf_array *la;
+		int ncmp;
+
+		if (matched >= array_len)
+			break;
+
+		la = &leaf->l_phys->l_chunk[chunk].l_array;
+		ncmp = MIN(array_len - matched, ZAP_LEAF_ARRAY_BYTES);
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+
+		/* one chunk's worth at a time; stop at the first mismatch */
+		if (bcmp(la->la_array, buf + matched, ncmp) != 0)
+			break;
+
+		matched += ncmp;
+		chunk = la->la_next;
+	}
+	return (matched == array_len);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+/*
+ * Look up 'name' (whose hash is h) in leaf l and in the leaves chained
+ * after it through l_next.
+ *
+ * On success returns 0 and fills in *zeh: the entry's value length,
+ * integer size, cd and hash, the leaf it was found in, and
+ * zeh_chunkp, which points at the link (hash slot or previous entry's
+ * le_next) that references the entry.  Returns ENOENT if no leaf in
+ * the chain contains the name.  zeh_head_leaf is always set to the
+ * leaf passed in.
+ */
+int
+zap_leaf_lookup(zap_leaf_t *l,
+    const char *name, uint64_t h, zap_entry_handle_t *zeh)
+{
+	uint16_t *chunkp;
+	struct zap_leaf_entry *le;
+
+	zeh->zeh_head_leaf = l;
+
+again:
+	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+	/* walk the hash chain of the bucket that h maps to */
+	for (chunkp = LEAF_HASH_ENTPTR(l, h);
+	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
+		uint16_t chunk = *chunkp;
+		le = &l->l_phys->l_chunk[chunk].l_entry;
+
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+		ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+		if (le->le_hash != h)
+			continue;
+
+		/* zap_leaf_array_equal() reads the leaf from the handle */
+		zeh->zeh_found_leaf = l;
+		if (zap_leaf_array_equal(zeh, le->le_name_chunk,
+		    le->le_name_length, name)) {
+			zeh->zeh_num_integers = le->le_value_length;
+			zeh->zeh_integer_size = le->le_int_size;
+			zeh->zeh_cd = le->le_cd;
+			zeh->zeh_hash = le->le_hash;
+			zeh->zeh_chunkp = chunkp;
+			zeh->zeh_found_leaf = l;
+			return (0);
+		}
+	}
+
+	/* not in this leaf; try the next one in the chain */
+	if (l->l_next) {
+		l = l->l_next;
+		goto again;
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Compare two (hash, cd) pairs, hash being the more significant
+ * component.  Returns TRUE when (h1,cd1) >= (h2,cd2), else FALSE.
+ */
+static int
+hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2)
+{
+	if (h1 != h2)
+		return (h1 > h2 ? TRUE : FALSE);
+
+	/* equal hashes: the collision differentiator decides */
+	return (cd1 >= cd2 ? TRUE : FALSE);
+}
+
+/*
+ * Find the entry with the smallest (hash, cd) pair that is >= (h, cd),
+ * searching leaf l and every leaf chained after it.
+ *
+ * On success returns 0 and fills in *zeh.  zeh_chunkp is pointed at
+ * zeh_fakechunk (a copy of the entry's chunk index held in the handle)
+ * rather than at the real link, so the handle cannot be passed to
+ * zap_entry_remove(), which asserts against that.  Returns ENOENT if
+ * no candidate was found (bestcd is still ZAP_MAXCD).
+ */
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+	uint16_t chunk;
+	uint64_t besth = -1ULL;
+	uint32_t bestcd = ZAP_MAXCD;
+	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES-1;
+	uint16_t lh;
+	struct zap_leaf_entry *le;
+
+	zeh->zeh_head_leaf = l;
+
+again:
+	ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
+
+	/*
+	 * Scan buckets from the one h maps to, up to the bucket of the
+	 * best candidate so far (initially the last bucket).
+	 */
+	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+		for (chunk = l->l_phys->l_hash[lh];
+		    chunk != CHAIN_END; chunk = le->le_next) {
+			le = &l->l_phys->l_chunk[chunk].l_entry;
+
+			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+			ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+			/* target <= entry <= best so far: new best */
+			if (hcd_gteq(le->le_hash, le->le_cd, h, cd) &&
+			    hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) {
+				ASSERT3U(bestlh, >=, lh);
+				bestlh = lh;
+				besth = le->le_hash;
+				bestcd = le->le_cd;
+
+				zeh->zeh_num_integers = le->le_value_length;
+				zeh->zeh_integer_size = le->le_int_size;
+				zeh->zeh_cd = le->le_cd;
+				zeh->zeh_hash = le->le_hash;
+				zeh->zeh_fakechunk = chunk;
+				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+				zeh->zeh_found_leaf = l;
+			}
+		}
+	}
+
+	/* chained leaves may hold a closer entry */
+	if (l->l_next) {
+		l = l->l_next;
+		goto again;
+	}
+
+	return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+}
+
+/*
+ * Copy the value of the entry referenced by zeh into buf, as up to
+ * num_integers integers of integer_size bytes each.
+ *
+ * Returns EINVAL (copying nothing) if the stored integers are wider
+ * than integer_size, EOVERFLOW if the entry holds more integers than
+ * were requested (the first num_integers are still copied), else 0.
+ */
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+    uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_leaf_t *leaf = zeh->zeh_found_leaf;
+	uint16_t entry_chunk = *zeh->zeh_chunkp;
+	struct zap_leaf_entry *le = &leaf->l_phys->l_chunk[entry_chunk].l_entry;
+
+	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+	if (integer_size < le->le_int_size)
+		return (EINVAL);
+
+	zap_leaf_array_read(zeh, le->le_value_chunk, le->le_int_size,
+	    le->le_value_length, integer_size, num_integers, buf);
+
+	return (zeh->zeh_num_integers > num_integers ? EOVERFLOW : 0);
+}
+
+/*
+ * Copy the name of the entry referenced by zeh into buf, writing at
+ * most buflen bytes.  Returns EOVERFLOW if the stored name (whose
+ * length includes its terminating null) is longer than buflen, in
+ * which case only the first buflen bytes were copied; else 0.
+ */
+int
+zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+{
+	zap_leaf_t *leaf = zeh->zeh_found_leaf;
+	uint16_t entry_chunk = *zeh->zeh_chunkp;
+	struct zap_leaf_entry *le = &leaf->l_phys->l_chunk[entry_chunk].l_entry;
+
+	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+	/* names are arrays of 1-byte integers */
+	zap_leaf_array_read(zeh, le->le_name_chunk, 1,
+	    le->le_name_length, 1, buflen, buf);
+
+	return (le->le_name_length > buflen ? EOVERFLOW : 0);
+}
+
+/*
+ * Replace the value of the entry referenced by zeh with num_integers
+ * integers of integer_size bytes each, taken from buf.
+ *
+ * Returns EAGAIN, leaving the entry untouched, if the leaf holding the
+ * entry does not have enough free chunks for the larger value; else 0.
+ */
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+	uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+	int delta_chunks;
+	struct zap_leaf_entry *le;
+	le = &zeh->zeh_found_leaf->l_phys->l_chunk[*zeh->zeh_chunkp].l_entry;
+
+	/* extra chunks the new value needs beyond the old one's */
+	delta_chunks = NCHUNKS(num_integers * integer_size) -
+	    NCHUNKS(le->le_value_length * le->le_int_size);
+
+	if (zeh->zeh_found_leaf->lh_nfree < delta_chunks)
+		return (EAGAIN);
+
+	/*
+	 * We should search other chained leaves (via
+	 * zap_entry_remove,create?) otherwise returning EAGAIN will
+	 * just send us into an infinite loop if we have to chain
+	 * another leaf block, rather than being able to split this
+	 * block.
+	 */
+
+	/* free the old value first so its chunks can be reused */
+	zap_leaf_array_free(zeh, &le->le_value_chunk);
+	le->le_value_chunk =
+	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+	    (MAX_ARRAY_BYTES + 1) : (num_integers);
+	le->le_int_size = integer_size;
+	return (0);
+}
+
+/*
+ * Remove the entry referenced by zeh from its leaf: free its name and
+ * value arrays, unlink it from its hash chain, and free the entry
+ * chunk itself.
+ *
+ * The handle must carry a real chain link in zeh_chunkp; a handle
+ * whose zeh_chunkp points at its own zeh_fakechunk is rejected by the
+ * assertion.
+ */
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+	uint16_t entry_chunk;
+	struct zap_leaf_entry *le;
+	zap_leaf_t *l = zeh->zeh_found_leaf;
+
+	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+	entry_chunk = *zeh->zeh_chunkp;
+	le = &l->l_phys->l_chunk[entry_chunk].l_entry;
+	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+	zap_leaf_array_free(zeh, &le->le_name_chunk);
+	zap_leaf_array_free(zeh, &le->le_value_chunk);
+
+	/* unlink from the hash chain before the chunk is recycled */
+	*zeh->zeh_chunkp = le->le_next;
+	zap_leaf_chunk_free(l, entry_chunk);
+
+	l->lh_nentries--;
+}
+
+/*
+ * Create a new entry 'name' with hash h in the leaf chain headed by l,
+ * holding num_integers integers of integer_size bytes each from buf.
+ *
+ * If cd is ZAP_MAXCD, the lowest collision differentiator not already
+ * used with hash h anywhere in the chain is chosen; otherwise the
+ * given cd is used as is.  The entry is placed in the first leaf of
+ * the chain that has enough free chunks.
+ *
+ * Returns 0 and fills in *zeh on success, or:
+ *	ENAMETOOLONG	name, including its null, exceeds MAXNAMELEN
+ *	E2BIG		the entry could not fit even in an empty leaf
+ *	ENOSPC		every cd below ZAP_MAXCD is in use for this hash
+ *	EAGAIN		no leaf in the chain has enough free chunks
+ */
+int
+zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+    uint8_t integer_size, uint64_t num_integers, const void *buf,
+    zap_entry_handle_t *zeh)
+{
+	uint16_t chunk;
+	uint16_t *chunkp;
+	struct zap_leaf_entry *le;
+	uint64_t namelen, valuelen;
+	int numchunks;
+
+	valuelen = integer_size * num_integers;
+	namelen = strlen(name) + 1;
+	ASSERT(namelen >= 2);
+
+	zeh->zeh_head_leaf = l;
+
+	if (namelen > MAXNAMELEN)
+		return (ENAMETOOLONG);
+	/* find the first leaf in the chain that has sufficient free space */
+	/* one entry chunk plus the name and value array chunks */
+	numchunks = 1 + NCHUNKS(namelen) + NCHUNKS(valuelen);
+	if (numchunks > ZAP_LEAF_NUMCHUNKS)
+		return (E2BIG);
+
+	if (cd == ZAP_MAXCD) {
+		for (cd = 0; cd < ZAP_MAXCD; cd++) {
+			zap_leaf_t *ll;
+			for (ll = l; ll; ll = ll->l_next) {
+				for (chunk = *LEAF_HASH_ENTPTR(ll, h);
+				    chunk != CHAIN_END; chunk = le->le_next) {
+					le = &ll->l_phys->l_chunk
+					    [chunk].l_entry;
+					if (le->le_hash == h &&
+					    le->le_cd == cd) {
+						break;
+					}
+				}
+				/*
+				 * if this cd is in use, no need to
+				 * check more chained leafs
+				 */
+				if (chunk != CHAIN_END)
+					break;
+			}
+			/* If this cd is not in use, we are good. */
+			if (chunk == CHAIN_END)
+				break;
+		}
+		/* If we tried all the cd's, we lose. */
+		if (cd == ZAP_MAXCD)
+			return (ENOSPC);
+	}
+
+	for (; l; l = l->l_next)
+		if (l->lh_nfree >= numchunks)
+			break;
+	if (l == NULL)
+		return (EAGAIN);
+
+	/* zap_leaf_array_create() allocates from zeh_found_leaf */
+	zeh->zeh_found_leaf = l;
+
+	/* make the entry */
+	chunk = zap_leaf_chunk_alloc(l);
+	le = &l->l_phys->l_chunk[chunk].l_entry;
+	le->le_type = ZAP_LEAF_ENTRY;
+	le->le_name_chunk = zap_leaf_array_create(zeh, name, 1, namelen);
+	le->le_name_length = namelen;
+	le->le_value_chunk =
+	    zap_leaf_array_create(zeh, buf, integer_size, num_integers);
+	le->le_value_length = (num_integers*integer_size > MAX_ARRAY_BYTES) ?
+	    (MAX_ARRAY_BYTES + 1) : (num_integers);
+	le->le_int_size = integer_size;
+	le->le_hash = h;
+	le->le_cd = cd;
+
+	/* link it into the hash chain */
+	chunkp = LEAF_HASH_ENTPTR(l, h);
+	le->le_next = *chunkp;
+	*chunkp = chunk;
+
+	l->lh_nentries++;
+
+	zeh->zeh_num_integers = num_integers;
+	zeh->zeh_integer_size = le->le_int_size;
+	zeh->zeh_cd = le->le_cd;
+	zeh->zeh_hash = le->le_hash;
+	zeh->zeh_chunkp = chunkp;
+
+	return (0);
+}
+
+/*
+ * Routines for transferring entries between leafs.
+ */
+
+/*
+ * Push entry chunk 'entry' onto the front of the hash chain selected
+ * by its own le_hash.
+ */
+static void
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+	struct zap_leaf_entry *ent = &l->l_phys->l_chunk[entry].l_entry;
+	uint16_t *headp = LEAF_HASH_ENTPTR(l, ent->le_hash);
+
+	ent->le_next = *headp;
+	*headp = entry;
+}
+
+/*
+ * Rebuild every hash chain of leaf l from scratch by scanning all
+ * chunks and relinking each entry chunk.  Nothing is done when the
+ * leaf holds no entries.
+ */
+static void
+zap_leaf_rehash_entries(zap_leaf_t *l)
+{
+	int chunk;
+
+	if (l->lh_nentries != 0) {
+		/* break existing hash chains */
+		zap_memset(l->l_phys->l_hash, CHAIN_END,
+		    sizeof (l->l_phys->l_hash));
+
+		for (chunk = 0; chunk < ZAP_LEAF_NUMCHUNKS; chunk++) {
+			if (l->l_phys->l_chunk[chunk].l_entry.le_type ==
+			    ZAP_LEAF_ENTRY)
+				zap_leaf_rehash_entry(l, chunk);
+		}
+	}
+}
+
+/*
+ * Move the array chain starting at 'chunk' in leaf l into newly
+ * allocated chunks of leaf nl, freeing each source chunk as it goes.
+ * Returns the index of the head of the new chain in nl (CHAIN_END if
+ * the source chain was empty).  nl must have enough free chunks.
+ */
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+	uint16_t new_chunk;
+	uint16_t *nchunkp = &new_chunk;
+
+	while (chunk != CHAIN_END) {
+		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+		struct zap_leaf_array *nla =
+		    &nl->l_phys->l_chunk[nchunk].l_array;
+		struct zap_leaf_array *la =
+		    &l->l_phys->l_chunk[chunk].l_array;
+		/* saved now: freeing the source chunk rewrites its link */
+		int nextchunk = la->la_next;
+
+		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
+		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS);
+
+		*nla = *la;
+
+		zap_leaf_chunk_free(l, chunk);
+		chunk = nextchunk;
+		/* append the copy to the chain being built in nl */
+		*nchunkp = nchunk;
+		nchunkp = &nla->la_next;
+	}
+	*nchunkp = CHAIN_END;
+	return (new_chunk);
+}
+
+/*
+ * Move entry chunk 'entry' of leaf l, together with its name and value
+ * arrays, into the leaf chain headed by nhl.  The first leaf of that
+ * chain with enough free chunks is used; if none has room, a new leaf
+ * is created and chained after nhl.  The entry is linked into the
+ * destination's hash table; l's hash table is not touched here.
+ */
+static void
+zap_leaf_transfer_entry(zap_t *zap, zap_leaf_t *l, int entry, zap_leaf_t *nhl,
+    dmu_tx_t *tx)
+{
+	zap_leaf_t *nl;
+	struct zap_leaf_entry *le, *nle;
+	uint16_t chunk, nchunks;
+
+	le = &l->l_phys->l_chunk[entry].l_entry;
+	ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
+
+	/* find a leaf in the destination leaf chain with enough free space */
+	nchunks = 1 + NCHUNKS(le->le_name_length) +
+	    NCHUNKS(le->le_value_length * le->le_int_size);
+	for (nl = nhl; nl; nl = nl->l_next)
+		if (nl->lh_nfree >= nchunks)
+			break;
+	if (nl == NULL) {
+		nl = zap_leaf_chainmore(nhl, zap_create_leaf(zap, tx));
+		/*
+		 * NOTE(review): lh_prefix looks like a 64-bit field (the
+		 * header's lhr_prefix is swapped with BSWAP_64), which
+		 * would not match "%x" - confirm against the lh_prefix
+		 * definition.
+		 */
+		dprintf("transfer_entry: chaining leaf %x/%d\n",
+		    nl->lh_prefix, nl->lh_prefix_len);
+	}
+
+	chunk = zap_leaf_chunk_alloc(nl);
+	nle = &nl->l_phys->l_chunk[chunk].l_entry;
+	*nle = *le;
+
+	/* overwrites the le_next copied from the source entry */
+	zap_leaf_rehash_entry(nl, chunk);
+
+	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+	nle->le_value_chunk =
+	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+	zap_leaf_chunk_free(l, entry);
+
+	l->lh_nentries--;
+	nl->lh_nentries++;
+}
+
+/*
+ * Transfer entries whose hash bit 'bit' is 1 to nl1, and 0 to nl0.
+ * Ignore leaf chaining in source (l), but chain in destinations.
+ * We'll re-chain all the entries in l as we go along.
+ *
+ * nl0 may be l itself, in which case entries with a 0 bit stay in l
+ * and are only relinked into its hash table.
+ */
+static void
+zap_leaf_transfer_entries(zap_t *zap, zap_leaf_t *l,
+    zap_leaf_t *nl0, zap_leaf_t *nl1, int bit, dmu_tx_t *tx)
+{
+	int i;
+
+	ASSERT(bit < 64 && bit >= 0);
+	/* break existing hash chains */
+	zap_memset(l->l_phys->l_hash, CHAIN_END, sizeof (l->l_phys->l_hash));
+
+	/* rebuild the destinations' hash chains before adding to them */
+	if (nl0 != l)
+		zap_leaf_rehash_entries(nl0);
+	if (nl1 != nl0)
+		zap_leaf_rehash_entries(nl1);
+
+	for (i = 0; i < ZAP_LEAF_NUMCHUNKS; i++) {
+		struct zap_leaf_entry *le = &l->l_phys->l_chunk[i].l_entry;
+		if (le->le_type != ZAP_LEAF_ENTRY)
+			continue;
+
+		/*
+		 * We could find entries via hashtable instead. That
+		 * would be O(hashents+numents) rather than
+		 * O(numblks+numents), but this accesses memory more
+		 * sequentially, and when we're called, the block is
+		 * usually pretty full.
+		 */
+
+		if (le->le_hash & (1ULL << bit)) {
+			zap_leaf_transfer_entry(zap, l, i, nl1, tx);
+		} else {
+			if (nl0 == l)
+				zap_leaf_rehash_entry(l, i);
+			else
+				zap_leaf_transfer_entry(zap, l, i, nl0, tx);
+		}
+	}
+
+}
+
+/*
+ * Split the leaf chain headed by hl in two by extending the hash
+ * prefix by one bit.  A new leaf nl is created and returned;
+ * nl will contain the entries whose hash prefix ends in 1, while
+ * hl keeps those whose prefix ends in 0.
+ * handles leaf chaining: the leaves that were chained after hl are
+ * emptied into hl and nl (which may grow chains of their own) and
+ * then destroyed.
+ */
+zap_leaf_t *
+zap_leaf_split(zap_t *zap, zap_leaf_t *hl, dmu_tx_t *tx)
+{
+	zap_leaf_t *l = hl;
+	/* the first hash bit past the current prefix, counted from bit 0 */
+	int bit = 64 - 1 - hl->lh_prefix_len;
+	zap_leaf_t *nl = zap_create_leaf(zap, tx);
+
+	/* set new prefix and prefix_len */
+	hl->lh_prefix <<= 1;
+	hl->lh_prefix_len++;
+	nl->lh_prefix = hl->lh_prefix | 1;
+	nl->lh_prefix_len = hl->lh_prefix_len;
+
+	/* transfer odd entries from first leaf in hl chain to nl */
+	zap_leaf_transfer_entries(zap, hl, hl, nl, bit, tx);
+
+	/* take rest of chain off hl */
+	l = hl->l_next;
+	hl->l_next = NULL;
+	hl->lh_next = 0;
+
+	/* transfer even entries from hl chain back to hl, odd entries to nl */
+	while (l) {
+		/* saved first: l is destroyed below */
+		zap_leaf_t *next = l->l_next;
+		zap_leaf_transfer_entries(zap, l, hl, nl, bit, tx);
+		zap_destroy_leaf(zap, l, tx);
+		l = next;
+	}
+
+	return (nl);
+}
+
+/*
+ * Add the leaf chain headed by l to the histograms in *zs.  Every
+ * histogram index is clamped to ZAP_HISTOGRAM_SIZE-1.
+ *
+ * Counted once per chain: zs_leafs_with_2n_pointers (indexed by ptrtbl
+ * shift minus the leaf's prefix length) and zs_leafs_with_n_chained
+ * (number of leaves chained after the head).  Counted per leaf block:
+ * entries in units of five, fullness in tenths, entries per hash
+ * bucket, and chunks used per entry.
+ */
+void
+zap_stats_leaf(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+	int n, nchained = 0;
+
+	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
+	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+	zs->zs_leafs_with_2n_pointers[n]++;
+
+	do {
+		int i;
+
+		n = l->lh_nentries/5;
+		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+		zs->zs_blocks_with_n5_entries[n]++;
+
+		/*
+		 * Tenths of the block not accounted for by free chunks;
+		 * presumably ZAP_LEAF_ARRAY_BYTES+1 is the size of a
+		 * chunk in bytes - confirm against the chunk layout.
+		 */
+		n = ((1<<ZAP_BLOCK_SHIFT) -
+		    l->lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+		    (1<<ZAP_BLOCK_SHIFT);
+		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+		zs->zs_blocks_n_tenths_full[n]++;
+
+		for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES; i++) {
+			int nentries = 0;
+			int chunk = l->l_phys->l_hash[i];
+
+			while (chunk != CHAIN_END) {
+				struct zap_leaf_entry *le =
+				    &l->l_phys->l_chunk[chunk].l_entry;
+
+				/* entry chunk + name chunks + value chunks */
+				n = 1 + NCHUNKS(le->le_name_length) +
+				    NCHUNKS(le->le_value_length *
+					le->le_int_size);
+				n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+				zs->zs_entries_using_n_chunks[n]++;
+
+				chunk = le->le_next;
+				nentries++;
+			}
+
+			n = nentries;
+			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+			zs->zs_buckets_with_n_entries[n]++;
+		}
+
+		nchained++;
+		l = l->l_next;
+	} while (l);
+
+	n = nchained-1;
+	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+	zs->zs_leafs_with_n_chained[n]++;
+}
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
new file mode 100644
index 000000000000..998b67c50f02
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -0,0 +1,823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/avl.h>
+
+
+/* Forward declarations of static helpers used before their definitions. */
+static uint64_t mzap_write_cookie(zap_t *zap, uint64_t cookie,
+    uint64_t entptr);
+static void mzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+
+
+/*
+ * Byteswap a microzap block of 'size' bytes in place: the block type,
+ * the salt, and the value and cd of each of its (size / MZAP_ENT_LEN)
+ * - 1 entries.  Entry names are left untouched.
+ */
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+	int nents = (size / MZAP_ENT_LEN) - 1;
+	int i;
+
+	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+	buf->mz_salt = BSWAP_64(buf->mz_salt);
+
+	for (i = 0; i < nents; i++) {
+		mzap_ent_phys_t *mze = &buf->mz_chunk[i];
+
+		mze->mze_value = BSWAP_64(mze->mze_value);
+		mze->mze_cd = BSWAP_32(mze->mze_cd);
+	}
+}
+
+/*
+ * Byteswap a zap block of 'size' bytes in place.  The first 64-bit
+ * word holds the block type; a block tagged ZBT_MICRO in either byte
+ * order is handled as a microzap, anything else as a fat zap block
+ * (which must be exactly one zap block in size).
+ */
+void
+zap_byteswap(void *buf, size_t size)
+{
+	uint64_t block_type = *(uint64_t *)buf;
+
+	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+		mzap_byteswap(buf, size);
+	} else {
+		ASSERT(size == (1<<ZAP_BLOCK_SHIFT));
+		fzap_byteswap(buf, size);
+	}
+}
+
+/*
+ * AVL comparison function for microzap entries: order by hash, then
+ * by collision differentiator.  Returns -1, 0 or +1.
+ */
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+	const mzap_ent_t *lhs = arg1;
+	const mzap_ent_t *rhs = arg2;
+
+	if (lhs->mze_hash != rhs->mze_hash)
+		return (lhs->mze_hash > rhs->mze_hash ? +1 : -1);
+
+	if (lhs->mze_phys.mze_cd != rhs->mze_phys.mze_cd)
+		return (lhs->mze_phys.mze_cd > rhs->mze_phys.mze_cd ? +1 : -1);
+
+	return (0);
+}
+
+/*
+ * Add an in-core AVL node for the microzap entry *mzep, which lives in
+ * chunk 'chunkid' and whose name hashes to 'hash'.  The physical entry
+ * is copied into the node.  The zap must be a microzap and its rwlock
+ * must be held as writer (both asserted).
+ */
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+{
+	mzap_ent_t *mze;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(mzep->mze_cd < ZAP_MAXCD);
+	ASSERT3U(zap_hash(zap, mzep->mze_name), ==, hash);
+
+	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+	mze->mze_chunkid = chunkid;
+	mze->mze_hash = hash;
+	mze->mze_phys = *mzep;
+	avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+/*
+ * Find the in-core microzap entry named 'name', whose hash is 'hash'.
+ * Returns the entry, or NULL if it is not present or if the name is
+ * too long to fit in a microzap entry.
+ */
+static mzap_ent_t *
+mze_find(zap_t *zap, const char *name, uint64_t hash)
+{
+	mzap_ent_t mze_tofind;
+	mzap_ent_t *mze;
+	avl_index_t idx;
+	avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	ASSERT3U(zap_hash(zap, name), ==, hash);
+
+	if (strlen(name) >= sizeof (mze_tofind.mze_phys.mze_name))
+		return (NULL);
+
+	/* search key: the lowest possible (hash, cd) for this hash */
+	mze_tofind.mze_hash = hash;
+	mze_tofind.mze_phys.mze_cd = 0;
+
+	mze = avl_find(avl, &mze_tofind, &idx);
+	if (mze == NULL)
+		mze = avl_nearest(avl, idx, AVL_AFTER);
+	/* check every entry sharing this hash, in cd order */
+	for (; mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+		if (strcmp(name, mze->mze_phys.mze_name) == 0)
+			return (mze);
+	}
+	return (NULL);
+}
+
+/*
+ * Return the lowest collision differentiator not yet used by any
+ * in-core microzap entry with the given hash.  Entries with equal
+ * hashes are visited in increasing cd order, so the answer is the
+ * first gap in the sequence 0, 1, 2, ...
+ */
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+	avl_tree_t *tree = &zap->zap_m.zap_avl;
+	mzap_ent_t key;
+	mzap_ent_t *mze;
+	avl_index_t where;
+	uint32_t cd = 0;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+	key.mze_hash = hash;
+	key.mze_phys.mze_cd = 0;
+
+	/* no entry with cd 0 means cd 0 itself is free */
+	mze = avl_find(tree, &key, &where);
+	while (mze != NULL && mze->mze_hash == hash &&
+	    mze->mze_phys.mze_cd == cd) {
+		cd++;
+		mze = AVL_NEXT(tree, mze);
+	}
+
+	return (cd);
+}
+
+/*
+ * Remove the in-core entry mze from the microzap's AVL tree and free
+ * it.  The zap's rwlock must be held as writer (asserted).
+ */
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+	avl_tree_t *tree = &zap->zap_m.zap_avl;
+
+	ASSERT(zap->zap_ismicro);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	avl_remove(tree, mze);
+	kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+/*
+ * Free every in-core entry of the microzap's AVL tree, then destroy
+ * the tree itself.
+ */
+static void
+mze_destroy(zap_t *zap)
+{
+	avl_tree_t *tree = &zap->zap_m.zap_avl;
+	void *cookie = NULL;
+	mzap_ent_t *mze;
+
+	for (;;) {
+		mze = avl_destroy_nodes(tree, &cookie);
+		if (mze == NULL)
+			break;
+		kmem_free(mze, sizeof (mzap_ent_t));
+	}
+	avl_destroy(tree);
+}
+
+/*
+ * Build the in-core zap_t for the zap object whose first block is held
+ * in db, and attach it to db as the buffer's user data.
+ *
+ * The object is treated as a microzap if the first 64-bit word of the
+ * block is ZBT_MICRO, otherwise as a fat zap.  For a microzap, an AVL
+ * node is created for every chunk whose name is non-empty.
+ *
+ * If dmu_buf_set_user() hands back a zap_t that is already attached to
+ * db, the one built here is torn down and the existing one is returned
+ * instead.  The returned zap_t is not locked.
+ */
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+	zap_t *winner;
+	zap_t *zap;
+	int i;
+
+	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+	rw_init(&zap->zap_rwlock, 0, 0, 0);
+	rw_enter(&zap->zap_rwlock, RW_WRITER);
+	zap->zap_objset = os;
+	zap->zap_object = obj;
+	zap->zap_dbuf = db;
+
+	if (((uint64_t *)db->db_data)[0] != ZBT_MICRO) {
+		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
+	} else {
+		zap->zap_ismicro = TRUE;
+	}
+
+	/*
+	 * Make sure that zap_ismicro is set before we let others see
+	 * it, because zap_lockdir() checks zap_ismicro without the lock
+	 * held.
+	 */
+	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_pageout);
+
+	if (winner != NULL) {
+		/*
+		 * Our zap_t was never attached, so nobody else can see it.
+		 * Drop and destroy the locks initialized above before
+		 * freeing the memory that contains them, rather than
+		 * freeing a held rwlock.
+		 */
+		rw_exit(&zap->zap_rwlock);
+		rw_destroy(&zap->zap_rwlock);
+		if (!zap->zap_ismicro)
+			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+		kmem_free(zap, sizeof (zap_t));
+		return (winner);
+	}
+
+	if (zap->zap_ismicro) {
+		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
+		/* all of the block except one entry-sized slot */
+		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+		avl_create(&zap->zap_m.zap_avl, mze_compare,
+		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+			mzap_ent_phys_t *mze =
+			    &zap->zap_m.zap_phys->mz_chunk[i];
+			/* an empty name marks an unused chunk */
+			if (mze->mze_name[0]) {
+				zap->zap_m.zap_num_entries++;
+				mze_insert(zap, i,
+				    zap_hash(zap, mze->mze_name), mze);
+			}
+		}
+	} else {
+		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
+	}
+	rw_exit(&zap->zap_rwlock);
+	return (zap);
+}
+
+/*
+ * Hold and lock the zap object 'obj' in objset 'os', returning the
+ * locked zap_t in *zapp.  The caller releases it with
+ * zap_unlockdir().
+ *
+ * lti is the lock type wanted.  If fatreader is set and the zap is a
+ * fat zap, only a reader lock is taken regardless of lti.  When the
+ * lock ends up held as writer, the zap's first block is marked dirty
+ * in tx.
+ *
+ * If the zap is a microzap with no unused chunks and tx is non-NULL,
+ * room is made before returning: the block is grown by
+ * SPA_MINBLOCKSIZE, or, if that would exceed MZAP_MAX_BLKSZ, the zap
+ * is upgraded to a fat zap.
+ *
+ * As written this always returns 0.
+ */
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, int fatreader, zap_t **zapp)
+{
+	zap_t *zap;
+	dmu_buf_t *db;
+	krw_t lt;
+	int err;
+
+	*zapp = NULL;
+
+	db = dmu_buf_hold(os, obj, 0);
+
+#ifdef ZFS_DEBUG
+	{
+		dmu_object_info_t doi;
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+	}
+#endif
+
+	/*
+	 * The zap can deal with EIO here, but its callers don't yet, so
+	 * spare them by doing a mustsucceed read.
+	 */
+	dmu_buf_read(db);
+
+	zap = dmu_buf_get_user(db);
+	if (zap == NULL)
+		zap = mzap_open(os, obj, db);
+
+	/*
+	 * We're checking zap_ismicro without the lock held, in order to
+	 * tell what type of lock we want.  Once we have some sort of
+	 * lock, see if it really is the right type.  In practice this
+	 * can only be different if it was upgraded from micro to fat,
+	 * and micro wanted WRITER but fat only needs READER.
+	 */
+	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+	rw_enter(&zap->zap_rwlock, lt);
+	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+		/* it was upgraded, now we only need reader */
+		ASSERT(lt == RW_WRITER);
+		/*
+		 * The conditional must be parenthesized: "==" binds
+		 * tighter than "?:", so without the parentheses this
+		 * asserted the value of the conditional itself rather
+		 * than comparing it with RW_READER.
+		 */
+		ASSERT(RW_READER ==
+		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+		rw_downgrade(&zap->zap_rwlock);
+		lt = RW_READER;
+	}
+
+	zap->zap_objset = os;
+
+	if (lt == RW_WRITER)
+		dmu_buf_will_dirty(db, tx);
+
+	ASSERT3P(zap->zap_dbuf, ==, db);
+
+	ASSERT(!zap->zap_ismicro ||
+	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+	if (zap->zap_ismicro && tx &&
+	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+		/* the microzap is full: grow the block or go fat */
+		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+		if (newsz > MZAP_MAX_BLKSZ) {
+			dprintf("upgrading obj %llu: num_entries=%u\n",
+			    obj, zap->zap_m.zap_num_entries);
+			mzap_upgrade(zap, tx);
+			*zapp = zap;
+			return (0);
+		}
+		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
+		ASSERT3U(err, ==, 0);
+		/* recompute the chunk count from the new block size */
+		zap->zap_m.zap_num_chunks =
+		    db->db_size / MZAP_ENT_LEN - 1;
+	}
+
+	*zapp = zap;
+	return (0);
+}
+
+/*
+ * Undo zap_lockdir(): drop the zap's rwlock, then release the hold on
+ * its first block.
+ */
+void
+zap_unlockdir(zap_t *zap)
+{
+	rw_exit(&zap->zap_rwlock);
+	dmu_buf_rele(zap->zap_dbuf);
+}
+
+/*
+ * Convert a microzap into a fat zap.  The microzap block is copied
+ * aside, the object's block size is set to the fat zap block size, the
+ * in-core AVL tree is destroyed, and fzap_upgrade() is called; every
+ * non-empty entry of the saved copy is then re-added as a single
+ * 8-byte integer, keeping its original cd.  The zap's rwlock must be
+ * held as writer (asserted).
+ */
+static void
+mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+{
+	mzap_phys_t *mzp;
+	int i, sz, nchunks, err;
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+	/* keep a private copy of the old contents to replay from */
+	sz = zap->zap_dbuf->db_size;
+	mzp = kmem_alloc(sz, KM_SLEEP);
+	bcopy(zap->zap_dbuf->db_data, mzp, sz);
+	nchunks = zap->zap_m.zap_num_chunks;
+
+	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+	    1ULL << ZAP_BLOCK_SHIFT, 0, tx);
+	ASSERT(err == 0);
+
+	dprintf("upgrading obj=%llu with %u chunks\n",
+	    zap->zap_object, nchunks);
+	mze_destroy(zap);
+
+	fzap_upgrade(zap, tx);
+
+	for (i = 0; i < nchunks; i++) {
+		/* NOTE(review): shadows the function-scope 'err' above */
+		int err;
+		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+		if (mze->mze_name[0] == 0)
+			continue;
+		dprintf("adding %s=%llu\n",
+		    mze->mze_name, mze->mze_value);
+		err = fzap_add_cd(zap,
+		    mze->mze_name, 8, 1, &mze->mze_value,
+		    mze->mze_cd, tx, NULL);
+		ASSERT3U(err, ==, 0);
+	}
+	kmem_free(mzp, sz);
+}
+
+uint64_t
+zap_hash(zap_t *zap, const char *name)
+{
+	const uint8_t *cp;
+	uint8_t c;
+	uint64_t crc = zap->zap_salt;
+	/* table-driven CRC of the name's bytes, seeded with this zap's salt */
+	ASSERT(crc != 0);
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+
+	/*
+	 * Only use 28 bits, since we need 4 bits in the cookie for the
+	 * collision differentiator.  We MUST use the high bits, since
+	 * those are the ones that we first pay attention to when
+	 * choosing the bucket.
+	 */
+	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+	return (crc);
+}
+
+
+static void
+mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	mzap_phys_t *zp;
+
+	db = dmu_buf_hold(os, obj, 0);
+	/* debug builds check that the object's type byteswaps as a zap */
+#ifdef ZFS_DEBUG
+	{
+		dmu_object_info_t doi;
+		dmu_object_info_from_db(db, &doi);
+		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+	}
+#endif
+	/* mark the block as a microzap; "| 1ULL" keeps the salt nonzero */
+	dmu_buf_will_dirty(db, tx);
+	zp = db->db_data;
+	zp->mz_block_type = ZBT_MICRO;
+	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
+	ASSERT(zp->mz_salt != 0);
+	dmu_buf_rele(db);
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int claim_err;
+
+	/* the microzap is initialized only if the claim succeeded */
+	claim_err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+	if (claim_err == 0)
+		mzap_create_impl(os, obj, tx);
+	return (claim_err);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+	/* a newly allocated zap object always starts out as a microzap */
+	mzap_create_impl(os, obj, tx);
+	return (obj);
+}
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+	/*
+	 * dmu_object_free will free the object number and free the
+	 * data.  Freeing the data will cause our pageout function to be
+	 * called, which will destroy our data (zap_leaf_t's and zap_t).
+	 */
+	/* the dmu_object_free() result is handed straight back */
+	return (dmu_object_free(os, zapobj, tx));
+}
+
+_NOTE(ARGSUSED(0))
+void
+zap_pageout(dmu_buf_t *db, void *vmzap)
+{
+	zap_t *zap = vmzap;
+
+	/* tear down the in-core zap_t; db itself is not used here */
+	rw_destroy(&zap->zap_rwlock);
+
+	/* mze_destroy() is only called when the zap is a microzap */
+	if (zap->zap_ismicro)
+		mze_destroy(zap);
+	kmem_free(zap, sizeof (zap_t));
+}
+
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+	zap_t *zap;
+	int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+
+	if (err != 0)
+		return (err);
+
+	/* a microzap keeps its entry count in core; a fat zap is asked */
+	if (zap->zap_ismicro)
+		*count = zap->zap_m.zap_num_entries;
+	else
+		err = fzap_count(zap, count);
+	zap_unlockdir(zap);
+	return (err);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	zap_t *zap;
+	mzap_ent_t *mze;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+	if (err != 0)
+		return (err);
+
+	if (zap->zap_ismicro) {
+		/* a microzap value is one 8-byte integer */
+		mze = mze_find(zap, name, zap_hash(zap, name));
+		if (mze == NULL)
+			err = ENOENT;
+		else if (num_integers < 1)
+			err = EOVERFLOW;
+		else if (integer_size != 8)
+			err = EINVAL;
+		else
+			*(uint64_t *)buf = mze->mze_phys.mze_value;
+	} else {
+		err = fzap_lookup(zap, name,
+		    integer_size, num_integers, buf);
+	}
+	zap_unlockdir(zap);
+	return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t *integer_size, uint64_t *num_integers)
+{
+	zap_t *zap;
+	mzap_ent_t *mze;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+	if (err != 0)
+		return (err);
+	if (zap->zap_ismicro) {
+		mze = mze_find(zap, name, zap_hash(zap, name));
+		if (mze == NULL) {
+			err = ENOENT;
+		} else {	/* either out-parameter may be NULL */
+			if (integer_size != NULL)
+				*integer_size = 8;
+			if (num_integers != NULL)
+				*num_integers = 1;
+		}
+	} else {
+		err = fzap_length(zap, name, integer_size, num_integers);
+	}
+	zap_unlockdir(zap);
+	return (err);
+}
+
+static void
+mzap_addent(zap_t *zap, const char *name, uint64_t hash, uint64_t value)
+{
+	mzap_ent_phys_t *slot;
+	int idx, first = zap->zap_m.zap_alloc_next;
+	uint32_t cd;
+
+	dprintf("obj=%llu %s=%llu\n", zap->zap_object, name, value);
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+	for (idx = 0; idx < zap->zap_m.zap_num_chunks; idx++) {
+		slot = &zap->zap_m.zap_phys->mz_chunk[idx];
+		ASSERT(strcmp(name, slot->mze_name) != 0);
+	}
+#endif
+
+	cd = mze_find_unused_cd(zap, hash);
+	/* given the limited size of the microzap, this can't happen */
+	ASSERT(cd != ZAP_MAXCD);
+
+	for (;;) {	/* scan from the hint, then once more from chunk 0 */
+		for (idx = first; idx < zap->zap_m.zap_num_chunks; idx++) {
+			slot = &zap->zap_m.zap_phys->mz_chunk[idx];
+			if (slot->mze_name[0] != 0)
+				continue;
+			slot->mze_value = value;
+			slot->mze_cd = cd;
+			(void) strcpy(slot->mze_name, name);
+			zap->zap_m.zap_num_entries++;
+			zap->zap_m.zap_alloc_next = idx + 1;
+			if (zap->zap_m.zap_alloc_next ==
+			    zap->zap_m.zap_num_chunks)
+				zap->zap_m.zap_alloc_next = 0;
+			mze_insert(zap, idx, hash, slot);
+			return;
+		}
+		if (first == 0)
+			break;
+		first = 0;
+	}
+	ASSERT(!"out of entries!");
+}
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	const uint64_t *intval = val;
+	mzap_ent_t *mze;
+	uint64_t hash;
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+	if (err != 0)
+		return (err);
+	if (!zap->zap_ismicro) {
+		err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+	} else if (integer_size != 8 || num_integers != 1 ||
+	    strlen(name) >= MZAP_NAME_LEN) {
+		/* this entry does not fit a microzap: upgrade, then add */
+		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+		    zapobj, integer_size, num_integers, name);
+		mzap_upgrade(zap, tx);
+		err = fzap_add(zap, name, integer_size, num_integers, val, tx);
+	} else {
+		hash = zap_hash(zap, name);
+		mze = mze_find(zap, name, hash);
+		if (mze == NULL)
+			mzap_addent(zap, name, hash, *intval);
+		else
+			err = EEXIST;
+	}
+	zap_unlockdir(zap);
+	return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	mzap_ent_t *mze;
+	const uint64_t *intval = val;
+	uint64_t hash;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+	if (err)
+		return (err);
+	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+	if (!zap->zap_ismicro) {
+		err = fzap_update(zap, name,
+		    integer_size, num_integers, val, tx);
+	} else if (integer_size != 8 || num_integers != 1 ||
+	    strlen(name) >= MZAP_NAME_LEN) {
+		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+		    zapobj, integer_size, num_integers, name);
+		mzap_upgrade(zap, tx);
+		err = fzap_update(zap, name,
+		    integer_size, num_integers, val, tx);
+	} else {	/* microzap: overwrite in place, or add if absent */
+		hash = zap_hash(zap, name);
+		mze = mze_find(zap, name, hash);
+		if (mze != NULL) {
+			mze->mze_phys.mze_value = *intval;
+			zap->zap_m.zap_phys->mz_chunk
+			    [mze->mze_chunkid].mze_value = *intval;
+		} else {
+			mzap_addent(zap, name, hash, *intval);
+		}
+	}
+	zap_unlockdir(zap);
+	return (err);	/* 0 on the microzap path; else fzap_update()'s result */
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	mzap_ent_t *mze;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, &zap);
+	if (err != 0)
+		return (err);
+	if (zap->zap_ismicro) {
+		mze = mze_find(zap, name, zap_hash(zap, name));
+		if (mze != NULL) {	/* zero the chunk, drop the entry */
+			dprintf("success: %s\n", name);
+			zap->zap_m.zap_num_entries--;
+			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+			    sizeof (mzap_ent_phys_t));
+			mze_remove(zap, mze);
+		} else {
+			dprintf("fail: %s\n", name);
+			err = ENOENT;
+		}
+	} else {
+		err = fzap_remove(zap, name, tx);
+	}
+	zap_unlockdir(zap);
+	return (err);
+}
+
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	zc->zc_objset = os;
+	zc->zc_zapobj = zapobj;
+	zc->zc_hash = 0;	/* (hash 0, cd 0) is the first position */
+	zc->zc_cd = 0;
+}
+
+/*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this.  So use a small hash value so
+ * we can fit 4 bits of cd into the 32-bit cursor.
+ *
+ * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
+ */
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zc->zc_objset = os;
+	zc->zc_zapobj = zapobj;
+	if (serialized == -1ULL) {	/* the end-of-iteration marker */
+		zc->zc_hash = -1ULL;
+		zc->zc_cd = 0;
+		return;
+	}
+
+	zc->zc_hash = serialized << (64 - ZAP_HASHBITS);
+	zc->zc_cd = serialized >> ZAP_HASHBITS;
+	if (zc->zc_cd >= ZAP_MAXCD)	/* corrupt serialized */
+		zc->zc_cd = 0;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+	if (zc->zc_hash == -1ULL)
+		return (-1ULL);		/* an EOF cursor stays -1 */
+	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
+	ASSERT(zc->zc_cd < ZAP_MAXCD);
+	return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |	/* low: hash bits */
+	    ((uint64_t)zc->zc_cd << ZAP_HASHBITS));	/* above them: cd */
+}
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+	zap_t *zap;
+	int err;
+	avl_index_t idx;
+	mzap_ent_t mze_tofind;
+	mzap_ent_t *mze;
+
+	if (zc->zc_hash == -1ULL)
+		return (ENOENT);
+
+	err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+	    RW_READER, TRUE, &zap);
+	if (err)
+		return (err);
+	if (!zap->zap_ismicro) {
+		err = fzap_cursor_retrieve(zap, zc, za);
+	} else {
+		err = ENOENT;
+		/* build a search key from the cursor's (hash, cd) position */
+		mze_tofind.mze_hash = zc->zc_hash;
+		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+		/* take an exact match, else the nearest entry after it */
+		mze = avl_find(&zap->zap_m.zap_avl, &mze_tofind, &idx);
+		ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
+		    &zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+		    sizeof (mze->mze_phys)));
+		if (mze == NULL)
+			mze = avl_nearest(&zap->zap_m.zap_avl, idx, AVL_AFTER);
+		/* found: fill in *za and move the cursor onto it; else EOF */
+		if (mze) {
+			za->za_integer_length = 8;
+			za->za_num_integers = 1;
+			za->za_first_integer = mze->mze_phys.mze_value;
+			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
+			zc->zc_hash = mze->mze_hash;
+			zc->zc_cd = mze->mze_phys.mze_cd;
+			err = 0;
+		} else {
+			zc->zc_hash = -1ULL;
+		}
+	}
+	zap_unlockdir(zap);
+	return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+	if (zc->zc_hash == -1ULL)
+		return;
+	if (++zc->zc_cd < ZAP_MAXCD)
+		return;
+	/* cd values exhausted: step to the next hash value */
+	zc->zc_cd = 0;
+	zc->zc_hash += 1ULL << (64 - ZAP_HASHBITS);
+	if (zc->zc_hash == 0)	/* EOF */
+		zc->zc_hash = -1ULL;
+}
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, &zap);
+	if (err != 0)
+		return (err);
+
+	bzero(zs, sizeof (zap_stats_t));
+	if (!zap->zap_ismicro) {
+		fzap_get_stats(zap, zs);
+	} else {
+		/* a microzap is reported as a single block */
+		zs->zs_blocksize = zap->zap_dbuf->db_size;
+		zs->zs_num_entries = zap->zap_m.zap_num_entries;
+		zs->zs_num_blocks = 1;
+	}
+	zap_unlockdir(zap);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs.conf b/usr/src/uts/common/fs/zfs/zfs.conf
new file mode 100644
index 000000000000..09881909b804
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs.conf
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+name="zfs" parent="pseudo";
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
new file mode 100644
index 000000000000..960de720d1a8
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -0,0 +1,1537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/fs/zfs.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <util/qsort.h>
+#include "fs/fs_subr.h"
+#include <acl/acl_common.h>
+
+#define	ALLOW	ACE_ACCESS_ALLOWED_ACE_TYPE
+#define	DENY	ACE_ACCESS_DENIED_ACE_TYPE
+
+#define	OWNING_GROUP		(ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define	EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define	EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define	OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define	WRITE_MASK (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS| \
+    ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define	OGE_CLEAR	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define	ALL_INHERIT	(ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+    ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
+
+#define	SECURE_NO_INHERIT	(ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define	OGE_PAD	6		/* traditional owner/group/everyone ACES */
+
+static int zfs_ace_can_use(znode_t *zp, ace_t *);
+
+static zfs_acl_t *
+zfs_acl_alloc(int slots)
+{
+	zfs_acl_t *newacl = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+
+	/* with no slots there is no ACE array to own (z_state 0) */
+	newacl->z_slots = slots;
+	newacl->z_state = 0;
+	if (slots == 0)
+		return (newacl);
+
+	newacl->z_acl = kmem_alloc(ZFS_ACL_SIZE(slots), KM_SLEEP);
+	newacl->z_acl_count = 0;
+	newacl->z_state = ACL_DATA_ALLOCED;
+	return (newacl);
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+	/* the ACE array is freed only when z_state says it was allocated */
+	if (aclp->z_state == ACL_DATA_ALLOCED)
+		kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+	kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static uint32_t
+zfs_v4_to_unix(uint32_t access_mask)
+{
+	/* any bit of a group yields the matching "other" permission bit */
+	const uint32_t rbits = ACE_READ_DATA | ACE_LIST_DIRECTORY;
+	const uint32_t wbits = ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_ADD_FILE;
+	const uint32_t xbits = ACE_EXECUTE | ACE_READ_NAMED_ATTRS;
+	uint32_t unixbits = 0;
+
+	unixbits |= (access_mask & rbits) ? S_IROTH : 0;
+	unixbits |= (access_mask & wbits) ? S_IWOTH : 0;
+	unixbits |= (access_mask & xbits) ? S_IXOTH : 0;
+	return (unixbits);
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+	uint32_t v4bits = 0;
+
+	/* only the low three bits (r = 04, w = 02, x = 01) are examined */
+	if (access_mask & 04)
+		v4bits |= ACE_READ_DATA;
+	if (access_mask & 02)
+		v4bits |= ACE_WRITE_DATA;
+	if (access_mask & 01)
+		v4bits |= ACE_EXECUTE;
+	return (v4bits);
+}
+
+static void
+zfs_set_ace(ace_t *zacep, uint32_t access_mask, int access_type,
+    uid_t uid, int entry_type)
+{
+	zacep->a_access_mask = access_mask;
+	zacep->a_type = access_type;	/* callers here pass ALLOW or DENY */
+	zacep->a_who = uid;		/* -1 for owner@/group@/everyone@ */
+	zacep->a_flags = entry_type;	/* the entry type lives in a_flags */
+}
+
+static uint64_t
+zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+{
+	int 	i;
+	int	entry_type;
+	mode_t	mode = (zp->z_phys->zp_mode &
+	    (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+	mode_t	 seen = 0;
+	ace_t 	*acep;
+	/* per rwx bit, the first ACE mentioning it decides; "seen" records it */
+	for (i = 0, acep = aclp->z_acl;
+	    i != aclp->z_acl_count; i++, acep++) {
+		entry_type = (acep->a_flags & 0xf040);
+		if (entry_type == ACE_OWNER) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRUSR))) {
+				seen |= S_IRUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWUSR))) {
+				seen |= S_IWUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWUSR;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXUSR))) {
+				seen |= S_IXUSR;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXUSR;
+				}
+			}
+		} else if (entry_type == OWNING_GROUP) {
+			if ((acep->a_access_mask & ACE_READ_DATA) &&
+			    (!(seen & S_IRGRP))) {
+				seen |= S_IRGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IRGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+			    (!(seen & S_IWGRP))) {
+				seen |= S_IWGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IWGRP;
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE) &&
+			    (!(seen & S_IXGRP))) {
+				seen |= S_IXGRP;
+				if (acep->a_type == ALLOW) {
+					mode |= S_IXGRP;
+				}
+			}
+		} else if (entry_type == ACE_EVERYONE) {
+			if ((acep->a_access_mask & ACE_READ_DATA)) {
+				if (!(seen & S_IRUSR)) {
+					seen |= S_IRUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRUSR;
+					}
+				}
+				if (!(seen & S_IRGRP)) {
+					seen |= S_IRGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IRGRP;
+					}
+				}
+				if (!(seen & S_IROTH)) {
+					seen |= S_IROTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IROTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_WRITE_DATA)) {
+				if (!(seen & S_IWUSR)) {
+					seen |= S_IWUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWUSR;
+					}
+				}
+				if (!(seen & S_IWGRP)) {
+					seen |= S_IWGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWGRP;
+					}
+				}
+				if (!(seen & S_IWOTH)) {
+					seen |= S_IWOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IWOTH;
+					}
+				}
+			}
+			if ((acep->a_access_mask & ACE_EXECUTE)) {
+				if (!(seen & S_IXUSR)) {
+					seen |= S_IXUSR;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXUSR;
+					}
+				}
+				if (!(seen & S_IXGRP)) {
+					seen |= S_IXGRP;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXGRP;
+					}
+				}
+				if (!(seen & S_IXOTH)) {
+					seen |= S_IXOTH;
+					if (acep->a_type == ALLOW) {
+						mode |= S_IXOTH;
+					}
+				}
+			}
+		}
+	}
+	return (mode);	/* kept type/setid/sticky bits plus computed rwx */
+}
+
+static zfs_acl_t *
+zfs_acl_node_read_internal(znode_t *zp)
+{
+	zfs_znode_acl_t	*embedded = &zp->z_phys->zp_acl;
+	zfs_acl_t	*aclp = zfs_acl_alloc(0);
+
+	/* point at the znode's embedded ACEs; nothing is copied */
+	aclp->z_acl = embedded->z_ace_data;
+	aclp->z_acl_count = embedded->z_acl_count;
+	return (aclp);
+}
+
+/*
+ * Read a znode's ACL, from its external acl object if it has one.
+ */
+zfs_acl_t *
+zfs_acl_node_read(znode_t *zp)
+{
+	zfs_znode_acl_t	*zacl = &zp->z_phys->zp_acl;
+	uint64_t	extacl = zacl->z_acl_extern_obj;
+	zfs_acl_t	*aclp;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	/* no external object: the ACEs live inside the znode itself */
+	if (extacl == 0)
+		return (zfs_acl_node_read_internal(zp));
+
+	/* otherwise read the ACEs out of the external ACL object */
+	aclp = zfs_acl_alloc(zacl->z_acl_count);
+	dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+	    ZFS_ACL_SIZE(zacl->z_acl_count), aclp->z_acl);
+	aclp->z_acl_count = zacl->z_acl_count;
+	return (aclp);
+}
+
+static boolean_t
+zfs_acl_valid(znode_t *zp, ace_t *uace, int aclcnt, int *inherit)
+{
+	ace_t 	*acep;
+	int i;
+
+	*inherit = 0;
+
+	if (aclcnt > MAX_ACL_ENTRIES || aclcnt <= 0) {
+		return (B_FALSE);
+	}
+
+	for (i = 0, acep = uace; i != aclcnt; i++, acep++) {
+
+		/*
+		 * first check type of entry; owner/group/everyone get who -1
+		 */
+
+		switch (acep->a_flags & 0xf040) {
+		case ACE_OWNER:
+			acep->a_who = -1;
+			break;
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+		case ACE_IDENTIFIER_GROUP:
+			if (acep->a_flags & ACE_GROUP) {
+				acep->a_who = -1;
+			}
+			break;
+		case ACE_EVERYONE:
+			acep->a_who = -1;
+			break;
+		}
+
+		/*
+		 * next check the access type; only ALLOW and DENY are valid
+		 */
+
+		if (acep->a_type != ALLOW && acep->a_type != DENY)
+			return (B_FALSE);
+
+		/*
+		 * Only directories should have inheritance flags.
+		 */
+		if (ZTOV(zp)->v_type != VDIR && (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE))) {
+			return (B_FALSE);
+		}
+		/* report back whether any ACE is inheritable */
+		if (acep->a_flags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))
+			*inherit = 1;
+		/* inherit-only / no-propagate also require an inherit flag */
+		if (acep->a_flags &
+		    (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+			if ((acep->a_flags & (ACE_FILE_INHERIT_ACE|
+			    ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+				return (B_FALSE);
+			}
+		}
+	}
+
+	return (B_TRUE);
+}
+/*
+ * common code for setting acl's.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's
+ * already checked the acl and knows whether to inherit.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, dmu_tx_t *tx, int *ihp)
+{
+	znode_phys_t	*zphys = zp->z_phys;
+	zfs_znode_acl_t	*zacl = &zphys->zp_acl;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uint32_t	acl_phys_size = ZFS_ACL_SIZE(aclp->z_acl_count);
+	uint64_t	aoid = zacl->z_acl_extern_obj;
+	int		inherit = 0;
+	int		error;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	if (ihp != NULL) {
+		inherit = *ihp;		/* already determined by caller */
+	} else if (!zfs_acl_valid(zp, aclp->z_acl, aclp->z_acl_count,
+	    &inherit)) {
+		return (EINVAL);
+	}
+
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+	if (aclp->z_acl_count > ACE_SLOT_CNT) {
+		/*
+		 * Too many ACEs to embed in the znode: write them to the
+		 * external ACL object, allocating or resizing it first.
+		 */
+		if (aoid == 0) {
+			aoid = dmu_object_alloc(zfsvfs->z_os,
+			    DMU_OT_ACL, acl_phys_size, DMU_OT_NONE, 0, tx);
+		} else {
+			(void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
+			    acl_phys_size, 0, tx);
+		}
+		zacl->z_acl_extern_obj = aoid;
+		zacl->z_acl_count = aclp->z_acl_count;
+		dmu_write(zfsvfs->z_os, aoid, 0,
+		    acl_phys_size, aclp->z_acl, tx);
+	} else {
+		/*
+		 * The ACL fits in the znode; free any external object first.
+		 */
+		if (zacl->z_acl_extern_obj != 0) {
+			error = dmu_object_free(zfsvfs->z_os,
+			    zacl->z_acl_extern_obj, tx);
+			if (error != 0)
+				return (error);
+			zacl->z_acl_extern_obj = 0;
+		}
+		bcopy(aclp->z_acl, zacl->z_ace_data,
+		    aclp->z_acl_count * sizeof (ace_t));
+		zacl->z_acl_count = aclp->z_acl_count;
+	}
+	if (inherit)
+		zphys->zp_flags |= ZFS_INHERIT_ACE;
+	else
+		zphys->zp_flags &= ~ZFS_INHERIT_ACE;
+
+	zphys->zp_mode = zfs_mode_compute(zp, aclp);
+	zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+	return (0);
+}
+
+/*
+ * Create space for slots_needed ACEs to be appended
+ * to aclp.
+ */
+static void
+zfs_acl_append(zfs_acl_t *aclp, int slots_needed)
+{
+	int	owned = (aclp->z_state == ACL_DATA_ALLOCED);
+	int	slots_left = aclp->z_slots - aclp->z_acl_count;
+	int	slot_cnt;
+	ace_t	*grown;
+
+	ASSERT(!owned || aclp->z_slots >= aclp->z_acl_count);
+
+	/* an allocated array that already has the room needs no work */
+	if (owned && slots_left >= slots_needed)
+		return;
+
+	slot_cnt = aclp->z_slots + 1 + (slots_needed - slots_left);
+	grown = kmem_alloc(ZFS_ACL_SIZE(slot_cnt), KM_SLEEP);
+	bcopy(aclp->z_acl, grown, ZFS_ACL_SIZE(aclp->z_acl_count));
+	if (owned)
+		kmem_free(aclp->z_acl, ZFS_ACL_SIZE(aclp->z_slots));
+	aclp->z_acl = grown;
+	aclp->z_slots = slot_cnt;
+	aclp->z_state = ACL_DATA_ALLOCED;
+}
+
+/*
+ * Remove "slot" ACE from aclp
+ */
+static void
+zfs_ace_remove(zfs_acl_t *aclp, int slot)
+{
+	int	shift = (aclp->z_acl_count > 1);
+
+	aclp->z_acl_count--;
+	if (shift)	/* close the gap left by the removed ACE */
+		(void) memmove(&aclp->z_acl[slot], &aclp->z_acl[slot + 1],
+		    sizeof (ace_t) * (aclp->z_acl_count - slot));
+}
+
+/*
+ * Update access mask for prepended ACE
+ *
+ * This applies the "groupmask" value for aclmode property.
+ */
+static void
+zfs_acl_prepend_fixup(ace_t *acep, ace_t *origacep, mode_t mode, uid_t owner)
+{
+	/* rmask/wmask/xmask: the mode bits consulted for this ACE */
+	int	rmask, wmask, xmask;
+	int	user_ace;
+
+	user_ace = (!(acep->a_flags &
+	    (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
+	/* a user ACE naming the file owner uses owner bits, else group */
+	if (user_ace && (acep->a_who == owner)) {
+		rmask = S_IRUSR;
+		wmask = S_IWUSR;
+		xmask = S_IXUSR;
+	} else {
+		rmask = S_IRGRP;
+		wmask = S_IWGRP;
+		xmask = S_IXGRP;
+	}
+	/* per bit in origacep: clear it if the mode has it, else set it */
+	if (origacep->a_access_mask & ACE_READ_DATA) {
+		if (mode & rmask)
+			acep->a_access_mask &= ~ACE_READ_DATA;
+		else
+			acep->a_access_mask |= ACE_READ_DATA;
+	}
+
+	if (origacep->a_access_mask & ACE_WRITE_DATA) {
+		if (mode & wmask)
+			acep->a_access_mask &= ~ACE_WRITE_DATA;
+		else
+			acep->a_access_mask |= ACE_WRITE_DATA;
+	}
+
+	if (origacep->a_access_mask & ACE_APPEND_DATA) {
+		if (mode & wmask)
+			acep->a_access_mask &= ~ACE_APPEND_DATA;
+		else
+			acep->a_access_mask |= ACE_APPEND_DATA;
+	}
+
+	if (origacep->a_access_mask & ACE_EXECUTE) {
+		if (mode & xmask)
+			acep->a_access_mask &= ~ACE_EXECUTE;
+		else
+			acep->a_access_mask |= ACE_EXECUTE;
+	}
+}
+
+/*
+ * Apply mode to canonical six ACEs.
+ */
+static void
+zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
+{
+	int	last = aclp->z_acl_count - 1;
+	ace_t	*tail;
+
+	ASSERT(last >= 5);
+
+	/*
+	 * Fixup final ACEs to match the mode.  They are handled as three
+	 * pairs, each given the mode with its own rwx triplet in the low
+	 * bits.
+	 */
+	tail = &aclp->z_acl[last - 5];
+	adjust_ace_pair(&tail[4], mode);	/* everyone@ */
+	adjust_ace_pair(&tail[2], (mode & 0070) >> 3);	/* group@ */
+	adjust_ace_pair(&tail[0], (mode & 0700) >> 6);	/* owner@ */
+}
+
+
+static int
+zfs_acl_ace_match(ace_t *acep, int allow_deny, int type, int mask)
+{
+	return ((acep->a_flags & 0xf040) == type &&
+	    acep->a_type == allow_deny && acep->a_access_mask == mask);
+}
+
+/*
+ * Can prepended ACE be reused?
+ */
+static int
+zfs_reuse_deny(ace_t *acep, int i)
+{
+	ace_t	*prev, *cur;
+	int	okay_masks;
+
+	if (i < 1)
+		return (B_FALSE);
+
+	cur = &acep[i];
+	prev = &acep[i - 1];
+	okay_masks = (cur->a_access_mask & OKAY_MASK_BITS);
+
+	/* reusable: a DENY with matching flags and no stray mask bits */
+	if (prev->a_type != DENY ||
+	    prev->a_flags != (cur->a_flags & ACE_IDENTIFIER_GROUP) ||
+	    (prev->a_access_mask & ~okay_masks))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Create space to prepend an ACE
+ */
+static void
+zfs_acl_prepend(zfs_acl_t *aclp, int i)
+{
+	ace_t	*oldaclp = NULL;
+	ace_t	*to, *from;
+	int	slots_left = aclp->z_slots - aclp->z_acl_count;
+	int	oldslots;
+	int	need_free = 0;
+
+	if (aclp->z_state == ACL_DATA_ALLOCED)
+		ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+	if (slots_left == 0 || aclp->z_state != ACL_DATA_ALLOCED) {
+		/* no room, or the array was not allocated: make a copy */
+		to = kmem_alloc(ZFS_ACL_SIZE(aclp->z_acl_count +
+		    OGE_PAD), KM_SLEEP);
+		if (aclp->z_state == ACL_DATA_ALLOCED)
+			need_free++;
+		from = aclp->z_acl;
+		oldaclp = aclp->z_acl;
+		(void) memmove(to, from,
+		    sizeof (ace_t) * aclp->z_acl_count);
+		aclp->z_state = ACL_DATA_ALLOCED;
+	} else {
+		from = aclp->z_acl;
+		to = aclp->z_acl;
+	}
+
+	/* move ACEs i and later up one slot; the caller then fills slot i */
+	(void) memmove(&to[i + 1], &from[i],
+	    sizeof (ace_t) * (aclp->z_acl_count - i));
+	/* if a copy was made, adopt it and free the old array if allocated */
+	if (oldaclp) {
+		aclp->z_acl = to;
+		oldslots = aclp->z_slots;
+		aclp->z_slots = aclp->z_acl_count + OGE_PAD;
+		if (need_free)
+			kmem_free(oldaclp, ZFS_ACL_SIZE(oldslots));
+	}
+
+}
+
+/*
+ * Prepend deny ACE
+ */
+static void
+zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, int i, mode_t mode)
+{
+	ace_t	*deny, *orig;
+
+	zfs_acl_prepend(aclp, i);
+
+	/* zfs_acl_prepend() may have replaced the array; look it up now */
+	deny = &aclp->z_acl[i];
+	orig = &aclp->z_acl[i + 1];
+	zfs_set_ace(deny, 0, DENY, orig->a_who, (orig->a_flags & 0xf040));
+	zfs_acl_prepend_fixup(deny, orig, mode, zp->z_phys->zp_uid);
+	aclp->z_acl_count++;
+}
+
+/*
+ * Split an inherited ACE into inherit_only ACE
+ * and original ACE with inheritance flags stripped off.
+ */
+static void
+zfs_acl_split_ace(zfs_acl_t *aclp, int i)
+{
+	ace_t *acep;
+
+	zfs_acl_prepend(aclp, i);
+	acep = aclp->z_acl;		/* read only after the prepend */
+	acep[i] = acep[i + 1];
+	acep[i + 1].a_flags &= ~ALL_INHERIT;
+	acep[i].a_flags |= ACE_INHERIT_ONLY_ACE;
+	aclp->z_acl_count++;
+}
+
+/*
+ * Are the ACEs starting at index i the canonical six ACEs?
+ */
+static int
+zfs_have_canonical_six(zfs_acl_t *aclp, int i)
+{
+	ace_t *six = &aclp->z_acl[i];
+
+	/* three deny/allow pairs, in order: owner@, group@, everyone@ */
+	if (!zfs_acl_ace_match(&six[0], DENY, ACE_OWNER, 0) ||
+	    !zfs_acl_ace_match(&six[1], ALLOW, ACE_OWNER, OWNER_ALLOW_MASK))
+		return (0);
+	if (!zfs_acl_ace_match(&six[2], DENY, OWNING_GROUP, 0) ||
+	    !zfs_acl_ace_match(&six[3], ALLOW, OWNING_GROUP, 0))
+		return (0);
+	if (!zfs_acl_ace_match(&six[4], DENY, ACE_EVERYONE,
+	    EVERYONE_DENY_MASK))
+		return (0);
+
+	return (zfs_acl_ace_match(&six[5], ALLOW, ACE_EVERYONE,
+	    EVERYONE_ALLOW_MASK));
+}
+
+/*
+ * Apply step 1g, to group entries
+ *
+ * Need to deal with corner case where group may have
+ * greater permissions than owner.  If so then limit
+ * group permissions, based on what extra permissions
+ * group has.
+ */
+static void
+zfs_fixup_group_entries(ace_t *acep, mode_t mode)
+{
+	mode_t extramode = (mode >> 3) & 07;
+	mode_t ownermode = (mode >> 6);
+
+	/* only ACEs carrying ACE_IDENTIFIER_GROUP are adjusted */
+	if (!(acep[0].a_flags & ACE_IDENTIFIER_GROUP))
+		return;
+
+	/*
+	 * Keep just the rwx bits the group has but the owner lacks.
+	 */
+	extramode &= ~ownermode;
+	/* strip each such permission from both ACEs of the pair */
+	if (extramode & 04) {
+		acep[0].a_access_mask &= ~ACE_READ_DATA;
+		acep[1].a_access_mask &= ~ACE_READ_DATA;
+	}
+	if (extramode & 02) {
+		acep[0].a_access_mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+		acep[1].a_access_mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+	}
+	if (extramode & 01) {
+		acep[0].a_access_mask &= ~ACE_EXECUTE;
+		acep[1].a_access_mask &= ~ACE_EXECUTE;
+	}
+}
+
+/*
+ * Apply the chmod algorithm as described
+ * in PSARC/2002/240
+ */
+static int
+zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
+    dmu_tx_t *tx)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	ace_t 		*acep;
+	int 		i;
+	int		error;
+	int 		entry_type;
+	int 		reuse_deny;
+	int 		need_canonical_six = 1;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+
+	i = 0;
+	while (i < aclp->z_acl_count) {
+		acep = aclp->z_acl;
+		entry_type = (acep[i].a_flags & 0xf040);
+
+		if ((acep[i].a_type != ALLOW && acep[i].a_type != DENY) ||
+		    (acep[i].a_flags & ACE_INHERIT_ONLY_ACE)) {
+			i++;
+			continue;
+		}
+
+		/* DISCARD mode removes every ACE that gets this far */
+		if (zfsvfs->z_acl_mode == DISCARD) {
+			zfs_ace_remove(aclp, i);
+			continue;
+		}
+
+		/*
+		 * Need to split ace into two?
+		 */
+		if ((acep[i].a_flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE)) &&
+		    (!(acep[i].a_flags & ACE_INHERIT_ONLY_ACE))) {
+			zfs_acl_split_ace(aclp, i);
+			i++;
+			continue;
+		}
+		/* owner@/group@/everyone@ ACEs just lose OGE_CLEAR bits */
+		if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+		    (entry_type == OWNING_GROUP)) {
+			acep[i].a_access_mask &= ~OGE_CLEAR;
+			i++;
+			continue;
+
+		} else {
+			if (acep[i].a_type == ALLOW) {
+
+				/*
+				 * Check preceding ACE if any, to see
+				 * if we need to prepend a DENY ACE.
+				 * This is only applicable when the acl_mode
+				 * property == groupmask.
+				 */
+				if (zfsvfs->z_acl_mode == GROUPMASK) {
+
+					reuse_deny = zfs_reuse_deny(acep, i);
+
+					if (reuse_deny == B_FALSE) {
+						zfs_acl_prepend_deny(zp, aclp,
+						    i, mode);
+						i++;
+						acep = aclp->z_acl;
+					} else {
+						zfs_acl_prepend_fixup(
+						    &acep[i - 1],
+						    &acep[i], mode,
+						    zp->z_phys->zp_uid);
+					}
+					zfs_fixup_group_entries(&acep[i - 1],
+					    mode);
+				}
+			}
+			i++;
+		}
+	}
+
+	/*
+	 * Check out last six aces, if we have six.
+	 */
+
+	if (aclp->z_acl_count >= 6) {
+		i = aclp->z_acl_count - 6;
+
+		if (zfs_have_canonical_six(aclp, i)) {
+			need_canonical_six = 0;
+		}
+	}
+
+	if (need_canonical_six) {
+		/* append a fresh owner@/group@/everyone@ deny/allow set */
+		zfs_acl_append(aclp, 6);
+		i = aclp->z_acl_count;
+		acep = aclp->z_acl;
+		zfs_set_ace(&acep[i++], 0, DENY, -1, ACE_OWNER);
+		zfs_set_ace(&acep[i++], OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
+		zfs_set_ace(&acep[i++], 0, DENY, -1, OWNING_GROUP);
+		zfs_set_ace(&acep[i++], 0, ALLOW, -1, OWNING_GROUP);
+		zfs_set_ace(&acep[i++], EVERYONE_DENY_MASK,
+		    DENY, -1, ACE_EVERYONE);
+		zfs_set_ace(&acep[i++], EVERYONE_ALLOW_MASK,
+		    ALLOW, -1, ACE_EVERYONE);
+		aclp->z_acl_count += 6;
+	}
+	/* make the trailing six ACEs reflect the new mode */
+	zfs_acl_fixup_canonical_six(aclp, mode);
+
+	zp->z_phys->zp_mode = mode;
+	error = zfs_aclset_common(zp, aclp, tx, NULL);
+	return (error);
+}
+
+
+/*
+ * chmod entry point used on the setattr path: read the znode's current
+ * ACL, run it through zfs_acl_chmod() for the new 'mode' and release the
+ * in-core copy.  The caller must already hold zp->z_lock; z_acl_lock is
+ * taken and dropped here.  Returns the zfs_acl_chmod() result.
+ */
+int
+zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
+{
+	zfs_acl_t	*cur_acl;
+	int		rc;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+
+	mutex_enter(&zp->z_acl_lock);
+	cur_acl = zfs_acl_node_read(zp);
+	rc = zfs_acl_chmod(zp, mode, cur_acl, tx);
+	mutex_exit(&zp->z_acl_lock);
+
+	/* The in-core copy is no longer needed once the lock is dropped. */
+	zfs_acl_free(cur_acl);
+
+	return (rc);
+}
+
+/*
+ * strip off write_owner and write_acl
+ *
+ * Only acts on ALLOW entries, and only when the file system's
+ * acl_inherit setting is SECURE; every other ACE is left untouched.
+ */
+static void
+zfs_securemode_update(zfsvfs_t *zfsvfs, ace_t *acep)
+{
+	if (zfsvfs->z_acl_inherit != SECURE)
+		return;
+
+	if (acep->a_type != ALLOW)
+		return;
+
+	acep->a_access_mask &= ~SECURE_NO_INHERIT;
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ *
+ * Returns a newly allocated ACL for 'zp' holding a copy of every ACE in
+ * the parent ACL 'paclp' that zfs_ace_can_use() accepts.  Nothing is
+ * inherited when the acl_inherit property is DISCARD, and ALLOW ACEs are
+ * skipped when it is NOALLOW.  The allocation has room for OGE_PAD extra
+ * entries beyond the inherited ones.  The caller releases the result with
+ * zfs_acl_free().
+ */
+static zfs_acl_t *
+zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	ace_t 		*pacep;
+	ace_t		*acep;
+	int 		ace_cnt = 0;
+	int		pace_cnt;
+	int 		i, j;
+	zfs_acl_t	*aclp = NULL;
+
+	i = j = 0;
+	pace_cnt = paclp->z_acl_count;
+	pacep = paclp->z_acl;
+
+	/*
+	 * First pass: size the new ACL.  An ACE that may propagate further
+	 * is counted twice, so this is an upper bound on what the second
+	 * pass actually copies (one entry per usable ACE).
+	 */
+	if (zfsvfs->z_acl_inherit != DISCARD) {
+		for (i = 0; i != pace_cnt; i++) {
+
+			if (zfsvfs->z_acl_inherit == NOALLOW &&
+			    pacep[i].a_type == ALLOW)
+				continue;
+
+			if (zfs_ace_can_use(zp, &pacep[i])) {
+				ace_cnt++;
+				if (!(pacep[i].a_flags &
+				    ACE_NO_PROPAGATE_INHERIT_ACE))
+					ace_cnt++;
+			}
+		}
+	}
+
+	aclp = zfs_acl_alloc(ace_cnt + OGE_PAD);
+
+	/*
+	 * Second pass: copy each usable ACE and adjust its flags.
+	 */
+	if (ace_cnt && zfsvfs->z_acl_inherit != DISCARD) {
+		acep = aclp->z_acl;
+		pacep = paclp->z_acl;
+		for (i = 0; i != pace_cnt; i++) {
+
+			if (zfsvfs->z_acl_inherit == NOALLOW &&
+			    pacep[i].a_type == ALLOW)
+				continue;
+
+			if (zfs_ace_can_use(zp, &pacep[i])) {
+				/*
+				 * Now create entry for inherited ace
+				 */
+				acep[j] = pacep[i];
+
+				/*
+				 * The ACE must not propagate any further:
+				 * drop all inherit flags.  Secure mode
+				 * applies to this copy like any other
+				 * inherited ACE.
+				 */
+				if (pacep[i].a_flags &
+				    ACE_NO_PROPAGATE_INHERIT_ACE) {
+					acep[j].a_flags &= ~ALL_INHERIT;
+					zfs_securemode_update(zfsvfs, &acep[j]);
+					j++;
+					continue;
+				}
+
+				/*
+				 * Neither ALLOW nor DENY: copied with its
+				 * flags unchanged.
+				 */
+				if (pacep[i].a_type != ALLOW &&
+				    pacep[i].a_type != DENY) {
+					zfs_securemode_update(zfsvfs, &acep[j]);
+					j++;
+					continue;
+				}
+
+				/*
+				 * Non-directories cannot pass ACEs on, so
+				 * the inherit flags are dropped.
+				 */
+				if (ZTOV(zp)->v_type != VDIR) {
+					acep[j].a_flags &= ~ALL_INHERIT;
+					zfs_securemode_update(zfsvfs, &acep[j]);
+					j++;
+					continue;
+				}
+
+				ASSERT(ZTOV(zp)->v_type == VDIR);
+
+				/*
+				 * If we are inheriting an ACE targeted for
+				 * only files, then leave the inherit_only
+				 * one for future propagation.
+				 */
+				if ((acep[j].a_flags & (ACE_FILE_INHERIT_ACE |
+				    ACE_DIRECTORY_INHERIT_ACE)) !=
+				    ACE_FILE_INHERIT_ACE)
+					acep[j].a_flags &=
+					    ~ACE_INHERIT_ONLY_ACE;
+
+				zfs_securemode_update(zfsvfs, &acep[j]);
+				j++;
+			}
+		}
+	}
+	aclp->z_acl_count = j;
+	ASSERT(aclp->z_slots >= aclp->z_acl_count);
+
+	return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ *
+ * Picks the owner, group and mode of the new znode 'zp', stores them in
+ * zp->z_phys, and then builds the initial ACL: inherited from 'parent'
+ * when the parent has ZFS_INHERIT_ACE set, otherwise a fresh six-slot
+ * ACL.  zfs_acl_chmod() then brings the ACL in line with the mode and
+ * writes it under 'tx'.
+ */
+void
+zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
+    vattr_t *vap, dmu_tx_t *tx, cred_t *cr)
+{
+	uint64_t	mode;
+	uid_t		uid;
+	gid_t		gid;
+	int		error;
+	int		pull_down;
+	zfs_acl_t	*aclp, *paclp;
+
+	mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+	/*
+	 * Determine uid and gid.
+	 */
+	if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
+	    ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+		/* Root node, ZIL replay or xattr directory: trust vap. */
+		uid = vap->va_uid;
+		gid = vap->va_gid;
+	} else {
+		uid = crgetuid(cr);
+
+		/*
+		 * A requested gid is honored only if it is the parent's
+		 * group, a group the caller belongs to, or the policy
+		 * check succeeds.  Like the other secpolicy_*() calls in
+		 * this file, a return of 0 means "permitted", so the
+		 * result must be compared against 0 rather than used as a
+		 * truth value.
+		 */
+		if ((vap->va_mask & AT_GID) &&
+		    ((vap->va_gid == parent->z_phys->zp_gid) ||
+		    groupmember(vap->va_gid, cr) ||
+		    secpolicy_vnode_create_gid(cr) == 0))
+			gid = vap->va_gid;
+		else
+			gid = (parent->z_phys->zp_mode & S_ISGID) ?
+			    parent->z_phys->zp_gid : crgetgid(cr);
+	}
+
+	/*
+	 * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set it on the new directory.
+	 * Otherwise, if the user is neither privileged nor a member of the
+	 * file's new group, clear the file's set-GID bit.
+	 */
+
+	if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR))
+		mode |= S_ISGID;
+	else {
+		if ((mode & S_ISGID) &&
+		    secpolicy_vnode_setids_setgids(cr, gid) != 0)
+			mode &= ~S_ISGID;
+	}
+
+	zp->z_phys->zp_uid = uid;
+	zp->z_phys->zp_gid = gid;
+	zp->z_phys->zp_mode = mode;
+
+	/*
+	 * Build the starting ACL while holding the parent's z_lock.
+	 */
+	mutex_enter(&parent->z_lock);
+	pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
+	if (pull_down) {
+		mutex_enter(&parent->z_acl_lock);
+		paclp = zfs_acl_node_read(parent);
+		mutex_exit(&parent->z_acl_lock);
+		aclp = zfs_acl_inherit(zp, paclp);
+		zfs_acl_free(paclp);
+	} else {
+		aclp = zfs_acl_alloc(6);
+	}
+	mutex_exit(&parent->z_lock);
+
+	/*
+	 * zfs_acl_chmod() asserts that both locks are held.  They are
+	 * released in the reverse order of acquisition, as zfs_setacl()
+	 * does.
+	 */
+	mutex_enter(&zp->z_lock);
+	mutex_enter(&zp->z_acl_lock);
+	error = zfs_acl_chmod(zp, mode, aclp, tx);
+	mutex_exit(&zp->z_acl_lock);
+	mutex_exit(&zp->z_lock);
+	ASSERT3U(error, ==, 0);
+	zfs_acl_free(aclp);
+}
+
+/*
+ * Can this ACE be used for inheritance by 'zp'?
+ *
+ * Returns 1 when the ACE carries the file-inherit flag, or when 'zp' is
+ * a directory and the ACE carries the directory-inherit flag; returns 0
+ * otherwise.
+ */
+static int
+zfs_ace_can_use(znode_t *zp, ace_t *acep)
+{
+	int	inherit_bits = (acep->a_flags & 0xf);
+	int	is_dir = (ZTOV(zp)->v_type == VDIR);
+	int	usable = 0;
+
+	if (inherit_bits & ACE_FILE_INHERIT_ACE)
+		usable = 1;
+	else if (is_dir && (inherit_bits & ACE_DIRECTORY_INHERIT_ACE))
+		usable = 1;
+
+	return (usable);
+}
+
+/*
+ * Retrieve a file's ACL
+ *
+ * Fills in vsecp->vsa_aclcnt and/or a freshly allocated copy of the ACEs
+ * in vsecp->vsa_aclentp, depending on which of VSA_ACECNT and VSA_ACE
+ * are set in vsecp->vsa_mask.  Returns ENOSYS when neither is requested,
+ * and the zfs_zaccess() error when the caller lacks ACE_READ_ACL and is
+ * not the file's owner.
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t  *vsecp, cred_t *cr)
+{
+	ulong_t		wanted = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+	zfs_acl_t	*acl;
+	size_t		nbytes;
+	int		err;
+
+	/*
+	 * The owner may always read the ACL, even when the access
+	 * check fails.
+	 */
+	err = zfs_zaccess(zp, ACE_READ_ACL, cr);
+	if (err != 0 && crgetuid(cr) != zp->z_phys->zp_uid)
+		return (err);
+
+	if (wanted == 0)
+		return (ENOSYS);
+
+	mutex_enter(&zp->z_acl_lock);
+	acl = zfs_acl_node_read(zp);
+
+	if (wanted & VSA_ACECNT)
+		vsecp->vsa_aclcnt = acl->z_acl_count;
+
+	if (wanted & VSA_ACE) {
+		nbytes = acl->z_acl_count * sizeof (ace_t);
+		vsecp->vsa_aclentp = kmem_alloc(nbytes, KM_SLEEP);
+		bcopy(acl->z_acl, vsecp->vsa_aclentp, nbytes);
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(acl);
+
+	return (0);
+}
+
+/*
+ * Set a file's ACL
+ *
+ * Validates the ACEs supplied in 'vsecp', checks that the caller may
+ * write the ACL (ACE_WRITE_ACL, falling back to secpolicy_vnode_setdac()
+ * when the ACL says no or is undecided), then stores the new ACL in a
+ * transaction and logs it.
+ *
+ * Returns EINVAL when no ACL data is requested or the ACL is not valid,
+ * EROFS/EPERM or the policy error when access is refused, the
+ * dmu_tx_assign() error when the transaction cannot be assigned, and
+ * otherwise the result of zfs_aclset_common().
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	ace_t		*acep = vsecp->vsa_aclentp;
+	int		aclcnt = vsecp->vsa_aclcnt;
+	ulong_t		mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+	dmu_tx_t	*tx;
+	int		error;
+	int		inherit;
+	zfs_acl_t	*aclp;
+	uint64_t	seq = 0;
+
+	if (mask == 0)
+		return (EINVAL);
+
+	if (!zfs_acl_valid(zp, acep, aclcnt, &inherit))
+		return (EINVAL);
+
+	/*
+	 * Everything from here on is redone when the transaction could not
+	 * be assigned and we had to wait for the next open txg.
+	 */
+top:
+	error = zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr);
+	if (error == EACCES || error == ACCESS_UNDETERMINED) {
+		if ((error = secpolicy_vnode_setdac(cr,
+		    zp->z_phys->zp_uid)) != 0) {
+			return (error);
+		}
+	} else if (error) {
+		return (error == EROFS ? error : EPERM);
+	}
+
+	mutex_enter(&zp->z_lock);
+	mutex_enter(&zp->z_acl_lock);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+
+	/*
+	 * Also hold the external ACL object when one exists, or a new
+	 * object when the ACL has more than ACE_SLOT_CNT entries.
+	 */
+	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+		dmu_tx_hold_write(tx, zp->z_phys->zp_acl.z_acl_extern_obj,
+		    0, ZFS_ACL_SIZE(aclcnt));
+	} else if (aclcnt > ACE_SLOT_CNT) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, ZFS_ACL_SIZE(aclcnt));
+	}
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+
+		mutex_exit(&zp->z_acl_lock);
+		mutex_exit(&zp->z_lock);
+
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		return (error);
+	}
+
+	/*
+	 * Work on a private copy of the caller's ACEs.
+	 */
+	aclp = zfs_acl_alloc(aclcnt);
+	bcopy(acep, aclp->z_acl, sizeof (ace_t) * aclcnt);
+	aclp->z_acl_count = aclcnt;
+	error = zfs_aclset_common(zp, aclp, tx, &inherit);
+	ASSERT(error == 0);
+
+	zfs_acl_free(aclp);
+	seq = zfs_log_acl(zilog, tx, TX_ACL, zp, aclcnt, acep);
+	dmu_tx_commit(tx);
+
+	mutex_exit(&zp->z_acl_lock);
+	mutex_exit(&zp->z_lock);
+
+	zil_commit(zilog, seq, 0);
+
+	return (error);
+}
+
+/*
+ * Evaluate a single ACE against the wanted permission bits.
+ *
+ * '*working_mode' accumulates the wanted bits granted so far.  Returns 0
+ * once every bit in 'mode_wanted' has been granted, EACCES when a DENY
+ * ACE covers any wanted bit, and ACCESS_UNDETERMINED otherwise.
+ */
+static int
+zfs_ace_access(ace_t *zacep, int mode_wanted, int *working_mode)
+{
+	int relevant;
+
+	/* Nothing left to decide. */
+	if ((*working_mode & mode_wanted) == mode_wanted)
+		return (0);
+
+	/*
+	 * This ACE says nothing about the bits we care about, so we
+	 * haven't been specifically denied: UNDETERMINED.
+	 */
+	relevant = (zacep->a_access_mask & mode_wanted);
+	if (relevant == 0)
+		return (ACCESS_UNDETERMINED);
+
+	if (zacep->a_type == ALLOW) {
+		*working_mode |= relevant;
+		if ((*working_mode & mode_wanted) == mode_wanted)
+			return (0);
+	} else if (zacep->a_type == DENY) {
+		return (EACCES);
+	}
+
+	return (ACCESS_UNDETERMINED);
+}
+
+
+/*
+ * Walk the znode's ACL and decide whether 'cr' is granted 'v4_mode'.
+ *
+ * '*working_mode' is reset to 0 and then receives the wanted bits that
+ * ALLOW entries granted.  Returns 0 when everything wanted was granted
+ * (and always during ZIL replay), EROFS for a write-type request on a
+ * read-only file system unless the vnode is a device, EACCES on an
+ * explicit deny, EIO when an ACE has an unrecognized entry type, and
+ * ACCESS_UNDETERMINED when the ACL does not settle the question.
+ */
+static int
+zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uid_t		caller = crgetuid(cr);
+	zfs_acl_t	*acl;
+	ace_t		*aces;
+	ace_t		*cur;
+	uint_t		who;
+	gid_t		grp;
+	int		verdict = ACCESS_UNDETERMINED;
+	int		applies;
+	int		nace;
+	int		idx;
+
+	*working_mode = 0;
+
+	if (zfsvfs->z_assign >= TXG_INITIAL)		/* ZIL replay */
+		return (0);
+
+	if ((v4_mode & WRITE_MASK) &&
+	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    (!IS_DEVVP(ZTOV(zp)))) {
+		return (EROFS);
+	}
+
+	mutex_enter(&zp->z_acl_lock);
+
+	acl = zfs_acl_node_read(zp);
+	aces = acl->z_acl;
+	nace = acl->z_acl_count;
+
+	/* Stop at the first ACE that settles the request either way. */
+	for (idx = 0; idx != nace && verdict == ACCESS_UNDETERMINED; idx++) {
+		cur = &aces[idx];
+
+		if (cur->a_flags & ACE_INHERIT_ONLY_ACE)
+			continue;
+
+		who = (cur->a_flags & 0xf040);
+
+		if (who == ACE_OWNER) {
+			applies = (caller == zp->z_phys->zp_uid);
+		} else if (who == (ACE_IDENTIFIER_GROUP | ACE_GROUP) ||
+		    who == ACE_IDENTIFIER_GROUP) {
+			/*
+			 * Owning group gid is in znode not ACL
+			 */
+			if (who == (ACE_IDENTIFIER_GROUP | ACE_GROUP))
+				grp = zp->z_phys->zp_gid;
+			else
+				grp = cur->a_who;
+			applies = (groupmember(grp, cr) != 0);
+		} else if (who == ACE_EVERYONE) {
+			applies = 1;
+		} else if (who == 0) {
+			/* USER Entry */
+			applies = (caller == cur->a_who);
+		} else {
+			/* Unrecognized entry type. */
+			zfs_acl_free(acl);
+			mutex_exit(&zp->z_acl_lock);
+			return (EIO);
+		}
+
+		if (applies) {
+			verdict = zfs_ace_access(cur, v4_mode, working_mode);
+		}
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	zfs_acl_free(acl);
+
+	return (verdict);
+}
+
+
+/*
+ * Determine whether access should be granted or denied, invoking the
+ * least privilege subsystem when the ACL alone does not grant everything
+ * that was asked for.
+ *
+ * For an extended attribute directory the check is made against the
+ * base file, with data permissions mapped onto the named-attribute ones.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, cred_t *cr)
+{
+	znode_t	*target = zp;
+	znode_t	*base;
+	int	granted = 0;
+	int	on_xattr_dir;
+	int	err;
+
+	on_xattr_dir = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
+	    (ZTOV(zp)->v_type == VDIR));
+
+	/*
+	 * If attribute then validate against base file
+	 */
+	if (on_xattr_dir) {
+		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_parent, &base);
+		if (err != 0)
+			return (err);
+		target = base;
+
+		/*
+		 * fixup mode to map to xattr perms
+		 */
+		if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+			mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+			mode |= ACE_WRITE_NAMED_ATTRS;
+		}
+
+		if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+			mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+			mode |= ACE_READ_NAMED_ATTRS;
+		}
+	}
+
+	err = zfs_zaccess_common(target, mode, &granted, cr);
+
+	/*
+	 * EROFS is final.  Any other failure, or an incomplete grant,
+	 * is handed to the policy check.
+	 */
+	if (err != EROFS && (err || (granted != mode))) {
+		err = secpolicy_vnode_access(cr, ZTOV(target),
+		    target->z_phys->zp_uid, ~zfs_v4_to_unix(granted));
+	}
+
+	/* Drop the hold zfs_zget() took on the base file. */
+	if (on_xattr_dir)
+		VN_RELE(ZTOV(base));
+
+	return (err);
+}
+
+/*
+ * Special zaccess function to check for a specific NFSv4 permission.
+ * It does not call secpolicy_vnode_access() on failure, since that
+ * would probably be the wrong policy function to call; instead it is
+ * up to the caller to handle that situation.
+ */
+
+int
+zfs_zaccess_v4_perm(znode_t *zp, int mode, cred_t *cr)
+{
+	int	granted = 0;	/* filled in by the callee, unused here */
+	int	rc;
+
+	rc = zfs_zaccess_common(zp, mode, &granted, cr);
+
+	return (rc);
+}
+
+/*
+ * Translate a traditional unix VREAD/VWRITE/VEXEC mode into the
+ * native ACL format and call zfs_zaccess().  The mode is shifted
+ * right by six bits before it is translated.
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, cred_t *cr)
+{
+	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), cr));
+}
+
+/*
+ * Determine whether 'cr' may delete 'zp' from directory 'dzp'.  The ACLs
+ * are consulted first; the least privilege subsystem is only asked when
+ * they do not permit the delete.
+ *
+ * Returns 0 when the delete is permitted, EROFS on a read-only file
+ * system, and otherwise the result of secpolicy_vnode_access().
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ *      -------------------------------------------------------
+ *      |   Parent Dir  |           Target Object Permissions |
+ *      |  permissions  |                                     |
+ *      -------------------------------------------------------
+ *      |               | ACL Allows | ACL Denies| Delete     |
+ *      |               |  Delete    |  Delete   | unspecified|
+ *      -------------------------------------------------------
+ *      |  ACL Allows   | Permit     | Permit    | Permit     |
+ *      |  DELETE_CHILD |                                     |
+ *      -------------------------------------------------------
+ *      |  ACL Denies   | Permit     | Deny      | Deny       |
+ *      |  DELETE_CHILD |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL specifies |            |           |            |
+ *      | only allow    | Permit     | Permit    | Permit     |
+ *      | write and     |            |           |            |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *      | ACL denies    |            |           |            |
+ *      | write and     | Permit     | Deny      | Deny       |
+ *      | execute       |            |           |            |
+ *      -------------------------------------------------------
+ *         ^
+ *         |
+ *         No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+	int dzp_working_mode = 0;
+	int zp_working_mode = 0;
+	int dzp_error, zp_error;
+
+	/*
+	 * Arghh, this check is going to require a couple of questions
+	 * to be asked.  We want specific DELETE permissions to
+	 * take precedence over WRITE/EXECUTE.  We don't
+	 * want an ACL such as this to mess us up.
+	 * user:sloar:write_data:deny,user:sloar:delete:allow
+	 *
+	 * However, deny permissions may ultimately be overridden
+	 * by secpolicy_vnode_access().
+	 */
+
+	dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+	    &dzp_working_mode, cr);
+	zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, cr);
+
+	/*
+	 * Either check may be the one that reported the read-only file
+	 * system, so return EROFS itself rather than dzp_error.
+	 */
+	if (dzp_error == EROFS || zp_error == EROFS)
+		return (EROFS);
+
+	/*
+	 * First handle the first row
+	 */
+	if (dzp_working_mode & ACE_DELETE_CHILD)
+		return (0);
+
+	/*
+	 * Second row
+	 */
+
+	if (zp_working_mode & ACE_DELETE)
+		return (0);
+
+	/*
+	 * Third Row
+	 */
+
+	dzp_error = zfs_zaccess_common(dzp, ACE_WRITE_DATA|ACE_EXECUTE,
+	    &dzp_working_mode, cr);
+
+	if (dzp_error == EROFS)
+		return (dzp_error);
+
+	/*
+	 * The chart asks for write AND execute on the directory.  The
+	 * working mode holds whichever of the two bits were granted, so
+	 * both must be present; one of them alone is not enough.
+	 */
+	if ((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) ==
+	    (ACE_WRITE_DATA|ACE_EXECUTE))
+		return (0);
+
+	/*
+	 * Fourth Row
+	 *
+	 * NOTE(review): this cannot succeed as written, because the
+	 * second row already returned when zp_working_mode contains
+	 * ACE_DELETE and zp_working_mode has not changed since.
+	 */
+
+	if (((dzp_working_mode & (ACE_WRITE_DATA|ACE_EXECUTE)) == 0) &&
+	    (zp_working_mode & ACE_DELETE))
+		return (0);
+
+	return (secpolicy_vnode_access(cr, ZTOV(zp), dzp->z_phys->zp_uid,
+	    S_IWRITE|S_IEXEC));
+}
+
+/*
+ * Check whether 'cr' may rename 'szp' out of directory 'sdzp' into
+ * directory 'tdzp', replacing 'tzp' when it is non-NULL.
+ *
+ * Returns 0 when the rename is permitted, otherwise the error from the
+ * first check that failed.
+ */
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	/* A directory being moved needs add_subdirectory, a file add_file. */
+	add_perm = (ZTOV(szp)->v_type == VDIR) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are combination of delete permission +
+	 * add file/subdir permission.
+	 */
+
+	/*
+	 * first make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions
+	 */
+
+	if (error = zfs_zaccess_delete(sdzp, szp, cr))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it?
+	 */
+	if (tzp) {
+		if (error = zfs_zaccess_delete(tdzp, tzp, cr))
+			return (error);
+	}
+
+	/*
+	 * Now check for add permissions.  The new entry is created in the
+	 * target directory, so that is the directory whose ACL has to
+	 * grant it; the source directory was covered by the delete check
+	 * above.
+	 */
+	if (error = zfs_zaccess(tdzp, add_perm, cr))
+		return (error);
+
+	error = zfs_sticky_remove_access(sdzp, szp, cr);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_byteswap.c b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
new file mode 100644
index 000000000000..e1e857aa447c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_byteswap.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_acl.h>
+
+/*
+ * Byteswap 'ace_cnt' ACEs in place, starting at 'ace': the 32-bit
+ * a_who and a_access_mask fields and the 16-bit a_flags and a_type
+ * fields of every entry.
+ */
+void
+zfs_ace_byteswap(ace_t *ace, int ace_cnt)
+{
+	int idx;
+
+	for (idx = 0; idx != ace_cnt; idx++) {
+		ace_t *cur = &ace[idx];
+
+		cur->a_type = BSWAP_16(cur->a_type);
+		cur->a_flags = BSWAP_16(cur->a_flags);
+		cur->a_access_mask = BSWAP_32(cur->a_access_mask);
+		cur->a_who = BSWAP_32(cur->a_who);
+	}
+}
+
+/*
+ * Byteswap a buffer of ACEs.
+ *
+ * We don't know how many ACEs in the buffer are actually in use, so
+ * every whole ace_t that fits in 'size' bytes is swapped.
+ */
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+	ace_t	*aces = buf;
+	int	nslots = size / sizeof (ace_t);
+
+	zfs_ace_byteswap(aces, nslots);
+}
+
+/*
+ * Byteswap an on-disk znode in place: the two-word timestamps, the
+ * 64-bit scalar fields, the four pad words, the embedded ACL header
+ * and the ACE_SLOT_CNT ACEs stored in the znode.
+ */
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+	znode_phys_t *phys = buf;
+	int w;
+
+	ASSERT(size >= sizeof (znode_phys_t));
+
+	/* Each timestamp is swapped as two 64-bit words. */
+	for (w = 0; w < 2; w++) {
+		phys->zp_crtime[w] = BSWAP_64(phys->zp_crtime[w]);
+		phys->zp_atime[w] = BSWAP_64(phys->zp_atime[w]);
+		phys->zp_mtime[w] = BSWAP_64(phys->zp_mtime[w]);
+		phys->zp_ctime[w] = BSWAP_64(phys->zp_ctime[w]);
+	}
+
+	phys->zp_gen = BSWAP_64(phys->zp_gen);
+	phys->zp_mode = BSWAP_64(phys->zp_mode);
+	phys->zp_size = BSWAP_64(phys->zp_size);
+	phys->zp_parent = BSWAP_64(phys->zp_parent);
+	phys->zp_links = BSWAP_64(phys->zp_links);
+	phys->zp_xattr = BSWAP_64(phys->zp_xattr);
+	phys->zp_rdev = BSWAP_64(phys->zp_rdev);
+	phys->zp_flags = BSWAP_64(phys->zp_flags);
+	phys->zp_uid = BSWAP_64(phys->zp_uid);
+	phys->zp_gid = BSWAP_64(phys->zp_gid);
+
+	for (w = 0; w < 4; w++)
+		phys->zp_pad[w] = BSWAP_64(phys->zp_pad[w]);
+
+	/* ACL header, then the ACEs held in the znode itself. */
+	phys->zp_acl.z_acl_extern_obj =
+	    BSWAP_64(phys->zp_acl.z_acl_extern_obj);
+	phys->zp_acl.z_acl_count = BSWAP_32(phys->zp_acl.z_acl_count);
+	phys->zp_acl.z_acl_version = BSWAP_16(phys->zp_acl.z_acl_version);
+	phys->zp_acl.z_acl_state = BSWAP_16(phys->zp_acl.z_acl_state);
+	zfs_ace_byteswap(&phys->zp_acl.z_ace_data[0], ACE_SLOT_CNT);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..229b042c4a8c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -0,0 +1,936 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future.  The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ * 	ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land.  The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ */
+
+#include <fs/fs_subr.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/gfs.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/mount.h>
+
+typedef struct {
+	char		*se_name;	/* key compared by snapentry_compare() */
+	vnode_t		*se_root;	/* vnode the snapshot is mounted on */
+	avl_node_t	se_node;	/* presumably the sd_snaps AVL linkage */
+} zfs_snapentry_t;
+
+/*
+ * Comparison function for two zfs_snapentry_t nodes: orders them by
+ * se_name and normalizes the strcmp() result to -1, 0 or 1.
+ */
+static int
+snapentry_compare(const void *a, const void *b)
+{
+	const zfs_snapentry_t *lhs = a;
+	const zfs_snapentry_t *rhs = b;
+	int order = strcmp(lhs->se_name, rhs->se_name);
+
+	if (order == 0)
+		return (0);
+
+	return (order < 0 ? -1 : 1);
+}
+
+vnodeops_t *zfsctl_ops_root;		/* freed and cleared in zfsctl_fini() */
+vnodeops_t *zfsctl_ops_snapdir;		/* freed and cleared in zfsctl_fini() */
+vnodeops_t *zfsctl_ops_snapshot;	/* freed and cleared in zfsctl_fini() */
+
+static const fs_operation_def_t zfsctl_tops_root[];	/* forward decl */
+static const fs_operation_def_t zfsctl_tops_snapdir[];	/* forward decl */
+static const fs_operation_def_t zfsctl_tops_snapshot[];	/* forward decl */
+
+static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
+static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
+
+static gfs_opsvec_t zfsctl_opsvec[] = {	/* given to gfs_make_opsvec() */
+	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
+	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
+	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
+	{ NULL }	/* terminator */
+};
+
+typedef struct zfsctl_node {
+	gfs_dir_t	zc_gfs_private;	/* GFS state; presumably must be first */
+	uint64_t	zc_id;		/* object id used by zfsctl_common_fid() */
+} zfsctl_node_t;
+
+typedef struct zfsctl_snapdir {
+	zfsctl_node_t	sd_node;	/* common node state */
+	kmutex_t	sd_lock;	/* held across unmount/rename of a snap */
+	avl_tree_t	sd_snaps;	/* zfs_snapentry_t nodes, looked up by name */
+} zfsctl_snapdir_t;
+
+/*
+ * Root directory elements.  We have only a single static entry, 'snapshot'.
+ * The table ends with a NULL entry.
+ */
+static gfs_dirent_t zfsctl_root_entries[] = {
+	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
+	{ NULL }
+};
+
+/*
+ * Number of entries in '.zfs', including '.' and '..'.  The array length
+ * already counts the NULL terminator, so adding one more covers both dot
+ * entries: 2 array slots + 1 = 'snapshot', '.' and '..'.
+ */
+#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
+    sizeof (gfs_dirent_t)) + 1)
+
+
+/*
+ * Initialize the various GFS pieces we'll need to create and manipulate .zfs
+ * directories.  This is called from the ZFS init routine, and initializes the
+ * vnode ops vectors that we'll be using.
+ *
+ * The ops vectors are built from zfsctl_opsvec; a non-zero return from
+ * gfs_make_opsvec() trips the VERIFY().
+ */
+void
+zfsctl_init(void)
+{
+	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
+}
+
+/*
+ * Release the three '.zfs' vnode ops vectors, if they were created, and
+ * reset the global pointers to NULL.
+ */
+void
+zfsctl_fini(void)
+{
+	/*
+	 * Remove vfsctl vnode ops
+	 */
+	if (zfsctl_ops_root != NULL) {
+		vn_freevnodeops(zfsctl_ops_root);
+		zfsctl_ops_root = NULL;
+	}
+
+	if (zfsctl_ops_snapdir != NULL) {
+		vn_freevnodeops(zfsctl_ops_snapdir);
+		zfsctl_ops_snapdir = NULL;
+	}
+
+	if (zfsctl_ops_snapshot != NULL) {
+		vn_freevnodeops(zfsctl_ops_snapshot);
+		zfsctl_ops_snapshot = NULL;
+	}
+}
+
+/*
+ * Return the inode number associated with the 'snapshot' directory.
+ *
+ * This is the callback zfsctl_create() hands to gfs_root_create().  Only
+ * index 0 is expected (asserted), since zfsctl_root_entries has a single
+ * static entry; 'vp' is not used.
+ */
+/* ARGSUSED */
+static ino64_t
+zfsctl_root_inode_cb(vnode_t *vp, int index)
+{
+	ASSERT(index == 0);
+	return (ZFSCTL_INO_SNAPDIR);
+}
+
+/*
+ * Create the '.zfs' directory.  This directory is cached as part of the VFS
+ * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1.  This reference
+ * is removed when the ctldir is destroyed in the unmount.
+ *
+ * The new vnode is stored in zfsvfs->z_ctldir, which must be NULL on
+ * entry (asserted).
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	vnode_t *vp;
+	zfsctl_node_t *zcp;
+
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
+	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
+	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
+	zcp = vp->v_data;
+	zcp->zc_id = ZFSCTL_INO_ROOT;	/* read back by zfsctl_common_fid() */
+
+	/*
+	 * We're only faking the fact that we have a root of a filesystem for
+	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
+	 * for us.
+	 */
+	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
+
+	zfsvfs->z_ctldir = vp;	/* handed out by zfsctl_root() */
+}
+
+/*
+ * Destroy the '.zfs' directory.  Only called when the filesystem is
+ * unmounted, and there are no more references.  Release the vnode,
+ * which will release the hold on the vfs structure.
+ *
+ * The ASSERT checks that the cached reference is the only hold left;
+ * z_ctldir is reset to NULL afterwards.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	ASSERT(zfsvfs->z_ctldir->v_count == 1);
+	VN_RELE(zfsvfs->z_ctldir);
+	zfsvfs->z_ctldir = NULL;
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * A hold is added to the vnode before it is returned, so the caller
+ * owns one reference.
+ */
+vnode_t *
+zfsctl_root(znode_t *zp)
+{
+	vnode_t *ctldir;
+
+	ASSERT(zfs_has_ctldir(zp));
+
+	ctldir = zp->z_zfsvfs->z_ctldir;
+	VN_HOLD(ctldir);
+
+	return (ctldir);
+}
+
+/*
+ * Common open routine.  Any open that asks for write access (FWRITE)
+ * fails with EACCES; everything else succeeds.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
+{
+	return ((flags & FWRITE) ? EACCES : 0);
+}
+
+/*
+ * Common close routine.  Nothing to do here.
+ *
+ * All arguments are ignored (hence ARGSUSED) and 0 is always returned.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
+    cred_t *cr)
+{
+	return (0);
+}
+
+/*
+ * Common access routine.  A request that includes VWRITE fails with
+ * EACCES; every other request is allowed.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	return ((mode & VWRITE) ? EACCES : 0);
+}
+
+/*
+ * Common getattr function.  Fills in the attributes shared by every
+ * '.zfs' node; it does not set va_nodeid, va_nlink or va_size.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+	timestruc_t stamp;
+
+	/* A directory owned by uid/gid 0, readable and searchable by all. */
+	vap->va_type = VDIR;
+	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+	    S_IROTH | S_IXOTH;
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_rdev = 0;
+	vap->va_seq = 0;
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+
+	/*
+	 * We are a purely virtual object, so we have no
+	 * blocksize or allocated blocks.
+	 */
+	vap->va_blksize = 0;
+	vap->va_nblocks = 0;
+
+	/*
+	 * We live in the now.
+	 */
+	gethrestime(&stamp);
+	vap->va_atime = stamp;
+	vap->va_ctime = stamp;
+	vap->va_mtime = stamp;
+}
+
+/*
+ * Build the file identifier for a '.zfs' node.
+ *
+ * The node's zc_id is stored in zf_object, least significant byte
+ * first, and the generation bytes are all zero.  If the caller's buffer
+ * is shorter than SHORT_FID_LEN, the required length is stored in
+ * fidp->fid_len and ENOSPC is returned; otherwise 0 is returned.
+ */
+static int
+zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
+{
+	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_node_t	*zcp = vp->v_data;
+	uint64_t	object = zcp->zc_id;
+	zfid_short_t	*zfid;
+	int		i;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len < SHORT_FID_LEN) {
+		fidp->fid_len = SHORT_FID_LEN;
+		/* Every return after ZFS_ENTER() must be paired with an exit. */
+		ZFS_EXIT(zfsvfs);
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem.  We use the following scheme:
+ *
+ * 	ENTRY			ZFSCTL_INODE
+ * 	.zfs			1
+ * 	.zfs/snapshot		2
+ * 	.zfs/snapshot/<snap>	objectid(snap)
+ *
+ * ZFSCTL_INO_SNAP() is therefore the identity mapping from a snapshot's
+ * object id to its inode number.
+ */
+
+#define	ZFSCTL_INO_SNAP(id)	(id)
+
+/*
+ * Get root directory attributes: the fixed '.zfs' inode number, a link
+ * count and size of NROOT_ENTRIES, and the common attributes filled in
+ * by zfsctl_common_getattr().  Returns 0.
+ */
+/* ARGSUSED */
+static int
+zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+	ZFS_ENTER(zfsvfs);
+
+	vap->va_nodeid = ZFSCTL_INO_ROOT;
+	vap->va_size = NROOT_ENTRIES;
+	vap->va_nlink = NROOT_ENTRIES;
+	zfsctl_common_getattr(vp, vap);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Lookup in the '.zfs' directory.  ".." is special-cased and resolved
+ * with VFS_ROOT() on the directory's vfs; every other name goes through
+ * gfs_dir_lookup().  Returns the error from whichever call was made.
+ */
+/* ARGSUSED */
+int
+zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ZFS_ENTER(zfsvfs);
+
+	err = (strcmp(nm, "..") != 0) ?
+	    gfs_dir_lookup(dvp, nm, vpp) : VFS_ROOT(dvp->v_vfsp, vpp);
+
+	ZFS_EXIT(zfsvfs);
+
+	return (err);
+}
+
+static const fs_operation_def_t zfsctl_tops_root[] = {	/* ops for ".zfs" */
+	{ VOPNAME_OPEN,		zfsctl_common_open			},
+	{ VOPNAME_CLOSE,	zfsctl_common_close			},
+	{ VOPNAME_IOCTL,	fs_inval				},
+	{ VOPNAME_GETATTR,	zfsctl_root_getattr			},
+	{ VOPNAME_ACCESS,	zfsctl_common_access			},
+	{ VOPNAME_READDIR,	gfs_vop_readdir				},
+	{ VOPNAME_LOOKUP,	zfsctl_root_lookup			},
+	{ VOPNAME_SEEK,		fs_seek					},
+	{ VOPNAME_INACTIVE,	(fs_generic_func_p) gfs_vop_inactive	},
+	{ VOPNAME_FID,		zfsctl_common_fid			},
+	{ NULL }	/* terminator */
+};
+
+/*
+ * Build the full dataset name of a snapshot, "<objset name>@<name>",
+ * in 'zname', a buffer of 'len' bytes.
+ *
+ * Returns 0 on success, or ENAMETOOLONG when the result (including the
+ * terminating NUL) would not fit in 'len' bytes; in that case 'zname'
+ * holds only the objset name.
+ */
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+	dmu_objset_name(os, zname);
+
+	/*
+	 * Check the final length, including the '@' separator, before
+	 * appending anything, so that a buffer already filled by the
+	 * objset name is never written past.
+	 */
+	if (strlen(zname) + 1 + strlen(name) >= len)
+		return (ENAMETOOLONG);
+
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+	return (0);
+}
+
+/*
+ * Unmount the snapshot called 'name' from the snapshot directory 'dvp'
+ * and remove its entry from the directory's AVL tree.  The caller must
+ * hold sd_lock (asserted).
+ *
+ * Returns ENOENT when no entry with that name exists, the error from
+ * vn_vfswlock() or dounmount() when either fails, and 0 on success.
+ */
+static int
+zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+
+	search.se_name = (char *)name;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
+		return (ENOENT);
+
+	ASSERT(vn_ismntpt(sep->se_root));
+
+	/* this will be dropped by dounmount() */
+	if ((err = vn_vfswlock(sep->se_root)) != 0)
+		return (err);
+
+	VN_HOLD(sep->se_root);
+	err = dounmount(vn_mountedvfs(sep->se_root), force, kcred);
+	if (err != 0) {
+		/*
+		 * The snapshot stays mounted and its entry stays in the
+		 * tree, so give back the hold taken just above.
+		 */
+		VN_RELE(sep->se_root);
+		return (err);
+	}
+	ASSERT(sep->se_root->v_count == 1);
+	gfs_vop_inactive(sep->se_root, cr);
+
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	kmem_free(sep, sizeof (zfs_snapentry_t));
+
+	return (0);
+}
+
+
+/*
+ * Update the in-core state of the mounted snapshot 'sep' to reflect its
+ * new name 'nm': re-key the entry in the snapshot directory's AVL tree
+ * and rewrite the last component of the mounted vfs's mount point and
+ * resource strings.  The caller must hold sd_lock.
+ *
+ * Returns the error from vfs_lock() if it fails, otherwise 0.
+ *
+ * NOTE(review): the new paths are built in a MAXNAMELEN buffer and the
+ * bound is only ASSERTed; a non-DEBUG kernel has no runtime check.
+ */
+static int
+zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+{
+	avl_index_t where;
+	vfs_t *vfsp;
+	refstr_t *pathref;
+	char newpath[MAXNAMELEN];
+	const char *oldpath;
+	char *tail;
+	size_t prefixlen;
+	int err;
+
+	ASSERT(MUTEX_HELD(&sdp->sd_lock));
+	ASSERT(sep != NULL);
+
+	vfsp = vn_mountedvfs(sep->se_root);
+	ASSERT(vfsp != NULL);
+
+	if ((err = vfs_lock(vfsp)) != 0)
+		return (err);
+
+	/*
+	 * Change the name in the AVL tree.  The entry is removed and
+	 * re-inserted because the name is the search key.
+	 */
+	avl_remove(&sdp->sd_snaps, sep);
+	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	/*
+	 * Change the current mountpoint info:
+	 * 	- update the tail of the mntpoint path
+	 *	- update the tail of the resource path
+	 *
+	 * strncpy() copies only the prefix (up to and including the
+	 * separator) and therefore does not NUL-terminate newpath; it
+	 * must be terminated by hand before strcat() may be used on it.
+	 */
+	pathref = vfs_getmntpoint(vfsp);
+	oldpath = refstr_value(pathref);
+	VERIFY((tail = strrchr(oldpath, '/')) != NULL);
+	prefixlen = tail - oldpath + 1;
+	ASSERT(prefixlen + strlen(nm) + 1 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, prefixlen);
+	newpath[prefixlen] = '\0';
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setmntpoint(vfsp, newpath);
+
+	pathref = vfs_getresource(vfsp);
+	oldpath = refstr_value(pathref);
+	VERIFY((tail = strrchr(oldpath, '@')) != NULL);
+	prefixlen = tail - oldpath + 1;
+	ASSERT(prefixlen + strlen(nm) + 1 < MAXNAMELEN);
+	(void) strncpy(newpath, oldpath, prefixlen);
+	newpath[prefixlen] = '\0';
+	(void) strcat(newpath, nm);
+	refstr_rele(pathref);
+	vfs_setresource(vfsp, newpath);
+
+	vfs_unlock(vfsp);
+	return (0);
+}
+
+/*
+ * Rename entry point for the snapshot directory: rename snapshot 'snm'
+ * to 'tnm'.  Both names must live in the same snapshot directory.
+ *
+ * Returns ENAMETOOLONG (from zfsctl_snapshot_zname()) if either full
+ * snapshot name does not fit in MAXNAMELEN, the error from
+ * zfs_secpolicy_write() if the caller may not modify the snapshot,
+ * EINVAL if the source and target directories differ, 0 if the names
+ * are identical, and otherwise the result of updating the in-core
+ * entry and of dmu_objset_rename().
+ *
+ * NOTE(review): the in-core entry is renamed before dmu_objset_rename()
+ * runs and is not renamed back if that call fails -- confirm this is
+ * acceptable.
+ */
+static int
+zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
+    cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = sdvp->v_data;
+	zfs_snapentry_t search, *sep;
+	avl_index_t where;
+	char from[MAXNAMELEN], to[MAXNAMELEN];
+	int err;
+
+	/*
+	 * The names come from the caller, so an over-long one is an
+	 * ordinary error and must not bring the system down.
+	 */
+	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
+	if (err)
+		return (err);
+	err = zfs_secpolicy_write(from, NULL, cr);
+	if (err)
+		return (err);
+
+	/*
+	 * Cannot move snapshots out of the snapdir.
+	 */
+	if (sdvp != tdvp)
+		return (EINVAL);
+
+	if (strcmp(snm, tnm) == 0)
+		return (0);
+
+	/*
+	 * Build the target name before touching any in-core state so a
+	 * failure here leaves everything unchanged.
+	 */
+	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
+	if (err)
+		return (err);
+
+	mutex_enter(&sdp->sd_lock);
+
+	/* Only a currently mounted snapshot has an in-core entry. */
+	search.se_name = (char *)snm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		err = zfsctl_rename_snap(sdp, sep, tnm);
+		if (err) {
+			mutex_exit(&sdp->sd_lock);
+			return (err);
+		}
+	}
+
+	err = dmu_objset_rename(from, to);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * Rmdir entry point for the snapshot directory: unmount the snapshot
+ * 'name' and destroy it.
+ *
+ * Returns ENAMETOOLONG (from zfsctl_snapshot_zname()) if the full
+ * snapshot name does not fit in MAXNAMELEN, the error from
+ * zfs_secpolicy_write() if the caller may not modify the snapshot, the
+ * error from zfsctl_unmount_snap() if the unmount fails, and otherwise
+ * the result of dmu_objset_destroy().
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	char snapname[MAXNAMELEN];
+	int err;
+
+	/*
+	 * 'name' comes from the caller, so an over-long one is an
+	 * ordinary error and must not bring the system down.
+	 */
+	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
+	if (err)
+		return (err);
+	err = zfs_secpolicy_write(snapname, NULL, cr);
+	if (err)
+		return (err);
+
+	mutex_enter(&sdp->sd_lock);
+
+	err = zfsctl_unmount_snap(dvp, name, 0, cr);
+	if (err) {
+		mutex_exit(&sdp->sd_lock);
+		return (err);
+	}
+
+	err = dmu_objset_destroy(snapname);
+
+	mutex_exit(&sdp->sd_lock);
+
+	return (err);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ *
+ * On success *vpp is the held root vnode of the mounted snapshot.
+ * Returns ENOENT if the snapshot cannot be opened (or on a recursive
+ * call, see below), ENAMETOOLONG if the full snapshot name does not fit
+ * in MAXNAMELEN, or the error from domount(); on failure *vpp is not
+ * left pointing at a held vnode.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp = dvp->v_data;
+	objset_t *snap;
+	char snapname[MAXNAMELEN];
+	char *mountpoint;
+	zfs_snapentry_t *sep, search;
+	struct mounta margs;
+	vfs_t *vfsp;
+	size_t mountpoint_len;
+	avl_index_t where;
+	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	int err;
+
+	ASSERT(dvp->v_type == VDIR);
+
+	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
+		return (0);
+
+	/*
+	 * If we get a recursive call, that means we got called
+	 * from the domount() code while it was trying to look up the
+	 * spec (which looks like a local path for zfs).  We need to
+	 * add some flag to domount() to tell it not to do this lookup.
+	 */
+	if (MUTEX_HELD(&sdp->sd_lock))
+		return (ENOENT);
+
+	ZFS_ENTER(zfsvfs);
+
+	mutex_enter(&sdp->sd_lock);
+	search.se_name = (char *)nm;
+	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
+		*vpp = sep->se_root;
+		VN_HOLD(*vpp);
+		/*
+		 * If the snapshot was unmounted behind our backs, remount it.
+		 * The mount needs the full snapshot name as its spec, so it
+		 * has to be built on this path as well.
+		 */
+		if (!vn_ismntpt(*vpp)) {
+			err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN,
+			    snapname);
+			if (err == 0)
+				goto domount;
+			mutex_exit(&sdp->sd_lock);
+			ZFS_EXIT(zfsvfs);
+			VN_RELE(*vpp);
+			*vpp = NULL;
+			return (err);
+		}
+		VERIFY(traverse(vpp) == 0);
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * The requested snapshot is not currently mounted, look it up.
+	 * 'nm' comes from the caller, so an over-long name is an ordinary
+	 * error.
+	 */
+	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
+	if (err != 0) {
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+	if (dmu_objset_open(snapname, DMU_OST_ZFS,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
+		mutex_exit(&sdp->sd_lock);
+		ZFS_EXIT(zfsvfs);
+		return (ENOENT);
+	}
+
+	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
+	(void) strcpy(sep->se_name, nm);
+	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
+	avl_insert(&sdp->sd_snaps, sep, where);
+
+	dmu_objset_close(snap);
+domount:
+	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
+	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
+	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
+	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
+
+	margs.spec = snapname;
+	margs.dir = mountpoint;
+	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
+	margs.fstype = "zfs";
+	margs.dataptr = NULL;
+	margs.datalen = 0;
+	margs.optptr = NULL;
+	margs.optlen = 0;
+
+	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
+
+	kmem_free(mountpoint, mountpoint_len);
+
+	if (err == 0) {
+		VFS_RELE(vfsp);
+
+		/*
+		 * Fix up the root vnode.
+		 */
+		VERIFY(traverse(vpp) == 0);
+		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+		(*vpp)->v_vfsp = zfsvfs->z_vfs;
+		(*vpp)->v_flag &= ~VROOT;
+	}
+	mutex_exit(&sdp->sd_lock);
+	ZFS_EXIT(zfsvfs);
+
+	/*
+	 * If the mount failed, vfsp was never set and there is no root
+	 * to return.  Drop our hold only after sd_lock has been released:
+	 * for a freshly created node this is the last hold, and
+	 * zfsctl_snapshot_inactive() takes sd_lock to remove the entry.
+	 */
+	if (err != 0) {
+		VN_RELE(*vpp);
+		*vpp = NULL;
+	}
+
+	return (err);
+}
+
+/*
+ * Readdir callback for the snapshot directory: produce the directory
+ * entry for the snapshot following offset *offp.
+ *
+ *	dp	- entry to fill in (d_name and d_ino are set)
+ *	eofp	- set to 1 when there are no more snapshots
+ *	offp	- offset (snapshot list cookie) to continue from
+ *	nextp	- set to the cookie for the following entry
+ *
+ * Returns 0 on success or end of list, otherwise the error reported by
+ * dmu_snapshot_list_next().
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
+    offset_t *offp, offset_t *nextp, void *data)
+{
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	char snapname[MAXNAMELEN];
+	uint64_t id, cookie;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	cookie = *offp;
+	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname,
+	    &id, &cookie);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		/*
+		 * ENOENT just means the list is exhausted.  Any other
+		 * failure leaves snapname and id unset, so it must be
+		 * reported rather than turned into a bogus entry.
+		 */
+		if (error == ENOENT) {
+			*eofp = 1;
+			return (0);
+		}
+		return (error);
+	}
+
+	(void) strcpy(dp->d_name, snapname);
+	dp->d_ino = ZFSCTL_INO_SNAP(id);
+	*nextp = cookie;
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Create the vnode for the snapshot directory beneath pvp.  The node's
+ * private data is a zfsctl_snapdir_t whose id is ZFSCTL_INO_SNAPDIR and
+ * whose lock and (initially empty) AVL tree of snapshot entries are set
+ * up here.  Returns the new vnode.
+ */
+vnode_t *
+zfsctl_mknode_snapdir(vnode_t *pvp)
+{
+	zfsctl_snapdir_t *snapdir;
+	vnode_t *snapdir_vp;
+
+	snapdir_vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
+	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
+	    zfsctl_snapdir_readdir_cb, NULL);
+
+	snapdir = snapdir_vp->v_data;
+	snapdir->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
+
+	/* Snapshot entries are kept in an AVL tree guarded by sd_lock. */
+	mutex_init(&snapdir->sd_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&snapdir->sd_snaps, snapentry_compare,
+	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
+
+	return (snapdir_vp);
+}
+
+/*
+ * Getattr entry point for the snapshot directory.  Starts from the
+ * common zfsctl attributes, then supplies the node id and reports both
+ * the link count and the size as the number of in-core snapshot
+ * entries plus two.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	zfsctl_snapdir_t *snapdir = vp->v_data;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+
+	ZFS_ENTER(zfsvfs);
+
+	zfsctl_common_getattr(vp, vap);
+	vap->va_nodeid = gfs_file_inode(vp);
+	vap->va_size = avl_numnodes(&snapdir->sd_snaps) + 2;
+	vap->va_nlink = vap->va_size;
+
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Inactive entry point for the snapshot directory.  Tears down the
+ * per-directory lock and the (expected to be empty) tree of snapshot
+ * entries, then hands the vnode to gfs_vop_inactive().
+ */
+static void
+zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *snapdir = vp->v_data;
+
+	/* Every snapshot entry must already have been removed. */
+	ASSERT(avl_numnodes(&snapdir->sd_snaps) == 0);
+
+	mutex_destroy(&snapdir->sd_lock);
+	avl_destroy(&snapdir->sd_snaps);
+
+	gfs_vop_inactive(vp, cr);
+}
+
+/*
+ * Vnode operation table for the snapshot directory: a list of
+ * operation-name/handler pairs terminated by a NULL entry.  Unlike the
+ * root table it also supplies rename and rmdir handlers, which rename
+ * and destroy snapshots respectively.
+ */
+static const fs_operation_def_t zfsctl_tops_snapdir[] = {
+	{ VOPNAME_OPEN,		zfsctl_common_open			},
+	{ VOPNAME_CLOSE,	zfsctl_common_close			},
+	{ VOPNAME_IOCTL,	fs_inval				},
+	{ VOPNAME_GETATTR,	zfsctl_snapdir_getattr			},
+	{ VOPNAME_ACCESS,	zfsctl_common_access			},
+	{ VOPNAME_RENAME,	zfsctl_snapdir_rename			},
+	{ VOPNAME_RMDIR,	zfsctl_snapdir_remove			},
+	{ VOPNAME_READDIR,	gfs_vop_readdir				},
+	{ VOPNAME_LOOKUP,	zfsctl_snapdir_lookup			},
+	{ VOPNAME_SEEK,		fs_seek					},
+	{ VOPNAME_INACTIVE,	(fs_generic_func_p) zfsctl_snapdir_inactive },
+	{ VOPNAME_FID,		zfsctl_common_fid			},
+	{ NULL }
+};
+
+/*
+ * Create the vnode on which a snapshot gets mounted, as a child of pvp.
+ * The node's id is the objset id of the snapshot.  Returns the new
+ * vnode.
+ */
+static vnode_t *
+zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
+{
+	vnode_t *snapvp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
+	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
+	zfsctl_node_t *node = snapvp->v_data;
+
+	node->zc_id = objset;
+
+	return (snapvp);
+}
+
+/*
+ * Inactive entry point for a snapshot mount point node.  If the node is
+ * no longer referenced by anyone else, remove its entry from the parent
+ * snapshot directory's AVL tree, free the entry, and hand the vnode to
+ * gfs_vop_inactive().
+ *
+ * The parent snapshot directory is found with a ".." lookup, which
+ * returns it held; that hold is released on every path out of here.
+ *
+ * NOTE(review): when v_count > 1 this returns without adjusting
+ * vp->v_count -- confirm that is what the caller of the inactive
+ * operation expects.
+ */
+static void
+zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
+{
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	vnode_t *dvp;
+
+	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	/* Somebody picked up a new reference; leave the node alone. */
+	if (vp->v_count > 1) {
+		mutex_exit(&sdp->sd_lock);
+		VN_RELE(dvp);
+		return;
+	}
+	ASSERT(!vn_ismntpt(vp));
+
+	/*
+	 * The tree is keyed by name, so finding the entry for this vnode
+	 * takes a linear walk.
+	 */
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		if (sep->se_root == vp) {
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+			break;
+		}
+		sep = next;
+	}
+	ASSERT(sep != NULL);
+
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	gfs_vop_inactive(vp, cr);
+}
+
+
+/*
+ * These VP's should never see the light of day.  They should always
+ * be covered.
+ *
+ * Consequently the only operation supplied for snapshot mount point
+ * nodes is inactive.  The table is a flat list of name/handler pairs
+ * terminated by a NULL pair.
+ */
+static const fs_operation_def_t zfsctl_tops_snapshot[] = {
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive,
+	NULL, NULL
+};
+
+/*
+ * Find the zfsvfs of the snapshot with objset id 'objsetid' among the
+ * snapshots currently entered in the snapshot directory of vfsp.
+ *
+ * On success stores the snapshot's zfsvfs in *zfsvfsp and returns 0.
+ * Returns the error from looking up the snapshot directory or from
+ * traverse(), or EINVAL if no in-core entry has that objset id.
+ */
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *snapdir_vp, *root_vp;
+	zfsctl_snapdir_t *snapdir;
+	zfs_snapentry_t *entry;
+	int err;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	err = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &snapdir_vp,
+	    NULL, 0, NULL, kcred);
+	if (err != 0)
+		return (err);
+	snapdir = snapdir_vp->v_data;
+
+	mutex_enter(&snapdir->sd_lock);
+
+	/* The tree is keyed by name, so search it linearly by objset id. */
+	for (entry = avl_first(&snapdir->sd_snaps); entry != NULL;
+	    entry = AVL_NEXT(&snapdir->sd_snaps, entry)) {
+		zfsctl_node_t *node = entry->se_root->v_data;
+
+		if (node->zc_id == objsetid)
+			break;
+	}
+
+	if (entry == NULL) {
+		err = EINVAL;
+	} else {
+		/* Step from the covered node to the mounted root. */
+		root_vp = entry->se_root;
+		VN_HOLD(root_vp);
+		err = traverse(&root_vp);
+		if (err == 0)
+			*zfsvfsp = VTOZ(root_vp)->z_zfsvfs;
+		VN_RELE(root_vp);
+	}
+
+	mutex_exit(&snapdir->sd_lock);
+	VN_RELE(snapdir_vp);
+
+	return (err);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem.  This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ *
+ *	vfsp	- filesystem whose snapshots are to be unmounted
+ *	fflags	- flags passed through to dounmount()
+ *	cr	- credentials used for the lookup, unmount and inactive calls
+ *
+ * Returns 0 if every mounted snapshot was unmounted; otherwise the first
+ * error from the snapshot directory lookup, vn_vfswlock() or dounmount(),
+ * at which point the walk stops and later entries are left untouched.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	vnode_t *dvp, *svp;
+	zfsctl_snapdir_t *sdp;
+	zfs_snapentry_t *sep, *next;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
+	    NULL, 0, NULL, cr);
+	if (error != 0)
+		return (error);
+	sdp = dvp->v_data;
+
+	mutex_enter(&sdp->sd_lock);
+
+	sep = avl_first(&sdp->sd_snaps);
+	while (sep != NULL) {
+		svp = sep->se_root;
+		/* Fetch the successor first; sep may be freed below. */
+		next = AVL_NEXT(&sdp->sd_snaps, sep);
+
+		/*
+		 * If this snapshot is not mounted, then it must
+		 * have just been unmounted by somebody else, and
+		 * will be cleaned up by zfsctl_snapshot_inactive().
+		 */
+		if (vn_ismntpt(svp)) {
+			if ((error = vn_vfswlock(svp)) != 0)
+				goto out;
+
+			VN_HOLD(svp);
+			error = dounmount(vn_mountedvfs(svp), fflags, cr);
+			if (error) {
+				/* Entry stays in the tree; drop our hold. */
+				VN_RELE(svp);
+				goto out;
+			}
+
+			avl_remove(&sdp->sd_snaps, sep);
+			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
+			kmem_free(sep, sizeof (zfs_snapentry_t));
+
+			/*
+			 * We can't use VN_RELE(), as that will try to
+			 * invoke zfsctl_snapshot_inactive(), and that
+			 * would lead to an attempt to re-grab the sd_lock.
+			 */
+			ASSERT3U(svp->v_count, ==, 1);
+			gfs_vop_inactive(svp, cr);
+		}
+		sep = next;
+	}
+out:
+	mutex_exit(&sdp->sd_lock);
+	VN_RELE(dvp);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
new file mode 100644
index 000000000000..6df89ad0c48a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -0,0 +1,853 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include "fs/fs_subr.h"
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Lock a directory entry.  A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object.  As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ *	dzp	- znode for directory
+ *	name	- name of entry to lock
+ *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
+ *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
+ *		  ZSHARED: allow concurrent access with other ZSHARED callers.
+ *		  ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
+ *	dlpp	- pointer to the dirlock for this entry (NULL on error)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..' (and, when dzp has a
+ * control directory, ZFS_CTLDIR_NAME) with EEXIST.  Fails with ENOENT if
+ * dzp is already marked for reaping.
+ *
+ * NOTE: Unless a second shared holder has caused the name to be copied,
+ * the dirlock points at the caller's 'name' string rather than at a copy.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+	int flag)
+{
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zfs_dirlock_t	*dl;
+	uint64_t	zoid;
+	int		error;
+
+	*zpp = NULL;
+	*dlpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if (name[0] == '.' &&
+	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
+	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
+		return (EEXIST);
+
+	/*
+	 * Wait until there are no locks on this name.
+	 */
+	mutex_enter(&dzp->z_lock);
+	for (;;) {
+		if (dzp->z_reap) {
+			mutex_exit(&dzp->z_lock);
+			return (ENOENT);
+		}
+		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next)
+			if (strcmp(name, dl->dl_name) == 0)
+				break;
+		if (dl == NULL)	{
+			/*
+			 * Allocate a new dirlock and add it to the list.
+			 */
+			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+			dl->dl_name = name;
+			dl->dl_sharecnt = 0;
+			dl->dl_namesize = 0;
+			dl->dl_dzp = dzp;
+			dl->dl_next = dzp->z_dirlocks;
+			dzp->z_dirlocks = dl;
+			break;
+		}
+		/* A lock that is already shared can be joined. */
+		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+			break;
+		/*
+		 * Otherwise sleep until the holder broadcasts from
+		 * zfs_dirent_unlock(), then rescan from the top.
+		 */
+		cv_wait(&dl->dl_cv, &dzp->z_lock);
+	}
+
+	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+		/*
+		 * We're the second shared reference to dl.  Make a copy of
+		 * dl_name in case the first thread goes away before we do.
+		 * Note that we initialize the new name before storing its
+		 * pointer into dl_name, because the first thread may load
+		 * dl->dl_name at any time.  He'll either see the old value,
+		 * which is his, or the new shared copy; either is OK.
+		 */
+		dl->dl_namesize = strlen(dl->dl_name) + 1;
+		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+		bcopy(dl->dl_name, name, dl->dl_namesize);
+		dl->dl_name = name;
+	}
+
+	mutex_exit(&dzp->z_lock);
+
+	/*
+	 * We have a dirlock on the name.  (Note that it is the dirlock,
+	 * not the dzp's z_lock, that protects the name in the zap object.)
+	 * See if there's an object by this name; if so, put a hold on it.
+	 */
+	if (flag & ZXATTR) {
+		/* The xattr directory's id lives in dzp itself, not its zap. */
+		zoid = dzp->z_phys->zp_xattr;
+		error = (zoid == 0 ? ENOENT : 0);
+	} else {
+		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, &zoid);
+	}
+	if (error) {
+		/* A missing entry is only an error if ZEXISTS was asked for. */
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			zfs_dirent_unlock(dl);
+			return (EEXIST);
+		}
+		error = zfs_zget(zfsvfs, zoid, zpp);
+		if (error) {
+			zfs_dirent_unlock(dl);
+			return (error);
+		}
+	}
+
+	*dlpp = dl;
+
+	return (0);
+}
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ *
+ * If other shared holders remain, only the share count is dropped.
+ * Otherwise the dirlock is unlinked from the directory's list, waiters
+ * are woken, and the dirlock (plus the shared copy of the name, if one
+ * was made) is freed.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfs_dirlock_t **prev_dl, *cur_dl;
+
+	mutex_enter(&dzp->z_lock);
+	if (dl->dl_sharecnt > 1) {
+		dl->dl_sharecnt--;
+		mutex_exit(&dzp->z_lock);
+		return;
+	}
+	/* Walk the singly linked list to find the link pointing at dl. */
+	prev_dl = &dzp->z_dirlocks;
+	while ((cur_dl = *prev_dl) != dl)
+		prev_dl = &cur_dl->dl_next;
+	*prev_dl = dl->dl_next;
+	cv_broadcast(&dl->dl_cv);
+	mutex_exit(&dzp->z_lock);
+
+	/* dl_namesize is nonzero only if the name was copied for sharing. */
+	if (dl->dl_namesize != 0)
+		kmem_free(dl->dl_name, dl->dl_namesize);
+	cv_destroy(&dl->dl_cv);
+	kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ *	no directory entries are actually stored for them.  If this is
+ *	the root of a filesystem, then '.zfs' is also treated as a
+ *	special pseudo-directory.
+ *
+ *	IN:	dzp	- znode of the directory to search
+ *		name	- name to look up (an empty name is treated as '.')
+ *
+ *	OUT:	vpp	- vnode of the entry found
+ *
+ *	RETURN:	0 on success
+ *		error number on failure
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp)
+{
+	zfs_dirlock_t *dl;
+	znode_t *zp;
+	int error = 0;
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		*vpp = ZTOV(dzp);
+		VN_HOLD(*vpp);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zfsvfs->z_parent != zfsvfs) {
+			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+			    "snapshot", vpp, NULL, 0, NULL, kcred);
+			return (error);
+		}
+		/* Read the parent's object id under z_parent_lock. */
+		rw_enter(&dzp->z_parent_lock, RW_READER);
+		error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+		rw_exit(&dzp->z_parent_lock);
+	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+		*vpp = zfsctl_root(dzp);
+	} else {
+		/*
+		 * Ordinary name: take a shared dirlock just long enough to
+		 * obtain the entry's znode, then drop it.
+		 */
+		error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED);
+		if (error == 0) {
+			*vpp = ZTOV(zp);
+			zfs_dirent_unlock(dl);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Format x as lowercase hexadecimal without leading zeros (a single "0"
+ * for zero).  The digits are written right-justified into namebuf,
+ * which must hold 17 bytes, and a pointer to the first digit inside
+ * namebuf is returned.
+ */
+static char *
+zfs_dq_hexname(char namebuf[17], uint64_t x)
+{
+	const char *hexdigits = "0123456789abcdef";
+	char *cursor = namebuf + 16;
+
+	/* Terminate first, then fill in digits from least significant. */
+	*cursor = '\0';
+	for (;;) {
+		cursor--;
+		*cursor = hexdigits[x % 16];
+		x /= 16;
+		if (x == 0)
+			break;
+	}
+
+	return (cursor);
+}
+
+/*
+ * Add zp to the filesystem's delete queue zap object as part of
+ * transaction tx.  The entry is named by the hexadecimal form of the
+ * znode's object id and stores that id as its value.  zp must already
+ * be marked for reaping and have no links.
+ */
+void
+zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	char hexbuf[17];
+	char *entry;
+	int err;
+
+	ASSERT(zp->z_reap);
+	ASSERT3U(zp->z_phys->zp_links, ==, 0);
+
+	entry = zfs_dq_hexname(hexbuf, zp->z_id);
+	err = zap_add(zfsvfs->z_os, zfsvfs->z_dqueue, entry, 8, 1,
+	    &zp->z_id, tx);
+	ASSERT3U(err, ==, 0);
+}
+
+/*
+ * Delete the entire contents of a directory.  Return a count
+ * of the number of entries that could not be deleted.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *	so there is no need to lock its entries before deletion.
+ *	Also, it assumes the directory contents is *only* regular
+ *	files.
+ *
+ * Each entry is unlinked in its own transaction.  An entry is counted
+ * as skipped when its transaction cannot be assigned.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	znode_t		*xzp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zfs_dirlock_t	dl;
+	int skipped = 0;
+	int error;
+
+	ASSERT(dzp->z_active == 0);
+
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		/*
+		 * NOTE(review): a zfs_zget() failure is only ASSERTed; on a
+		 * non-DEBUG kernel xzp would be used uninitialized.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &xzp);
+		ASSERT3U(error, ==, 0);
+
+		ASSERT((ZTOV(xzp)->v_type == VREG) ||
+		    (ZTOV(xzp)->v_type == VLNK));
+
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_bonus(tx, dzp->z_id);
+		dmu_tx_hold_zap(tx, dzp->z_id, -1);
+		dmu_tx_hold_bonus(tx, xzp->z_id);
+		dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			VN_RELE(ZTOV(xzp));
+			skipped += 1;
+			continue;
+		}
+		/*
+		 * No real dirlock is taken (see above); fill in just the
+		 * fields zfs_link_destroy() reads from a stack dirlock.
+		 */
+		bzero(&dl, sizeof (dl));
+		dl.dl_dzp = dzp;
+		dl.dl_name = zap.za_name;
+
+		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+		ASSERT3U(error, ==, 0);
+		dmu_tx_commit(tx);
+
+		VN_RELE(ZTOV(xzp));
+	}
+	/* The walk is expected to end by running off the end of the zap. */
+	ASSERT(error == ENOENT);
+	return (skipped);
+}
+
+/*
+ * Special function to requeue the znodes for deletion that were
+ * in progress when we either crashed or umounted the file system.
+ *
+ * Returns early, without finishing the walk, if
+ * zfs_delete_thread_target(zfsvfs, -1) reports a nonzero status.
+ */
+static void
+zfs_drain_dq(zfsvfs_t *zfsvfs)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	int		error;
+
+	/*
+	 * Iterate over the contents of the delete queue.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_dqueue);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * Need some helpers?
+		 */
+		if (zfs_delete_thread_target(zfsvfs, -1) != 0)
+			return;
+
+		/*
+		 * See what kind of object we have in queue
+		 */
+
+		error = dmu_object_info(zfsvfs->z_os,
+		    zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these queue entries for reaping,
+		 * so we pull them back into core and set zp->z_reap.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for reaping.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system to be processed by the taskq.
+		 */
+		if (error != 0) {
+			continue;
+		}
+		zp->z_reap = 1;
+		VN_RELE(ZTOV(zp));
+		/*
+		 * NOTE(review): this break ends the walk after the first
+		 * entry that is successfully re-marked, so later queue
+		 * entries are not visited -- confirm this is intended.
+		 */
+		break;
+	}
+}
+
+/*
+ * Body of a delete worker thread; 'arg' is the zfsvfs_t being served.
+ *
+ * The first worker to find the delete queue neither drained nor being
+ * drained runs zfs_drain_dq() and then announces completion on
+ * z_quiesce_cv.  Every worker then takes znodes off the in-core list
+ * and removes them with zfs_rmnode(), sleeping on z_cv while the list
+ * is empty, for as long as z_thread_count does not exceed
+ * z_thread_target.  On the way out the worker decrements z_thread_count
+ * and, if it was the last one, broadcasts z_cv.
+ */
+void
+zfs_delete_thread(void *arg)
+{
+	zfsvfs_t	*zfsvfs = arg;
+	zfs_delete_t 	*zd = &zfsvfs->z_delete_head;
+	znode_t		*zp;
+	callb_cpr_t	cprinfo;
+
+	CALLB_CPR_INIT(&cprinfo, &zd->z_mutex, callb_generic_cpr, "zfs_delete");
+
+	mutex_enter(&zd->z_mutex);
+
+	if (!zd->z_drained && !zd->z_draining) {
+		zd->z_draining = B_TRUE;
+		/* z_mutex is dropped while the on-disk queue is walked. */
+		mutex_exit(&zd->z_mutex);
+		zfs_drain_dq(zfsvfs);
+		mutex_enter(&zd->z_mutex);
+		zd->z_draining = B_FALSE;
+		zd->z_drained = B_TRUE;
+		cv_broadcast(&zd->z_quiesce_cv);
+	}
+
+	while (zd->z_thread_count <= zd->z_thread_target) {
+		zp = list_head(&zd->z_znodes);
+		if (zp == NULL) {
+			ASSERT(zd->z_znode_count == 0);
+			CALLB_CPR_SAFE_BEGIN(&cprinfo);
+			cv_wait(&zd->z_cv, &zd->z_mutex);
+			CALLB_CPR_SAFE_END(&cprinfo, &zd->z_mutex);
+			continue;
+		}
+		ASSERT(zd->z_znode_count != 0);
+		list_remove(&zd->z_znodes, zp);
+		/* Tell zfs_delete_wait_empty() when the list runs dry. */
+		if (--zd->z_znode_count == 0)
+			cv_broadcast(&zd->z_quiesce_cv);
+		mutex_exit(&zd->z_mutex);
+		zfs_rmnode(zp);
+		/* Let the worker count follow the current workload. */
+		(void) zfs_delete_thread_target(zfsvfs, -1);
+		mutex_enter(&zd->z_mutex);
+	}
+
+	ASSERT(zd->z_thread_count != 0);
+	if (--zd->z_thread_count == 0)
+		cv_broadcast(&zd->z_cv);
+
+	CALLB_CPR_EXIT(&cprinfo);	/* NB: drops z_mutex */
+	thread_exit();
+}
+
+/*
+ * Workload-based sizing: one delete thread per 2^shift queued znodes.
+ */
+static int zfs_work_per_thread_shift = 11;	/* 2048 (2^11) per thread */
+
+/*
+ * Set the target number of delete threads to 'nthreads'.
+ * If nthreads == -1, choose a number based on current workload.
+ * If nthreads == 0, don't return until the threads have exited.
+ *
+ * Returns EBUSY if nthreads == -1 while the current target is 0 (the
+ * target is then left unchanged), otherwise 0.
+ */
+int
+zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads)
+{
+	zfs_delete_t *zd = &zfsvfs->z_delete_head;
+
+	mutex_enter(&zd->z_mutex);
+
+	if (nthreads == -1) {
+		if (zd->z_thread_target == 0) {
+			mutex_exit(&zd->z_mutex);
+			return (EBUSY);
+		}
+		/*
+		 * One thread per 2^zfs_work_per_thread_shift queued znodes,
+		 * clamped to [1, 2 * ncpus], plus one extra while the
+		 * on-disk queue is still being drained.
+		 */
+		nthreads = zd->z_znode_count >> zfs_work_per_thread_shift;
+		nthreads = MIN(nthreads, ncpus << 1);
+		nthreads = MAX(nthreads, 1);
+		nthreads += !!zd->z_draining;
+	}
+
+	zd->z_thread_target = nthreads;
+
+	/* Start more workers if there are fewer than the target. */
+	while (zd->z_thread_count < zd->z_thread_target) {
+		(void) thread_create(NULL, 0, zfs_delete_thread, zfsvfs,
+		    0, &p0, TS_RUN, minclsyspri);
+		zd->z_thread_count++;
+	}
+
+	/*
+	 * Shutdown request: wake the workers so they notice the new
+	 * target, and wait for the last one to broadcast on its way out.
+	 */
+	while (zd->z_thread_count > zd->z_thread_target && nthreads == 0) {
+		cv_broadcast(&zd->z_cv);
+		cv_wait(&zd->z_cv, &zd->z_mutex);
+	}
+
+	mutex_exit(&zd->z_mutex);
+
+	return (0);
+}
+
+/*
+ * Block until the on-disk delete queue has been drained and the
+ * in-core list of znodes awaiting deletion is empty.  The delete
+ * threads must be running (nonzero thread target) for this to make
+ * progress.
+ */
+void
+zfs_delete_wait_empty(zfsvfs_t *zfsvfs)
+{
+	zfs_delete_t *delq = &zfsvfs->z_delete_head;
+
+	mutex_enter(&delq->z_mutex);
+	ASSERT(delq->z_thread_target != 0);
+	/* Woken through z_quiesce_cv by the delete threads. */
+	while (!(delq->z_drained && delq->z_znode_count == 0)) {
+		ASSERT(delq->z_thread_target != 0);
+		cv_wait(&delq->z_quiesce_cv, &delq->z_mutex);
+	}
+	mutex_exit(&delq->z_mutex);
+}
+
+/*
+ * Remove an unlinked, inactive znode for good: purge its contents if
+ * it is an extended attribute directory, queue its own xattr directory
+ * (if any) for deletion, take it off the delete queue and free its
+ * object, all in one transaction.
+ *
+ * If the purge leaves entries behind, or the transaction cannot be
+ * assigned, zp is put back on the in-core delete list to be retried
+ * later.
+ */
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os = zfsvfs->z_os;
+	znode_t		*xzp = NULL;
+	char		obj_name[17];
+	dmu_tx_t	*tx;
+	uint64_t	acl_obj;
+	int		error;
+
+	ASSERT(zp->z_active == 0);
+	ASSERT(ZTOV(zp)->v_count == 0);
+	ASSERT(zp->z_phys->zp_links == 0);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR))
+		if (zfs_purgedir(zp) != 0) {
+			zfs_delete_t *delq = &zfsvfs->z_delete_head;
+			/*
+			 * Add this back to the delete list to be retried later.
+			 *
+			 * XXX - this could just busy loop on us...
+			 */
+			mutex_enter(&delq->z_mutex);
+			list_insert_tail(&delq->z_znodes, zp);
+			delq->z_znode_count++;
+			mutex_exit(&delq->z_mutex);
+			return;
+		}
+
+	/*
+	 * If the file has extended attributes, unlink the xattr dir.
+	 */
+	if (zp->z_phys->zp_xattr) {
+		error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+		ASSERT(error == 0);
+	}
+
+	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+
+	/*
+	 * Set up the transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+	if (xzp) {
+		dmu_tx_hold_bonus(tx, xzp->z_id);
+		dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+	}
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_delete_t *delq = &zfsvfs->z_delete_head;
+
+		dmu_tx_abort(tx);
+		/*
+		 * Drop the hold taken on the xattr directory above; the
+		 * retry will look it up again.  Do this before zp is
+		 * requeued and becomes visible to other delete threads.
+		 */
+		if (xzp)
+			VN_RELE(ZTOV(xzp));
+		/*
+		 * Add this back to the delete list to be retried later.
+		 *
+		 * XXX - this could just busy loop on us...
+		 */
+		mutex_enter(&delq->z_mutex);
+		list_insert_tail(&delq->z_znodes, zp);
+		delq->z_znode_count++;
+		mutex_exit(&delq->z_mutex);
+		return;
+	}
+
+	if (xzp) {
+		dmu_buf_will_dirty(xzp->z_dbuf, tx);
+		mutex_enter(&xzp->z_lock);
+		xzp->z_reap = 1;		/* mark xzp for deletion */
+		xzp->z_phys->zp_links = 0;	/* no more links to it */
+		mutex_exit(&xzp->z_lock);
+		zfs_dq_add(xzp, tx);		/* add xzp to delete queue */
+	}
+
+	/*
+	 * Remove this znode from delete queue
+	 */
+	error = zap_remove(os, zfsvfs->z_dqueue,
+	    zfs_dq_hexname(obj_name, zp->z_id), tx);
+	ASSERT3U(error, ==, 0);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+
+	if (xzp)
+		VN_RELE(ZTOV(xzp));
+}
+
+/*
+ * Link zp into dl.  Can only fail if zp has been reaped.
+ *
+ *	dl	- dirlock naming the directory (dl_dzp) and entry (dl_name)
+ *	zp	- znode to link
+ *	tx	- transaction the change belongs to
+ *	flag	- ZRENAMING: zp's link count is not changed and the reap
+ *		  check is skipped.
+ *		  ZNEW: zp's timestamps are not updated here.
+ *
+ * Returns 0 on success, or ENOENT if zp is marked for reaping.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+	znode_t *dzp = dl->dl_dzp;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);
+	int error;
+
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	mutex_enter(&zp->z_lock);
+
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_reap) {	/* no new links to reaped zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			mutex_exit(&zp->z_lock);
+			return (ENOENT);
+		}
+		zp->z_phys->zp_links++;
+	}
+	zp->z_phys->zp_parent = dzp->z_id;	/* dzp is now zp's parent */
+
+	if (!(flag & ZNEW))
+		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+	mutex_exit(&zp->z_lock);
+
+	/* Account for the new entry in the directory itself. */
+	dmu_buf_will_dirty(dzp->z_dbuf, tx);
+	mutex_enter(&dzp->z_lock);
+	dzp->z_phys->zp_size++;			/* one dirent added */
+	dzp->z_phys->zp_links += zp_is_dir;	/* ".." link from zp */
+	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+	mutex_exit(&dzp->z_lock);
+
+	/* Finally record name -> object id in the directory's zap. */
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+	    8, 1, &zp->z_id, tx);
+	ASSERT(error == 0);
+
+	return (0);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for reaping if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
+ * If 'reaped_ptr' is NULL, we put reaped znodes on the delete queue.
+ * If it's non-NULL, we use it to indicate whether the znode needs reaping,
+ * and it's the caller's job to do it.
+ *
+ * With ZRENAMING set in 'flag', zp itself is left untouched (no mount
+ * point or emptiness checks, no link count change); only the directory
+ * entry and the directory's accounting are updated.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+	int *reaped_ptr)
+{
+	znode_t *dzp = dl->dl_dzp;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);
+	int reaped = 0;
+	int error;
+
+	if (!(flag & ZRENAMING)) {
+		dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
+			return (EBUSY);
+
+		if (vn_ismntpt(vp)) {		/* don't remove mount point */
+			vn_vfsunlock(vp);
+			return (EBUSY);
+		}
+
+		mutex_enter(&zp->z_lock);
+		if (zp_is_dir && !zfs_dirempty(zp)) {	/* dir not empty */
+			mutex_exit(&zp->z_lock);
+			vn_vfsunlock(vp);
+			return (EEXIST);
+		}
+		ASSERT(zp->z_phys->zp_links > zp_is_dir);
+		/*
+		 * zp_is_dir is 0 or 1, so a directory is considered
+		 * unlinked once its count drops to 1 and a file once it
+		 * drops to 0.
+		 */
+		if (--zp->z_phys->zp_links == zp_is_dir) {
+			zp->z_reap = 1;
+			zp->z_phys->zp_links = 0;
+			reaped = 1;
+		} else {
+			zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+		}
+		mutex_exit(&zp->z_lock);
+		vn_vfsunlock(vp);
+	}
+
+	dmu_buf_will_dirty(dzp->z_dbuf, tx);
+	mutex_enter(&dzp->z_lock);
+	dzp->z_phys->zp_size--;			/* one dirent removed */
+	dzp->z_phys->zp_links -= zp_is_dir;	/* ".." link from zp */
+	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
+	mutex_exit(&dzp->z_lock);
+
+	error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, tx);
+	ASSERT(error == 0);
+
+	if (reaped_ptr != NULL)
+		*reaped_ptr = reaped;
+	else if (reaped)
+		zfs_dq_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Report whether a directory is empty, i.e. its size is exactly 2 (only
+ * "." and ".." remain) and it has no dirlocks outstanding.  May be
+ * called with or without z_lock held; without it the answer can only be
+ * considered a hint.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	if (dzp->z_phys->zp_size != 2)
+		return (0);
+
+	return (dzp->z_dirlocks == 0);
+}
+
+/*
+ * Create the extended attribute directory for zp and record its object
+ * id in zp's zp_xattr field.
+ *
+ *	IN:	zp	- znode that is to own the attribute directory
+ *		vap	- attributes for the new directory
+ *		cr	- credentials of caller
+ *
+ *	OUT:	xvpp	- vnode of the new attribute directory
+ *			  (NULL on failure)
+ *
+ *	RETURN:	0 on success
+ *		error from zfs_zaccess() if the caller lacks
+ *		ACE_WRITE_NAMED_ATTRS on zp
+ *		error from dmu_tx_assign() if the transaction cannot be
+ *		assigned
+ */
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	uint64_t xoid;
+	int error;
+
+	*xvpp = NULL;
+
+	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, cr))
+		return (error);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+	zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0);
+	ASSERT(xzp->z_id == xoid);
+	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
+	/* Point the base file at its new attribute directory. */
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	zp->z_phys->zp_xattr = xoid;
+
+	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "");
+	dmu_tx_commit(tx);
+
+	*xvpp = ZTOV(xzp);
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *
+ *	OUT:	xvpp	- pointer to extended attribute vnode
+ *
+ *	RETURN:	0 on success
+ *		error number on failure (EROFS if the directory would
+ *		have to be created on a read-only filesystem)
+ */
+int
+zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	znode_t		*xzp;
+	zfs_dirlock_t	*dl;
+	vattr_t		va;
+	int		error;
+top:
+	/* With ZXATTR the name is unused; the lock is taken on "". */
+	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR);
+	if (error)
+		return (error);
+
+	if (xzp != NULL) {
+		*xvpp = ZTOV(xzp);
+		zfs_dirent_unlock(dl);
+		return (0);
+	}
+
+	ASSERT(zp->z_phys->zp_xattr == 0);
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		zfs_dirent_unlock(dl);
+		return (EROFS);
+	}
+
+	/*
+	 * The ability to 'create' files in an attribute
+	 * directory comes from the write_xattr permission on the base file.
+	 *
+	 * The ability to 'search' an attribute directory requires
+	 * read_xattr permission on the base file.
+	 *
+	 * Once in a directory the ability to read/write attributes
+	 * is controlled by the permissions on the attribute file.
+	 */
+	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
+	va.va_type = VDIR;
+	va.va_mode = S_IFDIR | 0755;
+	va.va_uid = (uid_t)zp->z_phys->zp_uid;
+	va.va_gid = (gid_t)zp->z_phys->zp_gid;
+
+	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
+	zfs_dirent_unlock(dl);
+
+	/*
+	 * The transaction could not be assigned without waiting: wait
+	 * for the next open txg and start over from the lookup.
+	 */
+	if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+		txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+		goto top;
+	}
+
+	return (error);
+}
+
+/*
+ * Decide whether the caller may remove the entry zp from the directory
+ * zdp, taking the directory's sticky bit into account.
+ *
+ * Write access to a sticky directory is not enough on its own.  Removal
+ * is allowed when any of the following holds:
+ *
+ *	the directory is not sticky,
+ *	the caller owns the directory,
+ *	the caller owns the entry,
+ *	the entry is a plain file the caller may write to,
+ *	the caller is privileged (decided by secpolicy_vnode_remove()).
+ *
+ * Returns 0 if removal is permitted, otherwise the error from
+ * secpolicy_vnode_remove().
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	uid_t	caller;
+
+	/* ZIL replay */
+	if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL)
+		return (0);
+
+	if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+		return (0);
+
+	caller = crgetuid(cr);
+	if (caller == zdp->z_phys->zp_uid || caller == zp->z_phys->zp_uid)
+		return (0);
+
+	if (ZTOV(zp)->v_type == VREG &&
+	    zfs_zaccess(zp, ACE_WRITE_DATA, cr) == 0)
+		return (0);
+
+	return (secpolicy_vnode_remove(cr));
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
new file mode 100644
index 000000000000..e8723ffe89db
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -0,0 +1,1323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/mount.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+
+#include "zfs_namecheck.h"
+
+extern struct modlfs zfs_modlfs;
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+ldi_ident_t zfs_li = NULL;	/* set in _init(), released in _fini() */
+dev_info_t *zfs_dip;		/* our devinfo node; set by zfs_attach() */
+
+typedef int zfs_ioc_func_t(zfs_cmd_t *);
+typedef int zfs_secpolicy_func_t(const char *, const char *, cred_t *);
+
+typedef struct zfs_ioc_vec {
+	zfs_ioc_func_t		*zvec_func;	/* ioctl handler */
+	zfs_secpolicy_func_t	*zvec_secpolicy; /* permission check */
+	enum {
+		no_name,	/* zc_name is not validated */
+		pool_name,	/* checked with pool_namecheck() */
+		dataset_name	/* checked with dataset_namecheck() */
+	}			zvec_namecheck;
+} zfs_ioc_vec_t;
+
+/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *base;
+	char msg[256];
+	va_list ap;
+
+	/*
+	 * Get rid of annoying "../common/" prefix to filename: report only
+	 * what follows the last '/', or the whole name if there is none.
+	 */
+	base = strrchr(file, '/');
+	base = (base != NULL) ? base + 1 : file;
+
+	/*
+	 * Format the caller's message into a fixed-size buffer.
+	 */
+	va_start(ap, fmt);
+	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
+	va_end(ap);
+
+	/*
+	 * To get this data, use the zfs-dprintf probe as so:
+	 * dtrace -q -n 'zfs-dprintf \
+	 *	/stringof(arg0) == "dbuf.c"/ \
+	 *	{printf("%s: %s", stringof(arg1), stringof(arg3))}'
+	 * arg0 = file name
+	 * arg1 = function name
+	 * arg2 = line number
+	 * arg3 = message
+	 */
+	DTRACE_PROBE4(zfs__dprintf,
+	    char *, base, char *, func, int, line, char *, msg);
+}
+
+/*
+ * Policy for top-level read operations (list pools).  Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(const char *unused1, const char *unused2, cred_t *cr)
+{
+	return (0);	/* always permitted; no argument is examined */
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics).  Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(const char *dataset, const char *unused, cred_t *cr)
+{
+	/* A local zone may only read datasets that are visible to it. */
+	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, NULL))
+		return (ENOENT);
+
+	return (0);
+}
+
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+	int writable = 1;
+	uint64_t zoned;
+
+	/*
+	 * The dataset must be visible by this zone -- check this first
+	 * so they don't see EPERM on something they shouldn't know about.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    !zone_dataset_visible(dataset, &writable))
+		return (ENOENT);
+
+	/* If 'zoned' can't be read, report the dataset as absent. */
+	if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
+		return (ENOENT);
+
+	if (!INGLOBALZONE(curproc)) {
+		/*
+		 * In a local zone the 'zoned' property must be set and the
+		 * dataset must be writable by this zone.
+		 */
+		if (!zoned || !writable)
+			return (EPERM);
+		return (0);
+	}
+
+	/*
+	 * In the global zone, a zoned fs is accessible only to callers
+	 * that pass secpolicy_zfs().
+	 */
+	if (secpolicy_zfs(cr) && zoned)
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Policy for dataset write operations (create children, set properties, etc).
+ * Requires SYS_MOUNT privilege, and must be writable in the local zone.
+ */
+/* ARGSUSED */
+int
+zfs_secpolicy_write(const char *dataset, const char *unused, cred_t *cr)
+{
+	int zonecheck = zfs_dozonecheck(dataset, cr);
+
+	/* The zone check must pass before the privilege is consulted. */
+	if (zonecheck != 0)
+		return (zonecheck);
+	return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for operations that want to write a dataset's parent:
+ * create, destroy, snapshot, clone, restore.
+ */
+static int
+zfs_secpolicy_parent(const char *dataset, const char *unused, cred_t *cr)
+{
+	char parentname[MAXNAMELEN];
+	char *cp;
+
+	/*
+	 * Remove the @bla or /bla from the end of the name to get the parent.
+	 * strlcpy() always NUL-terminates, so strrchr() stays within bounds.
+	 */
+	(void) strlcpy(parentname, dataset, sizeof (parentname));
+	cp = strrchr(parentname, '@');
+	if (cp != NULL) {
+		cp[0] = '\0';
+	} else {
+		cp = strrchr(parentname, '/');
+		if (cp == NULL)
+			return (ENOENT);
+		cp[0] = '\0';
+	}
+
+	return (zfs_secpolicy_write(parentname, unused, cr));
+}
+
+/*
+ * Policy for setting a dataset property.  Applies the same zone checks as
+ * zfs_secpolicy_write, and additionally refuses to set 'zoned' from within
+ * a local zone.
+ */
+static int
+zfs_secpolicy_setprop(const char *dataset, const char *prop, cred_t *cr)
+{
+	int zonecheck;
+
+	zonecheck = zfs_dozonecheck(dataset, cr);
+	if (zonecheck != 0)
+		return (zonecheck);
+
+	/*
+	 * Disallow setting of 'zoned' from within a local zone.
+	 */
+	if (strcmp(prop, "zoned") == 0 && !INGLOBALZONE(curproc))
+		return (EPERM);
+
+	return (secpolicy_zfs(cr));
+}
+
+/*
+ * Security policy for setting the quota.  This is the same as
+ * zfs_secpolicy_write, except that the local zone may not change the quota at
+ * the zone-property setpoint.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_quota(const char *dataset, const char *unused, cred_t *cr)
+{
+	char setpoint[MAXNAMELEN];
+	uint64_t zoned;
+	int zonecheck;
+
+	zonecheck = zfs_dozonecheck(dataset, cr);
+	if (zonecheck != 0)
+		return (zonecheck);
+
+	if (INGLOBALZONE(curproc))
+		return (secpolicy_zfs(cr));
+
+	/*
+	 * Unprivileged users are allowed to modify the quota on things
+	 * *under* (ie. contained by) the thing they own, so the dataset's
+	 * name must be longer than the name where 'zoned' was set.
+	 */
+	if (dsl_prop_get_integer(dataset, "zoned", &zoned, setpoint))
+		return (EPERM);
+	if (!zoned) /* this shouldn't happen */
+		return (EPERM);
+	if (strlen(dataset) <= strlen(setpoint))
+		return (EPERM);
+
+	return (secpolicy_zfs(cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc.  Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr)
+{
+	/* Any failure of the sys_config check is reported as EPERM. */
+	int denied = (secpolicy_sys_config(cr, B_FALSE) != 0);
+
+	return (denied ? EPERM : 0);
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_config(zfs_cmd_t *zc, nvlist_t **nvp)
+{
+	nvlist_t *unpacked = NULL;
+	size_t len = zc->zc_config_src_size;
+	char *buf;
+	int error;
+
+	/*
+	 * Read in and unpack the user-supplied nvlist.  By this point, we know
+	 * that the user has the SYS_CONFIG privilege, so allocating arbitrary
+	 * sized regions of memory should not be a problem.
+	 */
+	if (len == 0)
+		return (EINVAL);
+
+	buf = kmem_alloc(len, KM_SLEEP);
+
+	/* Unpack only if the packed bytes were copied in successfully. */
+	error = xcopyin((void *)(uintptr_t)zc->zc_config_src, buf, len);
+	if (error == 0)
+		error = nvlist_unpack(buf, len, &unpacked, 0);
+
+	/* The packed copy is not needed again on any path. */
+	kmem_free(buf, len);
+
+	/*
+	 * On failure *nvp is left untouched.
+	 */
+	if (error != 0)
+		return (error);
+	*nvp = unpacked;
+	return (0);
+}
+
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+	nvlist_t *config;
+	char *root;
+	int error;
+
+	error = get_config(zc, &config);
+	if (error != 0)
+		return (error);
+	/* An empty zc_root is handed to spa_create() as NULL. */
+	root = (zc->zc_root[0] == '\0') ? NULL : zc->zc_root;
+	error = spa_create(zc->zc_name, config, root);
+	nvlist_free(config);
+	return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+	return (spa_destroy(zc->zc_name));	/* spa layer's result */
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+	nvlist_t *config;
+	uint64_t guid;
+	int error;
+
+	error = get_config(zc, &config);
+	if (error != 0)
+		return (error);
+
+	/* The config must carry the same pool guid the caller passed in. */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0 &&
+	    guid == zc->zc_pool_guid)
+		error = spa_import(zc->zc_name, config,
+		    zc->zc_root[0] == '\0' ? NULL : zc->zc_root);
+	else
+		error = EINVAL;
+	nvlist_free(config);
+	return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+	return (spa_export(zc->zc_name));	/* spa layer's result */
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+	nvlist_t *configs = spa_all_configs(&zc->zc_cookie);
+	size_t packed_size = 0;
+	char *packed = NULL;
+	int error = ENOMEM;
+
+	if (configs == NULL)
+		return (EEXIST);
+
+	VERIFY(nvlist_pack(configs, &packed, &packed_size,
+	    NV_ENCODE_NATIVE, 0) == 0);
+
+	/* Copy out only if the caller's buffer can hold the whole list. */
+	if (packed_size <= zc->zc_config_dst_size)
+		error = xcopyout(packed,
+		    (void *)(uintptr_t)zc->zc_config_dst, packed_size);
+
+	/* The packed size is reported back even when it did not fit. */
+	zc->zc_config_dst_size = packed_size;
+
+	kmem_free(packed, packed_size);
+	nvlist_free(configs);
+	return (error);
+}
+
+static int
+zfs_ioc_pool_guid(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error = spa_open(zc->zc_name, &spa, FTAG);
+
+	if (error != 0)
+		return (error);
+
+	zc->zc_pool_guid = spa_guid(spa);
+	spa_close(spa, FTAG);
+	return (0);
+}
+
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+	nvlist_t *config = NULL;	/* must not be read uninitialized */
+	char *packed = NULL;
+	size_t size = 0;
+	int error;
+
+	error = spa_get_stats(zc->zc_name, &config);
+
+	if (config != NULL) {
+		VERIFY(nvlist_pack(config, &packed, &size,
+		    NV_ENCODE_NATIVE, 0) == 0);
+		/* Copyout failures override spa_get_stats()'s error. */
+		if (size > zc->zc_config_dst_size)
+			error = ENOMEM;
+		else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
+		    size))
+			error = EFAULT;
+
+		zc->zc_config_dst_size = size;
+
+		kmem_free(packed, size);
+		nvlist_free(config);
+	} else {
+		ASSERT(error != 0);
+	}
+
+	return (error);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * user land knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+	nvlist_t *request, *result;
+	size_t packed_size = 0;
+	char *packed = NULL;
+	int error;
+
+	error = get_config(zc, &request);
+	if (error != 0)
+		return (error);
+
+	result = spa_tryimport(request);
+	nvlist_free(request);
+	if (result == NULL)
+		return (EINVAL);
+
+	VERIFY(nvlist_pack(result, &packed, &packed_size,
+	    NV_ENCODE_NATIVE, 0) == 0);
+
+	/* Copy out only if the caller's buffer can hold the whole list. */
+	if (packed_size <= zc->zc_config_dst_size)
+		error = xcopyout(packed,
+		    (void *)(uintptr_t)zc->zc_config_dst, packed_size);
+	else
+		error = ENOMEM;
+
+	/* The packed size is reported back even when it did not fit. */
+	zc->zc_config_dst_size = packed_size;
+	kmem_free(packed, packed_size);
+	nvlist_free(result);
+	return (error);
+}
+
+static int
+zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error = spa_open(zc->zc_name, &spa, FTAG);
+
+	if (error != 0)
+		return (error);
+	/* zc_cookie is passed through as spa_scrub()'s second argument. */
+	error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error = spa_open(zc->zc_name, &spa, FTAG);
+
+	if (error != 0)
+		return (error);
+
+	spa_freeze(spa);
+	spa_close(spa, FTAG);
+	return (0);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+	nvlist_t *config;
+	spa_t *spa;
+	int error;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	/* Fetch the user's config and add it while the pool is open. */
+	error = get_config(zc, &config);
+	if (error == 0) {
+		error = spa_vdev_add(spa, config);
+		nvlist_free(config);
+	}
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+	return (ENOTSUP);	/* not supported; zc is ignored */
+}
+
+static int
+zfs_ioc_vdev_online(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	/* zc_prop_value carries the path handed to vdev_online(). */
+	error = vdev_online(spa, zc->zc_prop_value);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_vdev_offline(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	/* zc_prop_value carries the path handed to vdev_offline(). */
+	error = vdev_offline(spa, zc->zc_prop_value);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+	int replacing = zc->zc_cookie;
+	nvlist_t *config;
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	/* zc_prop_value is the path; the config comes from user space. */
+	error = get_config(zc, &config);
+	if (error == 0) {
+		error = spa_vdev_attach(spa, zc->zc_prop_value, config,
+		    replacing);
+		nvlist_free(config);
+	}
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+		return (error);
+
+	/*
+	 * zc_prop_value carries the path handed to spa_vdev_detach().
+	 */
+	error = spa_vdev_detach(spa, zc->zc_prop_value, 0, B_FALSE);
+	spa_close(spa, FTAG);
+	return (error);
+}
+
+static int
+zfs_get_stats(zfs_cmd_t *zc)
+{
+	char *name = zc->zc_name;
+	zfs_stats_t *zs = &zc->zc_zfs_stats;
+	int error;
+
+	bzero(zs, sizeof (zfs_stats_t));
+	/* Fetch each property and its setpoint; stop at the first failure. */
+	if ((error = dsl_prop_get_integer(name, "atime",
+	    &zs->zs_atime, zs->zs_atime_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "recordsize",
+	    &zs->zs_recordsize, zs->zs_recordsize_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "readonly",
+	    &zs->zs_readonly, zs->zs_readonly_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "devices",
+	    &zs->zs_devices, zs->zs_devices_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "setuid",
+	    &zs->zs_setuid, zs->zs_setuid_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "exec",
+	    &zs->zs_exec, zs->zs_exec_setpoint)) != 0 ||
+	    (error = dsl_prop_get_string(name, "mountpoint", zs->zs_mountpoint,
+	    sizeof (zs->zs_mountpoint), zs->zs_mountpoint_setpoint)) != 0 ||
+	    (error = dsl_prop_get_string(name, "sharenfs", zs->zs_sharenfs,
+	    sizeof (zs->zs_sharenfs), zs->zs_sharenfs_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "aclmode",
+	    &zs->zs_acl_mode, zs->zs_acl_mode_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "snapdir",
+	    &zs->zs_snapdir, zs->zs_snapdir_setpoint)) != 0 ||
+	    (error = dsl_prop_get_integer(name, "aclinherit",
+	    &zs->zs_acl_inherit, zs->zs_acl_inherit_setpoint)) != 0)
+		return (error);	/* later fields of *zs stay zeroed */
+
+	return (0);
+}
+
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+	objset_t *os = NULL;
+	int error;
+
+	/*
+	 * This is ugly: dmu_objset_open() can return EBUSY if
+	 * the objset is held exclusively. Fortunately this hold is
+	 * only for a short while, so we retry here.
+	 * This avoids user code having to handle EBUSY,
+	 * for example for a "zfs list".
+	 */
+	for (;;) {
+		error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+		if (error != EBUSY)
+			break;
+		delay(1);
+	}
+	if (error != 0)
+		return (error);
+
+	dmu_objset_stats(os, &zc->zc_objset_stats);
+
+	/* Any other objset type has nothing more to add; error is still 0. */
+	switch (zc->zc_objset_stats.dds_type) {
+
+	case DMU_OST_ZFS:
+		error = zfs_get_stats(zc);
+		break;
+
+	case DMU_OST_ZVOL:
+		error = zvol_get_stats(zc, os);
+		break;
+	}
+
+	dmu_objset_close(os);
+	return (error);
+}
+
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+	dsl_dir_t *dd;
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+	int error;
+	char *p;
+
+	dd = dsl_dir_open(zc->zc_name, FTAG, NULL);
+	if (dd == NULL)
+		return (ESRCH);
+
+	if (dd->dd_phys->dd_child_dir_zapobj == 0) {	/* no child zap */
+		dsl_dir_close(dd, FTAG);
+		return (ESRCH);
+	}
+	/* Append '/' unless zc_name already ends in one; names go at p. */
+	p = strrchr(zc->zc_name, '/');
+	if (p == NULL || p[1] != '\0')
+		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+	p = zc->zc_name + strlen(zc->zc_name);
+	/* In a local zone, keep going until a visible child is found. */
+	do {
+		zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset,
+		    dd->dd_phys->dd_child_dir_zapobj, zc->zc_cookie);
+
+		error = zap_cursor_retrieve(&cursor, &attr);
+		if (error == ENOENT)
+			error = ESRCH;
+		if (error != 0) {
+			dsl_dir_close(dd, FTAG);
+			*p = '\0';	/* drop any child name written at p */
+			return (error);
+		}
+
+		(void) strlcpy(p, attr.za_name, sizeof (zc->zc_name) -
+		    (p - zc->zc_name));
+		/* zc_cookie is where the cursor restarts next time round. */
+		zap_cursor_advance(&cursor);
+		zc->zc_cookie = zap_cursor_serialize(&cursor);
+
+	} while (!INGLOBALZONE(curproc) &&
+	    !zone_dataset_visible(zc->zc_name, NULL));
+
+	dsl_dir_close(dd, FTAG);
+
+	/*
+	 * If it's a hidden dataset, don't try to get stats for it.
+	 * User land will skip over it.
+	 */
+	if (strchr(zc->zc_name, '$') != NULL)
+		return (0);
+
+	error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+	return (error);
+}
+
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+	dsl_dataset_t *ds;
+	int error;
+
+	/*
+	 * This is ugly: dsl_dataset_open() can return EBUSY if
+	 * the objset is held exclusively. Fortunately this hold is
+	 * only for a short while, so we retry here.
+	 * This avoids user code having to handle EBUSY,
+	 * for example for a "zfs list -s".
+	 */
+	for (;;) {
+		error = dsl_dataset_open(zc->zc_name,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
+		if (error != EBUSY)
+			break;
+		delay(1);
+	}
+	if (error == ENOENT)
+		return (ESRCH);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * If ds_snapnames_zapobj is 0, someone is trying to iterate over
+	 * snapshots of a snapshot.  In this case, pretend that it has no
+	 * snapshots; otherwise zap_cursor_retrieve() will blow up.
+	 */
+	if (ds->ds_phys->ds_snapnames_zapobj == 0) {
+		error = ESRCH;
+		goto out;
+	}
+
+	zap_cursor_init_serialized(&cursor,
+	    ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_phys->ds_snapnames_zapobj, zc->zc_cookie);
+	error = zap_cursor_retrieve(&cursor, &attr);
+	if (error != 0) {
+		/* ENOENT from the cursor is reported as ESRCH. */
+		if (error == ENOENT)
+			error = ESRCH;
+		goto out;
+	}
+
+	/* Build "<dataset>@<snapshot>" in place, refusing to truncate. */
+	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+	    sizeof (zc->zc_name) ||
+	    strlcat(zc->zc_name, attr.za_name, sizeof (zc->zc_name)) >=
+	    sizeof (zc->zc_name)) {
+		error = ENAMETOOLONG;
+		goto out;
+	}
+
+	zap_cursor_advance(&cursor);
+	zc->zc_cookie = zap_cursor_serialize(&cursor);
+	error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+
+out:
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	return (error);
+}
+
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{	/* hands the zc property fields straight to dsl_prop_set() */
+	return (dsl_prop_set(zc->zc_name, zc->zc_prop_name,
+	    zc->zc_intsz, zc->zc_numints, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_set_quota(zfs_cmd_t *zc)
+{	/* zc_cookie is passed as dsl_dir_set_quota()'s value argument */
+	return (dsl_dir_set_quota(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_reservation(zfs_cmd_t *zc)
+{	/* zc_cookie is passed as dsl_dir_set_reservation()'s value */
+	return (dsl_dir_set_reservation(zc->zc_name, zc->zc_cookie));
+}
+
+static int
+zfs_ioc_set_volsize(zfs_cmd_t *zc)
+{
+	return (zvol_set_volsize(zc));	/* zvol code does the work */
+}
+
+static int
+zfs_ioc_set_volblocksize(zfs_cmd_t *zc)
+{
+	return (zvol_set_volblocksize(zc));	/* zvol code does the work */
+}
+
+static int
+zfs_ioc_create_minor(zfs_cmd_t *zc)
+{
+	return (zvol_create_minor(zc));	/* zvol code does the work */
+}
+
+static int
+zfs_ioc_remove_minor(zfs_cmd_t *zc)
+{
+	return (zvol_remove_minor(zc));	/* zvol code does the work */
+}
+
+/*
+ * Search the vfs list for a specified resource.  Returns a pointer to it
+ * or NULL if no suitable entry is found. The caller of this routine
+ * is responsible for releasing the returned vfs pointer.
+ */
+static vfs_t *
+zfs_get_vfs(const char *resource)
+{
+	struct vfs *match = NULL;
+	struct vfs *cur;
+
+	vfs_list_read_lock();
+	/* The vfs list is circular: walk it once, starting at rootvfs. */
+	cur = rootvfs;
+	do {
+		if (strcmp(refstr_value(cur->vfs_resource), resource) != 0)
+			continue;
+		VFS_HOLD(cur);
+		match = cur;
+		break;
+	} while ((cur = cur->vfs_next) != rootvfs);
+	vfs_list_unlock();
+	return (match);
+}
+
+static void
+zfs_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+	zfs_cmd_t *zc = arg;	/* the zc given to dmu_objset_create() */
+	zfs_create_fs(os, (cred_t *)(uintptr_t)zc->zc_cred, tx);
+}
+
+static int
+zfs_ioc_create(zfs_cmd_t *zc)
+{
+	dmu_objset_type_t type = zc->zc_objset_type;
+	void (*cbfunc)(objset_t *os, void *arg, dmu_tx_t *tx);
+	objset_t *origin;
+	int error;
+
+	/* Only filesystems and volumes can be created through this ioctl. */
+	if (type == DMU_OST_ZFS)
+		cbfunc = zfs_create_cb;
+	else if (type == DMU_OST_ZVOL)
+		cbfunc = zvol_create_cb;
+	else
+		return (EINVAL);
+
+	if (zc->zc_filename[0] != '\0') {
+		/*
+		 * We're creating a clone of an existing snapshot, which
+		 * is named by zc_filename.
+		 */
+		zc->zc_filename[sizeof (zc->zc_filename) - 1] = '\0';
+		if (dataset_namecheck(zc->zc_filename, NULL, NULL) != 0)
+			return (EINVAL);
+
+		error = dmu_objset_open(zc->zc_filename, type,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, &origin);
+		if (error)
+			return (error);
+		error = dmu_objset_create(zc->zc_name, type, origin,
+		    NULL, NULL);
+		dmu_objset_close(origin);
+		return (error);
+	}
+
+	if (strchr(zc->zc_name, '@') != NULL) {
+		/*
+		 * We're taking a snapshot of an existing dataset.
+		 */
+		return (dmu_objset_create(zc->zc_name, type, NULL,
+		    NULL, NULL));
+	}
+
+	/*
+	 * We're creating a new dataset.  Volumes get their size and block
+	 * size validated first.
+	 */
+	if (type == DMU_OST_ZVOL) {
+		if ((error = zvol_check_volsize(zc)) != 0)
+			return (error);
+		if ((error = zvol_check_volblocksize(zc)) != 0)
+			return (error);
+	}
+	return (dmu_objset_create(zc->zc_name, type, NULL, cbfunc, zc));
+}
+
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+	if (strchr(zc->zc_name, '@') != NULL &&
+	    zc->zc_objset_type == DMU_OST_ZFS) {
+		vfs_t *vfsp;
+		int err;
+
+		/*
+		 * Snapshots under .zfs control must be unmounted
+		 * before they can be destroyed.
+		 */
+		if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+			/*
+			 * Always force the unmount for snapshots.
+			 */
+			int flag = MS_FORCE;
+
+			if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+				VFS_RELE(vfsp);	/* drop zfs_get_vfs() hold */
+				return (err);
+			}
+			VFS_RELE(vfsp);	/* hold dropped before dounmount() */
+			if ((err = dounmount(vfsp, flag, kcred)) != 0)
+				return (err);
+		}
+	}
+	/* Reached when nothing was mounted or the unmount succeeded. */
+	return (dmu_objset_destroy(zc->zc_name));
+}
+
+static int
+zfs_ioc_rollback(zfs_cmd_t *zc)
+{
+	return (dmu_objset_rollback(zc->zc_name));	/* by dataset name */
+}
+
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+	/* The new name is in zc_prop_value: terminate and validate it. */
+	zc->zc_prop_value[sizeof (zc->zc_prop_value) - 1] = '\0';
+	if (dataset_namecheck(zc->zc_prop_value, NULL, NULL) != 0)
+		return (EINVAL);
+
+	if (strchr(zc->zc_name, '@') != NULL &&
+	    zc->zc_objset_type == DMU_OST_ZFS) {
+		vfs_t *vfsp;
+		int err;
+
+		/*
+		 * Snapshots under .zfs control must be unmounted
+		 * before they can be renamed.
+		 */
+		if ((vfsp = zfs_get_vfs(zc->zc_name)) != NULL) {
+			/*
+			 * Always force the unmount for snapshots.
+			 */
+			int flag = MS_FORCE;
+
+			if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
+				VFS_RELE(vfsp);	/* drop zfs_get_vfs() hold */
+				return (err);
+			}
+			VFS_RELE(vfsp);	/* hold dropped before dounmount() */
+			if ((err = dounmount(vfsp, flag, kcred)) != 0)
+				return (err);
+		}
+	}
+
+	return (dmu_objset_rename(zc->zc_name, zc->zc_prop_value));
+}
+
+static int
+zfs_ioc_recvbackup(zfs_cmd_t *zc)
+{
+	file_t *fp;
+	int error, fd;
+
+	fd = zc->zc_cookie;	/* keep: &zc_cookie is passed below */
+	fp = getf(fd);
+	if (fp == NULL)
+		return (EBADF);
+	error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie,
+	    fp->f_vnode, fp->f_offset);
+	releasef(fd);		/* pairs with the getf() above */
+	return (error);
+}
+
+static int
+zfs_ioc_sendbackup(zfs_cmd_t *zc)
+{
+	objset_t *fromsnap = NULL;
+	objset_t *tosnap;
+	file_t *fp;
+	int error;
+
+	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &tosnap);
+	if (error)
+		return (error);
+
+	if (zc->zc_prop_value[0] != '\0') {
+		/* User-supplied name: make sure it is terminated. */
+		zc->zc_prop_value[sizeof (zc->zc_prop_value) - 1] = '\0';
+		error = dmu_objset_open(zc->zc_prop_value, DMU_OST_ANY,
+		    DS_MODE_STANDARD | DS_MODE_READONLY, &fromsnap);
+		if (error) {
+			dmu_objset_close(tosnap);
+			return (error);
+		}
+	}
+
+	fp = getf(zc->zc_cookie);
+	if (fp == NULL) {
+		dmu_objset_close(tosnap);
+		if (fromsnap)
+			dmu_objset_close(fromsnap);
+		return (EBADF);
+	}
+	error = dmu_sendbackup(tosnap, fromsnap, fp->f_vnode);
+	releasef(zc->zc_cookie);
+	if (fromsnap)
+		dmu_objset_close(fromsnap);
+	dmu_objset_close(tosnap);
+	return (error);
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[] = {	/* indexed by (cmd - ZFS_IOC) */
+	{ zfs_ioc_pool_create,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_pool_destroy,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_pool_import,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_pool_export,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_pool_configs,		zfs_secpolicy_none,	no_name },
+	{ zfs_ioc_pool_guid,		zfs_secpolicy_read,	pool_name },
+	{ zfs_ioc_pool_stats,		zfs_secpolicy_read,	pool_name },
+	{ zfs_ioc_pool_tryimport,	zfs_secpolicy_config,	no_name },
+	{ zfs_ioc_pool_scrub,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_pool_freeze,		zfs_secpolicy_config,	no_name },
+	{ zfs_ioc_vdev_add,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_vdev_remove,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_vdev_online,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_vdev_offline,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_vdev_attach,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_vdev_detach,		zfs_secpolicy_config,	pool_name },
+	{ zfs_ioc_objset_stats,		zfs_secpolicy_read,	dataset_name },
+	{ zfs_ioc_dataset_list_next,	zfs_secpolicy_read,	dataset_name },
+	{ zfs_ioc_snapshot_list_next,	zfs_secpolicy_read,	dataset_name },
+	{ zfs_ioc_set_prop,		zfs_secpolicy_setprop,	dataset_name },
+	{ zfs_ioc_set_quota,		zfs_secpolicy_quota,	dataset_name },
+	{ zfs_ioc_set_reservation,	zfs_secpolicy_write,	dataset_name },
+	{ zfs_ioc_set_volsize,		zfs_secpolicy_config,	dataset_name },
+	{ zfs_ioc_set_volblocksize,	zfs_secpolicy_config,	dataset_name },
+	{ zfs_ioc_create_minor,		zfs_secpolicy_config,	dataset_name },
+	{ zfs_ioc_remove_minor,		zfs_secpolicy_config,	dataset_name },
+	{ zfs_ioc_create,		zfs_secpolicy_parent,	dataset_name },
+	{ zfs_ioc_destroy,		zfs_secpolicy_parent,	dataset_name },
+	{ zfs_ioc_rollback,		zfs_secpolicy_write,	dataset_name },
+	{ zfs_ioc_rename,		zfs_secpolicy_write,	dataset_name },
+	{ zfs_ioc_recvbackup,		zfs_secpolicy_write,	dataset_name },
+	{ zfs_ioc_sendbackup,		zfs_secpolicy_write,	dataset_name },
+};
+
+static int
+zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+	zfs_cmd_t *zc;
+	uint_t vec;
+	int error;
+
+	if (getminor(dev) != 0)
+		return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
+
+	vec = cmd - ZFS_IOC;
+	/* A cmd below ZFS_IOC wraps to a huge vec and is rejected too. */
+	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+		return (EINVAL);
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+	error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t));
+	if (error == 0) {
+		/* Terminate the user's name before secpolicy parses it. */
+		zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+		zc->zc_cred = (uintptr_t)cr;
+		zc->zc_dev = dev;
+		error = zfs_ioc_vec[vec].zvec_secpolicy(zc->zc_name,
+		    zc->zc_prop_name, cr);
+	}
+
+	/*
+	 * Ensure that all pool/dataset names are valid before we pass down to
+	 * the lower layers.
+	 */
+	if (error == 0) {
+		switch (zfs_ioc_vec[vec].zvec_namecheck) {
+		case pool_name:
+			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+				error = EINVAL;
+			break;
+
+		case dataset_name:
+			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+				error = EINVAL;
+			break;
+		}
+	}
+
+	if (error == 0)
+		error = zfs_ioc_vec[vec].zvec_func(zc);
+	/* ENOMEM still copies zc out, so handlers can report sizes. */
+	if (error == 0 || error == ENOMEM) {
+		int rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t));
+		if (error == 0)
+			error = rc;
+	}
+
+	kmem_free(zc, sizeof (zfs_cmd_t));
+	return (error);
+}
+
+static int
+zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	/*
+	 * Only DDI_ATTACH is handled; the control node is created as
+	 * the minor 0 character device named "zfs".
+	 */
+	if (cmd != DDI_ATTACH ||
+	    ddi_create_minor_node(dip, "zfs", S_IFCHR, 0,
+	    DDI_PSEUDO, 0) == DDI_FAILURE)
+		return (DDI_FAILURE);
+
+	zfs_dip = dip;
+	ddi_report_dev(dip);
+	return (DDI_SUCCESS);
+}
+
+static int
+zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	/*
+	 * Fail if anything is still busy, or for any command other
+	 * than DDI_DETACH.
+	 */
+	if (spa_busy() || zfs_busy() || zvol_busy() || cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	zfs_dip = NULL;
+	ddi_prop_remove_all(dip);
+	ddi_remove_minor_node(dip, NULL);
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
+		*result = zfs_dip;
+		return (DDI_SUCCESS);
+	}
+
+	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
+		/* The minor number is reported as the instance. */
+		*result = (void *)(uintptr_t)getminor((dev_t)arg);
+		return (DDI_SUCCESS);
+	}
+	return (DDI_FAILURE);
+}
+
+/*
+ * OK, so this is a little weird.
+ *
+ * /dev/zfs is the control node, i.e. minor 0.
+ * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0.
+ *
+ * /dev/zfs has basically nothing to do except serve up ioctls,
+ * so most of the standard driver entry points are in zvol.c.
+ */
+static struct cb_ops zfs_cb_ops = {
+	zvol_open,	/* open */
+	zvol_close,	/* close */
+	zvol_strategy,	/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	zvol_read,	/* read */
+	zvol_write,	/* write */
+	zfsdev_ioctl,	/* ioctl (minor 0 here; others go to zvol_ioctl) */
+	nodev,		/* devmap */
+	nodev,		/* mmap */
+	nodev,		/* segmap */
+	nochpoll,	/* poll */
+	ddi_prop_op,	/* prop_op */
+	NULL,		/* streamtab */
+	D_NEW | D_MP | D_64BIT,		/* Driver compatibility flag */
+	CB_REV,		/* version */
+	zvol_aread,	/* async read */
+	zvol_awrite,	/* async write */
+};
+
+static struct dev_ops zfs_dev_ops = {
+	DEVO_REV,	/* version */
+	0,		/* refcnt */
+	zfs_info,	/* info */
+	nulldev,	/* identify */
+	nulldev,	/* probe */
+	zfs_attach,	/* attach */
+	zfs_detach,	/* detach */
+	nodev,		/* reset */
+	&zfs_cb_ops,	/* driver operations */
+	NULL		/* no bus operations */
+};
+
+static struct modldrv zfs_modldrv = {
+	&mod_driverops, "ZFS storage pool version 1", &zfs_dev_ops
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&zfs_modlfs,	/* filesystem linkage (extern above) */
+	(void *)&zfs_modldrv,	/* driver linkage defined just above */
+	NULL
+};
+
+int
+_init(void)
+{
+	int error;
+	spa_init(FREAD | FWRITE);	/* before mod_install() exposes us */
+	zfs_init();
+	zvol_init();
+	if ((error = mod_install(&modlinkage)) != 0) {
+		zvol_fini();		/* undo the inits, in reverse */
+		zfs_fini();
+		spa_fini();
+		return (error);
+	}
+	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
+	ASSERT(error == 0);
+	return (0);
+}
+
+int
+_fini(void)
+{
+	int error;
+
+	/* Refuse to unload while any subsystem is still in use. */
+	if (spa_busy() || zfs_busy() || zvol_busy())
+		return (EBUSY);
+
+	error = mod_remove(&modlinkage);
+	if (error != 0)
+		return (error);
+	/* Shut the subsystems down in the reverse of their _init() order. */
+	zvol_fini();
+	zfs_fini();
+	spa_fini();
+	ldi_ident_release(zfs_li);
+	zfs_li = NULL;
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));	/* mod_info()'s result */
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
new file mode 100644
index 000000000000..dbfd87f67adf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/ddi.h>
+
+/*
+ * All the functions in this file are used to construct the log entries
+ * to record transactions. They allocate an intent log transaction
+ * structure (itx_t) and save within it all the information necessary to
+ * possibly replay the transaction. The itx is then assigned a sequence
+ * number and inserted in the in-memory list anchored in the zilog.
+ */
+
+/*
+ * zfs_log_create() builds the itx for a TX_CREATE, TX_MKDIR or TX_MKXATTR
+ * transaction; the entry name is stored right after the lr_create_t.
+ */
+uint64_t
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *dzp, znode_t *zp, char *name)
+{
+	size_t namelen = strlen(name) + 1;	/* count the terminator too */
+	lr_create_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + namelen);
+	rec = (lr_create_t *)&itx->itx_lr;
+	bcopy(name, (char *)(rec + 1), namelen);
+	rec->lr_doid = dzp->z_id;
+	rec->lr_foid = zp->z_id;
+	rec->lr_rdev = zp->z_phys->zp_rdev;
+	rec->lr_gen = zp->z_phys->zp_gen;
+	rec->lr_mode = zp->z_phys->zp_mode;
+	rec->lr_uid = zp->z_phys->zp_uid;
+	rec->lr_gid = zp->z_phys->zp_gid;
+	rec->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+	rec->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	dzp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_remove() logs a TX_REMOVE or TX_RMDIR of 'name' in directory dzp.
+ */
+uint64_t
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *dzp, char *name)
+{
+	size_t namelen = strlen(name) + 1;	/* count the terminator too */
+	lr_remove_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + namelen);
+	rec = (lr_remove_t *)&itx->itx_lr;
+	bcopy(name, (char *)(rec + 1), namelen);
+	rec->lr_doid = dzp->z_id;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	dzp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_link() logs a TX_LINK: a new name in dzp for the existing zp.
+ */
+uint64_t
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *dzp, znode_t *zp, char *name)
+{
+	size_t namelen = strlen(name) + 1;	/* count the terminator too */
+	lr_link_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + namelen);
+	rec = (lr_link_t *)&itx->itx_lr;
+	bcopy(name, (char *)(rec + 1), namelen);
+	rec->lr_link_obj = zp->z_id;
+	rec->lr_doid = dzp->z_id;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	dzp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_symlink() logs a TX_SYMLINK; name, then link, trail the record.
+ */
+uint64_t
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *dzp, znode_t *zp, char *name, char *link)
+{
+	size_t namelen = strlen(name) + 1;	/* both lengths count the */
+	size_t linklen = strlen(link) + 1;	/* terminating NUL */
+	lr_create_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + namelen + linklen);
+	rec = (lr_create_t *)&itx->itx_lr;
+	bcopy(name, (char *)(rec + 1), namelen);
+	bcopy(link, (char *)(rec + 1) + namelen, linklen);
+	rec->lr_doid = dzp->z_id;
+	rec->lr_foid = zp->z_id;
+	rec->lr_gen = zp->z_phys->zp_gen;
+	rec->lr_mode = zp->z_phys->zp_mode;
+	rec->lr_uid = zp->z_phys->zp_uid;
+	rec->lr_gid = zp->z_phys->zp_gid;
+	rec->lr_crtime[0] = zp->z_phys->zp_crtime[0];
+	rec->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	dzp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_rename() logs a TX_RENAME; sname, then dname, trail the record.
+ */
+uint64_t
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+{
+	size_t srclen = strlen(sname) + 1;	/* both lengths count the */
+	size_t dstlen = strlen(dname) + 1;	/* terminating NUL */
+	lr_rename_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + srclen + dstlen);
+	rec = (lr_rename_t *)&itx->itx_lr;
+	bcopy(sname, (char *)(rec + 1), srclen);
+	bcopy(dname, (char *)(rec + 1) + srclen, dstlen);
+	rec->lr_sdoid = sdzp->z_id;
+	rec->lr_tdoid = tdzp->z_id;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	szp->z_last_itx = seqno;
+	tdzp->z_last_itx = seqno;
+	sdzp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions.
+ *
+ * Writes of at most zfs_immediate_write_sz bytes reserve room for their
+ * data in the log record.  Otherwise we flush the data out via dmu_sync().
+ */
+ssize_t zfs_immediate_write_sz = 65536;
+
+uint64_t
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, offset_t off, ssize_t len, int ioflag, uio_t *uio)
+{
+	lr_write_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+	int paylen;
+
+	if (zilog == NULL || zp->z_reap)
+		return (0);
+
+	/* Only small writes get space for their data in the record. */
+	paylen = (len <= zfs_immediate_write_sz ? len : 0);
+	itx = zil_itx_create(txtype, sizeof (*rec) + paylen);
+	rec = (lr_write_t *)&itx->itx_lr;
+	itx->itx_data_copied = 0;
+	if (paylen != 0 && (ioflag & FDSYNC)) {
+		/*
+		 * The data lands directly behind the lr_write_t.
+		 * copyin shouldn't fault as we've already successfully
+		 * copied it to a dmu buffer. However if it does we'll get
+		 * the data from the dmu later.
+		 */
+		if (xcopyin(uio->uio_iov->iov_base - len,
+		    (char *)(rec + 1), len) == 0)
+			itx->itx_data_copied = 1;
+	}
+
+	rec->lr_foid = zp->z_id;
+	rec->lr_offset = off;
+	rec->lr_length = len;
+	rec->lr_blkoff = 0;
+	BP_ZERO(&rec->lr_blkptr);
+	itx->itx_private = zp->z_zfsvfs;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_truncate() logs a TX_TRUNCATE of (off, len) on zp.
+ */
+uint64_t
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, uint64_t off, uint64_t len)
+{
+	lr_truncate_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL || zp->z_reap)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec));
+	rec = (lr_truncate_t *)&itx->itx_lr;
+	rec->lr_length = len;
+	rec->lr_offset = off;
+	rec->lr_foid = zp->z_id;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_setattr() logs a TX_SETATTR: the vattr values plus applied mask.
+ */
+uint64_t
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, vattr_t *vap, uint_t mask_applied)
+{
+	lr_setattr_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL || zp->z_reap)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec));
+	rec = (lr_setattr_t *)&itx->itx_lr;
+	rec->lr_foid = zp->z_id;
+	rec->lr_mask = (uint64_t)mask_applied;
+	rec->lr_size = (uint64_t)vap->va_size;
+	rec->lr_mode = (uint64_t)vap->va_mode;
+	rec->lr_gid = (uint64_t)vap->va_gid;
+	rec->lr_uid = (uint64_t)vap->va_uid;
+	ZFS_TIME_ENCODE(&vap->va_mtime, rec->lr_mtime);
+	ZFS_TIME_ENCODE(&vap->va_atime, rec->lr_atime);
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	return (seqno);
+}
+
+/*
+ * zfs_log_acl() logs a TX_ACL; the aclcnt ace_t entries trail the record.
+ */
+uint64_t
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+	znode_t *zp, int aclcnt, ace_t *z_ace)
+{
+	lr_acl_t *rec;
+	uint64_t seqno;
+	itx_t *itx;
+
+	if (zilog == NULL || zp->z_reap)
+		return (0);
+
+	itx = zil_itx_create(txtype, sizeof (*rec) + aclcnt * sizeof (ace_t));
+	rec = (lr_acl_t *)&itx->itx_lr;
+	bcopy(z_ace, (ace_t *)(rec + 1), aclcnt * sizeof (ace_t));
+	rec->lr_aclcnt = (uint64_t)aclcnt;
+	rec->lr_foid = zp->z_id;
+
+	seqno = zil_itx_assign(zilog, itx, tx);
+	zp->z_last_itx = seqno;
+	return (seqno);
+}
diff --git a/usr/src/uts/common/fs/zfs/zfs_replay.c b/usr/src/uts/common/fs/zfs/zfs_replay.c
new file mode 100644
index 000000000000..cd5a3848cb7a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+	bzero(vap, sizeof (*vap));	/* fields not set below stay zero */
+	vap->va_nodeid = nodeid;
+	vap->va_rdev = (dev_t)rdev;
+	vap->va_gid = (gid_t)gid;
+	vap->va_uid = (uid_t)uid;
+	vap->va_mode = mode & MODEMASK;	/* mode is split into bits ... */
+	vap->va_type = IFTOVT(mode);	/* ... and vnode type */
+	vap->va_mask = (uint_t)mask;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
+{
+	return (ENOTSUP);	/* fills slot 0 of zfs_replay_vector */
+}
+
+static int
+zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap)
+{
+	char *entry = (char *)(lr + 1);	/* name trails the lr_create_t */
+	char *target;			/* symlink contents trail the name */
+	vnode_t *newvp = NULL;
+	znode_t *dir;
+	vattr_t attrs;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir);
+	if (rc != 0)
+		return (rc);
+
+	zfs_init_vattr(&attrs, AT_TYPE | AT_MODE | AT_UID | AT_GID,
+	    lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+	/*
+	 * Every flavor of create (create, mkdir, mkxattrdir, symlink)
+	 * ends up in zfs_mknode(), which assigns the creation time and
+	 * generation number.  The generic VOP_CREATE() has neither
+	 * concept, so the values ride along in the vattr's otherwise
+	 * unused va_ctime and va_nblocks fields.
+	 */
+	attrs.va_nblocks = lr->lr_gen;
+	ZFS_TIME_DECODE(&attrs.va_ctime, lr->lr_crtime);
+
+	switch ((int)lr->lr_common.lrc_txtype) {
+	case TX_CREATE:
+		rc = VOP_CREATE(ZTOV(dir), entry, &attrs, 0, 0, &newvp, kcred, 0);
+		break;
+	case TX_MKDIR:
+		rc = VOP_MKDIR(ZTOV(dir), entry, &attrs, &newvp, kcred);
+		break;
+	case TX_MKXATTR:
+		rc = zfs_make_xattrdir(dir, &attrs, &newvp, kcred);
+		break;
+	case TX_SYMLINK:
+		target = entry + strlen(entry) + 1;
+		rc = VOP_SYMLINK(ZTOV(dir), entry, &attrs, target, kcred);
+		break;
+	default:
+		rc = ENOTSUP;
+	}
+
+	if (rc == 0 && newvp != NULL)
+		VN_RELE(newvp);
+
+	VN_RELE(ZTOV(dir));
+
+	return (rc);
+}
+
+static int
+zfs_replay_remove(zfsvfs_t *zfsvfs, lr_remove_t *lr, boolean_t byteswap)
+{
+	char *entry = (char *)(lr + 1);	/* name trails the lr_remove_t */
+	znode_t *dir;
+	int rc, txtype;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir);
+	if (rc != 0)
+		return (rc);
+
+	/* One handler serves both removal record types. */
+	txtype = (int)lr->lr_common.lrc_txtype;
+	if (txtype == TX_REMOVE)
+		rc = VOP_REMOVE(ZTOV(dir), entry, kcred);
+	else if (txtype == TX_RMDIR)
+		rc = VOP_RMDIR(ZTOV(dir), entry, NULL, kcred);
+	else
+		rc = ENOTSUP;
+
+	/* Drop the hold taken by zfs_zget(). */
+	VN_RELE(ZTOV(dir));
+
+	return (rc);
+}
+
+static int
+zfs_replay_link(zfsvfs_t *zfsvfs, lr_link_t *lr, boolean_t byteswap)
+{
+	char *entry = (char *)(lr + 1);	/* name trails the lr_link_t */
+	znode_t *dir, *target;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Look up the directory first, then the object being linked. */
+	rc = zfs_zget(zfsvfs, lr->lr_doid, &dir);
+	if (rc != 0)
+		return (rc);
+
+	rc = zfs_zget(zfsvfs, lr->lr_link_obj, &target);
+	if (rc == 0) {
+		rc = VOP_LINK(ZTOV(dir), ZTOV(target), entry, kcred);
+		VN_RELE(ZTOV(target));
+	}
+
+	VN_RELE(ZTOV(dir));
+
+	return (rc);
+}
+
+static int
+zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, boolean_t byteswap)
+{
+	char *from = (char *)(lr + 1);	/* both names trail the lr_rename_t */
+	char *to = from + strlen(from) + 1;
+	znode_t *srcdir, *dstdir;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_sdoid, &srcdir);
+	if (rc != 0)
+		return (rc);
+
+	/* The rename is attempted only if the target directory is found. */
+	rc = zfs_zget(zfsvfs, lr->lr_tdoid, &dstdir);
+	if (rc == 0) {
+		rc = VOP_RENAME(ZTOV(srcdir), from, ZTOV(dstdir), to, kcred);
+		VN_RELE(ZTOV(dstdir));
+	}
+
+	VN_RELE(ZTOV(srcdir));
+
+	return (rc);
+}
+
+static int
+zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
+{
+	char *payload = (char *)(lr + 1);	/* data trails the lr_write_t */
+	ssize_t resid;				/* filled in but not examined */
+	znode_t *file;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &file);
+	if (rc != 0)
+		return (rc);
+
+	rc = vn_rdwr(UIO_WRITE, ZTOV(file), payload, lr->lr_length,
+	    lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+	VN_RELE(ZTOV(file));
+
+	return (rc);
+}
+
+static int
+zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
+{
+	flock64_t range;
+	znode_t *file;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &file);
+	if (rc != 0)
+		return (rc);
+
+	/* Describe the logged (offset, length) range for F_FREESP. */
+	bzero(&range, sizeof (range));
+	range.l_len = lr->lr_length;
+	range.l_start = lr->lr_offset;
+	range.l_whence = 0;
+	range.l_type = F_WRLCK;
+
+	rc = VOP_SPACE(ZTOV(file), F_FREESP, &range, FWRITE | FOFFMAX,
+	    lr->lr_offset, kcred, NULL);
+	VN_RELE(ZTOV(file));
+	return (rc);
+}
+
+static int
+zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap)
+{
+	vattr_t attrs;
+	znode_t *file;
+	int rc;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &file);
+	if (rc != 0)
+		return (rc);
+
+	/* Rebuild the vattr from the record; no rdev is logged, so pass 0. */
+	zfs_init_vattr(&attrs, lr->lr_mask, lr->lr_mode,
+	    lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+	ZFS_TIME_DECODE(&attrs.va_mtime, lr->lr_mtime);
+	ZFS_TIME_DECODE(&attrs.va_atime, lr->lr_atime);
+	attrs.va_size = lr->lr_size;
+
+	rc = VOP_SETATTR(ZTOV(file), &attrs, 0, kcred, NULL);
+	VN_RELE(ZTOV(file));
+
+	return (rc);
+}
+
+static int
+zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap)
+{
+	ace_t *entries = (ace_t *)(lr + 1);	/* aces trail the lr_acl_t */
+	vsecattr_t secattr;
+	znode_t *file;
+	int rc;
+
+	if (byteswap) {
+		/* Swap the header first: lr_aclcnt sizes the ace swap. */
+		byteswap_uint64_array(lr, sizeof (*lr));
+		zfs_ace_byteswap(entries, lr->lr_aclcnt);
+	}
+
+	rc = zfs_zget(zfsvfs, lr->lr_foid, &file);
+	if (rc != 0)
+		return (rc);
+
+	bzero(&secattr, sizeof (secattr));
+	secattr.vsa_aclentp = entries;
+	secattr.vsa_aclcnt = lr->lr_aclcnt;
+	secattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+
+	rc = VOP_SETSECATTR(ZTOV(file), &secattr, 0, kcred);
+	VN_RELE(ZTOV(file));
+	return (rc);
+}
+
+/*
+ * Callback vector for replaying records, indexed by transaction type
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+	zfs_replay_error,	/* 0 no such transaction type */
+	zfs_replay_create,	/* TX_CREATE */
+	zfs_replay_create,	/* TX_MKDIR */
+	zfs_replay_create,	/* TX_MKXATTR */
+	zfs_replay_create,	/* TX_SYMLINK */
+	zfs_replay_remove,	/* TX_REMOVE */
+	zfs_replay_remove,	/* TX_RMDIR */
+	zfs_replay_link,	/* TX_LINK */
+	zfs_replay_rename,	/* TX_RENAME */
+	zfs_replay_write,	/* TX_WRITE */
+	zfs_replay_truncate,	/* TX_TRUNCATE */
+	zfs_replay_setattr,	/* TX_SETATTR */
+	zfs_replay_acl,		/* TX_ACL */
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..502bcf39bf9e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -0,0 +1,1072 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_znode.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/varargs.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/mkdev.h>
+#include <sys/modctl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+
+int zfsfstype;
+vfsops_t *zfs_vfsops = NULL;
+static major_t	zfs_major;
+static minor_t zfs_minor;
+static kmutex_t	zfs_dev_mtx;
+
+static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
+static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
+static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
+static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
+static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
+static void zfs_freevfs(vfs_t *vfsp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+
+static const fs_operation_def_t zfs_vfsops_template[] = {
+	VFSNAME_MOUNT, zfs_mount,
+	VFSNAME_UNMOUNT, zfs_umount,
+	VFSNAME_ROOT, zfs_root,
+	VFSNAME_STATVFS, zfs_statvfs,
+	VFSNAME_SYNC, (fs_generic_func_p) zfs_sync,	/* zfs_sync: below */
+	VFSNAME_VGET, zfs_vget,
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
+	NULL, NULL	/* presumably the end-of-table marker -- TODO confirm */
+};
+
+static const fs_operation_def_t zfs_vfsops_eio_template[] = {
+	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs, /* the only op */
+	NULL, NULL	/* presumably the end-of-table marker -- TODO confirm */
+};
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t	zfs_active_fs_count = 0;
+
+static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
+static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
+
+static mntopt_t mntopts[] = {	/* second field: options this one cancels */
+	{ MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL },
+	{ MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL },
+	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
+};
+
+static mntopts_t zfs_mntopts = {
+	sizeof (mntopts) / sizeof (mntopt_t),	/* entry count of mntopts[] */
+	mntopts
+};
+
+/*ARGSUSED*/
+int
+zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs;
+
+	/*
+	 * Never sync during panic: data integrity is job one, and a
+	 * compromised kernel must not write to the storage pool.
+	 */
+	if (panicstr)
+		return (0);
+
+	/*
+	 * fsflush() uses SYNC_ATTR to force old filesystems like UFS to
+	 * sync metadata, which they would otherwise cache indefinitely.
+	 * Semantically, the sync only has to be initiated, and the DMU
+	 * syncs out txgs frequently, so there's nothing to do.
+	 */
+	if (flag & SYNC_ATTR)
+		return (0);
+
+	if (vfsp == NULL) {
+		/*
+		 * Sync all ZFS filesystems; this is what sync(1M) does.
+		 * Unlike other filesystems, ZFS honors the request by
+		 * waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+		return (0);
+	}
+
+	/* Sync a specific filesystem. */
+	zfsvfs = vfsp->vfs_data;
+	ZFS_ENTER(zfsvfs);
+	if (zfsvfs->z_log == NULL)
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+	else
+		zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC);
+	ZFS_EXIT(zfsvfs);
+
+	return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	if (newval != TRUE) {
+		fs->z_atime = FALSE;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_ATIME);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+	} else {
+		fs->z_atime = TRUE;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_NOATIME);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_ATIME, NULL, 0);
+	}
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	/* Out of range or not a power of two: fall back to the maximum. */
+	if (!(ISP2(newval) && newval >= SPA_MINBLOCKSIZE &&
+	    newval <= SPA_MAXBLOCKSIZE))
+		newval = SPA_MAXBLOCKSIZE;
+	fs->z_vfs->vfs_bsize = newval;
+	fs->z_max_blksz = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	if (newval == 0) {
+		/* XXX locking on vfs_flag? */
+		fs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_RO);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_RW, NULL, 0);
+		(void) zfs_delete_thread_target(fs, 1);
+	} else {
+		/* XXX locking on vfs_flag? */
+		fs->z_vfs->vfs_flag |= VFS_RDONLY;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_RW);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_RO, NULL, 0);
+		(void) zfs_delete_thread_target(fs, 0);
+	}
+}
+
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	if (newval != FALSE) {
+		fs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_NODEVICES);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_DEVICES, NULL, 0);
+	} else {
+		fs->z_vfs->vfs_flag |= VFS_NODEVICES;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_DEVICES);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
+	}
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	if (newval != FALSE) {
+		fs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_NOSETUID);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_SETUID, NULL, 0);
+	} else {
+		fs->z_vfs->vfs_flag |= VFS_NOSETUID;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_SETUID);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+	}
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *fs = arg;
+
+	if (newval != FALSE) {
+		fs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_NOEXEC);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_EXEC, NULL, 0);
+	} else {
+		fs->z_vfs->vfs_flag |= VFS_NOEXEC;
+		vfs_clearmntopt(fs->z_vfs, MNTOPT_EXEC);
+		vfs_setmntopt(fs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+	}
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	/* arg is the zfsvfs_t handed over when the callback was registered */
+	zfsvfs_t *fs = arg;
+	fs->z_show_ctldir = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	/* arg is the zfsvfs_t handed over when the callback was registered */
+	zfsvfs_t *fs = arg;
+	fs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	/* arg is the zfsvfs_t handed over when the callback was registered */
+	zfsvfs_t *fs = arg;
+	fs->z_acl_inherit = newval;
+}
+
+/*
+ * VFS mount entry point.  With MS_REMOUNT, only re-applies the ro/rw,
+ * devices, setuid and exec options to the existing mount.  Otherwise
+ * opens the objset named by uap->spec under a newly chosen dev_t and,
+ * unless it is a snapshot, replays the intent log and registers the
+ * property callbacks.  Returns 0 on success, otherwise an errno value.
+ */
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = NULL;
+	znode_t		*zp = NULL;
+	vnode_t		*vp = NULL;
+	objset_t	*os = NULL;
+	struct dsl_dataset *ds;
+	char		*osname;
+	uint64_t	readonly, recordsize;
+	pathname_t	spn;
+	dev_t		mount_dev;
+	major_t		new_major;
+	int		mode;
+	int		error = 0;
+	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
+				UIO_SYSSPACE : UIO_USERSPACE;
+	int		canwrite;
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	mutex_enter(&mvp->v_lock);
+	if ((uap->flags & MS_REMOUNT) == 0 &&
+	    (uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(&mvp->v_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&mvp->v_lock);
+
+	/*
+	 * ZFS does not support passing unparsed data in via MS_DATA.
+	 * Users should use the MS_OPTIONSTR interface; this means
+	 * that all option parsing is already done and the options struct
+	 * can be interrogated.
+	 */
+	if ((uap->flags & MS_DATA) && uap->datalen > 0)
+		return (EINVAL);
+
+	/*
+	 * When doing a remount, we simply refresh our temporary properties
+	 * according to those options set in the current VFS options.
+	 */
+	if (uap->flags & MS_REMOUNT) {
+		zfsvfs = vfsp->vfs_data;
+
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+			readonly_changed_cb(zfsvfs, B_TRUE);
+		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			if (dmu_objset_is_snapshot(zfsvfs->z_os))
+				return (EROFS);
+			readonly_changed_cb(zfsvfs, B_FALSE);
+		}
+
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices_changed_cb(zfsvfs, B_FALSE);
+			setuid_changed_cb(zfsvfs, B_FALSE);
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+				devices_changed_cb(zfsvfs, B_TRUE);
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_FALSE);
+			else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+				setuid_changed_cb(zfsvfs, B_TRUE);
+		}
+
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+			exec_changed_cb(zfsvfs, B_FALSE);
+		else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+			exec_changed_cb(zfsvfs, B_TRUE);
+
+		return (0);
+	}
+
+	/*
+	 * Get the objset name (the "special" mount argument).
+	 */
+	if (error = pn_get(uap->spec, fromspace, &spn))
+		return (error);
+
+	osname = spn.pn_path;
+
+	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+		goto out;
+
+	/*
+	 * Refuse to mount a filesystem if we are in a local zone and the
+	 * dataset is not visible.
+	 */
+	if (!INGLOBALZONE(curproc) &&
+	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+		error = EPERM;
+		goto out;
+	}
+
+	/*
+	 * Initialize the zfs-specific filesystem structure.
+	 * Should probably make this a kmem cache, shuffle fields,
+	 * and just bzero upto z_hold_mtx[].
+	 */
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	zfsvfs->z_vfs = vfsp;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_assign = TXG_NOWAIT;
+	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = VISIBLE;
+
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
+
+	/*
+	 * Initialize the generic filesystem structure.
+	 */
+	vfsp->vfs_bcount = 0;
+	vfsp->vfs_data = NULL;
+
+	/*
+	 * Create a unique device for the mount.
+	 */
+	do {
+		ASSERT3U(zfs_minor, <=, MAXMIN32);
+		int start = zfs_minor;
+		do {
+			mutex_enter(&zfs_dev_mtx);
+			zfs_minor++;
+			if (zfs_minor > MAXMIN32)
+				zfs_minor = 0;
+			mount_dev = makedevice(zfs_major, zfs_minor);
+			mutex_exit(&zfs_dev_mtx);
+		} while (vfs_devismounted(mount_dev) && zfs_minor != start);
+		if (zfs_minor == start) {
+			/*
+			 * We are using all ~262,000 minor numbers
+			 * for the current major number.  Create a
+			 * new major number.
+			 */
+			if ((new_major = getudev()) == (major_t)-1) {
+				cmn_err(CE_WARN,
+				    "zfs_mount: Can't get unique"
+				    " major device number.");
+				error = ENODEV;
+				goto out;
+			}
+			mutex_enter(&zfs_dev_mtx);
+			zfs_major = new_major;
+			zfs_minor = 0;
+			mutex_exit(&zfs_dev_mtx);
+		} else {
+			break;
+		}
+		/* CONSTANTCONDITION */
+	} while (1);
+
+	ASSERT(vfs_devismounted(mount_dev) == 0);
+
+	if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
+		recordsize = SPA_MAXBLOCKSIZE;
+
+	vfsp->vfs_dev = mount_dev;
+	vfsp->vfs_fstype = zfsfstype;
+	vfsp->vfs_bsize = recordsize;
+	vfsp->vfs_flag |= VFS_NOTRUNC;
+	vfsp->vfs_data = zfsvfs;
+
+	error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
+	if (error)
+		goto out;
+
+	if (readonly)
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+	else
+		mode = DS_MODE_PRIMARY;
+
+	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+	if (error == EROFS) {
+		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
+		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
+		    &zfsvfs->z_os);
+	}
+	os = zfsvfs->z_os;
+
+	if (error)
+		goto out;
+
+	if (error = zfs_init_fs(zfsvfs, &zp, cr))
+		goto out;
+
+	if (dmu_objset_is_snapshot(os)) {
+		ASSERT(mode & DS_MODE_READONLY);
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		zfsvfs->z_issnap = B_TRUE;
+	} else {
+		int do_readonly = FALSE, readonly;
+		int do_setuid = FALSE, setuid;
+		int do_exec = FALSE, exec;
+		int do_devices = FALSE, devices;
+
+		/*
+		 * Start a delete thread running.
+		 */
+		(void) zfs_delete_thread_target(zfsvfs, 1);
+
+		/*
+		 * Parse and replay the intent log.
+		 */
+		zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
+		    (void (*)(void *))zfs_delete_wait_empty);
+
+		if (!zil_disable)
+			zfsvfs->z_log = zil_open(os, zfs_get_data);
+
+		/*
+		 * The act of registering our callbacks will destroy any mount
+		 * options we may have.  In order to enable temporary overrides
+		 * of mount options, we stash away the current values and
+		 * restore them after we register the callbacks.
+		 */
+		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+			readonly = B_TRUE;
+			do_readonly = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+			readonly = B_FALSE;
+			do_readonly = B_TRUE;
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+			devices = B_FALSE;
+			setuid = B_FALSE;
+			do_devices = B_TRUE;
+			do_setuid = B_TRUE;
+		} else {
+			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
+				devices = B_FALSE;
+				do_devices = B_TRUE;
+			} else if (vfs_optionisset(vfsp,
+			    MNTOPT_DEVICES, NULL)) {
+				devices = B_TRUE;
+				do_devices = B_TRUE;
+			}
+
+			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+				setuid = B_FALSE;
+				do_setuid = B_TRUE;
+			} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+				setuid = B_TRUE;
+				do_setuid = B_TRUE;
+			}
+		}
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+			exec = B_FALSE;
+			do_exec = B_TRUE;
+		} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+			exec = B_TRUE;
+			do_exec = B_TRUE;
+		}
+
+		/*
+		 * Register property callbacks.
+		 */
+		ds = dmu_objset_ds(os);
+		VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+		VERIFY(dsl_prop_register(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+
+		/*
+		 * Invoke our callbacks to restore temporary mount options.
+		 */
+		if (do_readonly)
+			readonly_changed_cb(zfsvfs, readonly);
+		if (do_setuid)
+			setuid_changed_cb(zfsvfs, setuid);
+		if (do_exec)
+			exec_changed_cb(zfsvfs, exec);
+		if (do_devices)
+			devices_changed_cb(zfsvfs, devices);
+	}
+
+	vp = ZTOV(zp);
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+out:
+	if (error) {
+		/* vp is only assigned on the success path; go through zp. */
+		if (zp)
+			VN_RELE(ZTOV(zp));
+
+		if (zfsvfs) {
+			if (os)
+				dmu_objset_close(os);
+			kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		}
+	} else {
+		atomic_add_32(&zfs_active_fs_count, 1);
+		VN_RELE(vp);
+	}
+
+	pn_free(&spn);
+	return (error);
+}
+
+/*
+ * Report space and object usage for the filesystem mounted at vfsp in
+ * *statp.  Once past ZFS_ENTER() this always returns 0.
+ */
+static int
+zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	dmu_objset_stats_t os_stats;
+	dev32_t compressed_dev;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Constant fields first.  f_fstr gives us all of 32 characters
+	 * for a filesystem-specific string; until there is something
+	 * useful to provide it is simply zeroed.
+	 */
+	statp->f_flag = 0;
+	statp->f_namemax = ZFS_MAXNAMELEN;
+	bzero(statp->f_fstr, sizeof (statp->f_fstr));
+
+	/*
+	 * Identify ourselves: the compressed device number is the fsid
+	 * and the base type is our name in the vfssw table (zfs).
+	 */
+	(void) cmpldev(&compressed_dev, vfsp->vfs_dev);
+	statp->f_fsid = compressed_dev;
+	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
+
+	dmu_objset_stats(zfsvfs->z_os, &os_stats);
+
+	/*
+	 * A storage pool mixes block sizes, so f_frsize is the smallest
+	 * block size we support and f_bsize is this filesystem's maximum.
+	 */
+	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
+	statp->f_bsize = zfsvfs->z_max_blksz;
+
+	/*
+	 * Block totals are expressed in f_frsize ("fragment") units.
+	 * There is no root reservation, so f_bavail equals f_bfree.
+	 */
+	statp->f_bfree = os_stats.dds_available >> SPA_MINBLOCKSHIFT;
+	statp->f_bavail = statp->f_bfree;
+	statp->f_blocks = (os_stats.dds_space_refd +
+	    os_stats.dds_available) >> SPA_MINBLOCKSHIFT;
+
+	/*
+	 * statvfs() assumes static metadata, but ZFS doesn't preallocate
+	 * files.  So we report:
+	 *   f_ffree  - the smaller of the objects still available and
+	 *		the free blocks (each object takes at least one);
+	 *   f_favail - the same, as there is no "root reservation";
+	 *   f_files  - f_ffree plus the number of objects in use.
+	 */
+	statp->f_ffree = MIN(os_stats.dds_objects_avail, statp->f_bfree);
+	statp->f_favail = statp->f_ffree;
+	statp->f_files = statp->f_ffree + os_stats.dds_objects_used;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/* Return the vnode of this filesystem's root znode (z_root) in *vpp. */
+static int
+zfs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *zp;
+	int err;
+
+	ZFS_ENTER(zfsvfs);
+	err = zfs_zget(zfsvfs, zfsvfs->z_root, &zp);
+	if (!err)
+		*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+
+	return (err);
+}
+
+/*
+ * Unmount the filesystem.  MS_FORCE waits for threads inside zfs to
+ * drain; otherwise EBUSY is returned while vnodes are still referenced.
+ */
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	int ret;
+
+	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+		return (ret);
+
+	/*
+	 * Snapshots mounted under .zfs must be unmounted before the dataset.
+	 */
+	if (zfsvfs->z_ctldir != NULL &&
+	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+		return (ret);
+
+	if (fflag & MS_FORCE) {
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		zfsvfs->z_unmounted1 = B_TRUE;
+
+		/*
+		 * Wait for all zfs threads to leave zfs.  Taking a rwlock
+		 * as reader in every vop and as writer here doesn't work:
+		 * zfs can re-enter itself, so it is too easy to enter as
+		 * reader more than once, which can deadlock if there is an
+		 * intervening rw_enter as writer.  So a count of threads in
+		 * the file system (z_op_cnt) is used instead.  A polling
+		 * loop on z_op_cnt may seem inefficient, but
+		 * - this saves all threads on exit from having to grab a
+		 *   mutex in order to cv_signal
+		 * - only occurs on forced unmount in the rare case when
+		 *   there are outstanding threads within the file system.
+		 */
+		while (zfsvfs->z_op_cnt) {
+			delay(1);
+		}
+
+		zfs_objset_close(zfsvfs);
+
+		return (0);
+	}
+
+	zfs_zcache_flush(zfsvfs);
+
+	/* Stop all delete threads. */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * Check the number of active vnodes in the file system.  Our count
+	 * is kept in the vfs structure, off by 1 for the hold on the vfs
+	 * structure itself.  The '.zfs' directory holds a reference of its
+	 * own, and active references beneath it are reflected in the vnode
+	 * count.  On EBUSY a delete thread is started again first, unless
+	 * the filesystem is read-only.
+	 */
+	if (zfsvfs->z_ctldir == NULL) {
+		if (vfsp->vfs_count > 1) {
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	} else {
+		if (vfsp->vfs_count > 2 ||
+		    (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) {
+			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
+				(void) zfs_delete_thread_target(zfsvfs, 1);
+			return (EBUSY);
+		}
+	}
+
+	vfsp->vfs_flag |= VFS_UNMOUNTED;
+	zfs_objset_close(zfsvfs);
+
+	/*
+	 * We can now safely destroy the '.zfs' directory node, which will
+	 * release its hold on the vfs_t.
+	 */
+	if (zfsvfs->z_ctldir != NULL)
+		zfsctl_destroy(zfsvfs);
+
+	return (0);
+}
+
+/*
+ * Map the file identifier in fidp to a vnode, returned in *vpp.  Returns
+ * EINVAL for a bad FID length, unknown objset or generation mismatch.
+ */
+static int
+zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
+{
+	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
+	znode_t		*zp;
+	uint64_t	object = 0, fid_gen = 0;
+	uint64_t	gen_mask, zp_gen;
+	int		i, err;
+
+	*vpp = NULL;
+	ZFS_ENTER(zfsvfs);
+
+	if (fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
+		uint64_t	objsetid = 0, setgen = 0;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+		/* NOTE: setgen is decoded but never compared to anything. */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		/* The lookup below may hand us a different zfsvfs. */
+		ZFS_EXIT(zfsvfs);
+		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+		if (err)
+			return (EINVAL);
+		ZFS_ENTER(zfsvfs);
+	}
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t	*zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/* A zero fid_gen means we are in the .zfs control directories */
+	if (fid_gen == 0 &&
+	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+		*vpp = zfsvfs->z_ctldir;
+		ASSERT(*vpp != NULL);
+		if (object == ZFSCTL_INO_SNAPDIR) {
+			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
+			    0, NULL, NULL) == 0);
+		} else {
+			VN_HOLD(*vpp);
+		}
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* i is still sizeof (zf_gen): mask znode gen to the FID's width. */
+	gen_mask = -1ULL >> (64 - 8 * i);
+	dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
+	if (err = zfs_zget(zfsvfs, object, &zp)) {
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+	zp_gen = zp->z_phys->zp_gen & gen_mask;
+	if (zp_gen == 0)
+		zp_gen = 1;
+	if (zp->z_reap || zp_gen != fid_gen) {
+		dprintf("znode gen (%llu) != fid gen (%llu)\n",
+		    zp_gen, fid_gen);
+		VN_RELE(ZTOV(zp));
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Tear down zfsvfs's use of its objset at unmount time: stop the delete
+ * threads, drop the dbuf holds kept by znodes, unregister the property
+ * callbacks, wait for the pool to sync, close the intent log and finally
+ * close the objset itself.  zfs_inactive threads are held off with
+ * z_um_lock until z_unmounted2 has been set.
+ */
+static void
+zfs_objset_close(zfsvfs_t *zfsvfs)
+{
+	zfs_delete_t	*zd = &zfsvfs->z_delete_head;
+	znode_t		*zp, *nextzp;
+	objset_t	*os = zfsvfs->z_os;
+	struct dsl_dataset *ds;
+
+	/* Stop all delete threads. */
+	(void) zfs_delete_thread_target(zfsvfs, 0);
+
+	/*
+	 * For forced unmount, at this point all vops except zfs_inactive
+	 * are erroring EIO. We need to now suspend zfs_inactive threads
+	 * while we are freeing dbufs before switching zfs_inactive
+	 * to use behaviour without an objset.
+	 */
+	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
+
+	zfs_zcache_flush(zfsvfs);
+
+	/*
+	 * Release all delete in progress znodes.
+	 * They will be processed when the file system remounts.
+	 */
+	mutex_enter(&zd->z_mutex);
+	while (zp = list_head(&zd->z_znodes)) {
+		list_remove(&zd->z_znodes, zp);
+		zp->z_dbuf_held = 0;
+		dmu_buf_rele(zp->z_dbuf);
+	}
+	mutex_exit(&zd->z_mutex);
+
+	/*
+	 * Release all holds on dbufs.
+	 * Note, although we have stopped all other vop threads and
+	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
+	 * which can zfs_znode_free() the znode.
+	 * So we lock z_all_znodes; search the list for a held
+	 * dbuf; drop the lock (we know zp can't disappear if we hold
+	 * a dbuf lock); then regrab the lock and restart from the head.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
+		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+		if (zp->z_dbuf_held) {
+			/* dbufs should only be held when force unmounting */
+			zp->z_dbuf_held = 0;
+			mutex_exit(&zfsvfs->z_znodes_lock);
+			dmu_buf_rele(zp->z_dbuf);
+			/* Start again */
+			mutex_enter(&zfsvfs->z_znodes_lock);
+			nextzp = list_head(&zfsvfs->z_all_znodes);
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/* Unregister the property callbacks (not used for snapshots). */
+	if (!dmu_objset_is_snapshot(os)) {
+		ds = dmu_objset_ds(os);
+
+		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
+		    zfsvfs) == 0);
+
+		VERIFY(dsl_prop_unregister(ds, "aclinherit",
+		    acl_inherit_changed_cb, zfsvfs) == 0);
+	}
+
+	/*
+	 * Make the dmu drop all its dbuf holds so that zfs_inactive
+	 * can then safely free znode/vnodes.
+	 */
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	/*
+	 * Switch zfs_inactive to behaviour without an objset.
+	 * It just tosses cached pages and frees the znode & vnode.
+	 * Then re-enable zfs_inactive threads in that new behaviour.
+	 */
+	zfsvfs->z_unmounted2 = B_TRUE;
+	rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */
+
+	/*
+	 * Close the zil. Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	/* Finally close the objset */
+	dmu_objset_close(os);
+}
+
+/*
+ * Free the zfsvfs_t hung off vfsp; one fewer ZFS filesystem is active.
+ */
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+	kmem_free(vfsp->vfs_data, sizeof (zfsvfs_t));
+	atomic_add_32(&zfs_active_fs_count, -1);
+}
+
+/*
+ * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
+ * so we can't safely do any non-idempotent initialization here.
+ * Leave that to zfs_init() and zfs_fini(), which are called
+ * from the module's _init() and _fini() entry points.
+ * Returns 0 or an errno; on failure the ops tables are torn down.
+ */
+/*ARGSUSED*/
+static int
+zfs_vfsinit(int fstype, char *name)
+{
+	int error;
+
+	zfsfstype = fstype;
+
+	/* Setup vfsops and vnodeops tables; give up if either fails. */
+	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
+	if (error != 0) {
+		cmn_err(CE_WARN, "zfs: bad vfs ops template");
+		return (error);
+	}
+
+	error = zfs_create_op_tables();
+	if (error) {
+		zfs_remove_op_tables();
+		cmn_err(CE_WARN, "zfs: bad vnode ops template");
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		return (error);
+	}
+
+	/*
+	 * Unique major number for all zfs mounts.  'error' is 0 by now,
+	 * so a failure here must supply its own errno.
+	 */
+	if ((zfs_major = getudev()) == (major_t)-1) {
+		cmn_err(CE_WARN,
+		    "zfs_vfsinit: Can't get unique device number.");
+		zfs_remove_op_tables();
+		(void) vfs_freevfsops_by_type(zfsfstype);
+		return (ENXIO);
+	}
+	zfs_minor = 0;
+	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	return (0);
+}
+
+/*
+ * Module-wide setup, called from the module's _init() entry point:
+ *
+ *	zfsctl_init()		.zfs directory structures
+ *	zfs_znode_init()	znode cache, vnode ops, etc...
+ */
+void
+zfs_init(void)
+{
+	zfsctl_init();
+
+	zfs_znode_init();
+}
+
+void
+zfs_fini(void)
+{
+	zfsctl_fini();		/* pairs with zfsctl_init() in zfs_init() */
+	zfs_znode_fini();	/* pairs with zfs_znode_init() in zfs_init() */
+}
+
+int
+zfs_busy(void)
+{
+	return (zfs_active_fs_count == 0 ? 0 : 1); /* 1 while mounts exist */
+}
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	MNTTYPE_ZFS,
+	zfs_vfsinit,		/* the VFS_INIT() routine defined above */
+	VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV,
+	&zfs_mntopts		/* presumably the mount option table */
+};
+
+struct modlfs zfs_modlfs = {
+	&mod_fsops, "ZFS filesystem version 1", &vfw	/* uses vfw above */
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..eb9964aa20c5
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -0,0 +1,3663 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <vm/seg_vn.h>
+#include <vm/pvn.h>
+#include <vm/as.h>
+#include <sys/mman.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/refcount.h>  /* temporary for debugging purposes */
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_ctldir.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work.  To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory.  The example below illustrates the following Big Rules:
+ *
+ *  (1) A check must be made in each zfs thread for a mounted file system.
+ *	This is done avoiding races using ZFS_ENTER(zfsvfs).
+ *	A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *
+ *  (2)	VN_RELE() should always be the last thing except for zil_commit()
+ *	and ZFS_EXIT(). This is for 3 reasons:
+ *	First, if it's the last reference, the vnode/znode
+ *	can be freed, so the zp may point to freed memory.  Second, the last
+ *	reference will call zfs_zinactive(), which may induce a lot of work --
+ *	pushing cached pages (which requires z_grow_lock) and syncing out
+ *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
+ *	which could deadlock the system if you were already holding one.
+ *
+ *  (3)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
+ *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
+ *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
+ *	This is critical because we don't want to block while holding locks.
+ *	Note, in particular, that if a lock is sometimes acquired before
+ *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
+ *	use a non-blocking assign can deadlock the system.  The scenario:
+ *
+ *	Thread A has grabbed a lock before calling dmu_tx_assign().
+ *	Thread B is in an already-assigned tx, and blocks for this lock.
+ *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ *	forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ *	then drop all locks, call txg_wait_open(), and try again.
+ *
+ *  (4)	If the operation succeeded, generate the intent log entry for it
+ *	before dropping locks.  This ensures that the ordering of events
+ *	in the intent log matches the order in which they actually occurred.
+ *
+ *  (5)	At the end of each vnode op, the DMU tx must always commit,
+ *	regardless of whether there were any errors.
+ *
+ *  (6)	After dropping all locks, invoke zil_commit(zilog, seq, ioflag)
+ *	to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ *	ZFS_ENTER(zfsvfs);		// exit if unmounted
+ * top:
+ *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
+ *	rw_enter(...);			// grab any other locks you need
+ *	tx = dmu_tx_create(...);	// get DMU tx
+ *	dmu_tx_hold_*();		// hold each object you might modify
+ *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
+ *	if (error) {
+ *		dmu_tx_abort(tx);	// abort DMU tx
+ *		rw_exit(...);		// drop locks
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
+ *		VN_RELE(...);		// release held vnodes
+ *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+ *			txg_wait_open(dmu_objset_pool(os), 0);
+ *			goto top;
+ *		}
+ *		ZFS_EXIT(zfsvfs);	// finished in zfs
+ *		return (error);		// really out of space
+ *	}
+ *	error = do_real_work();		// do whatever this VOP does
+ *	if (error == 0)
+ *		seq = zfs_log_*(...);	// on success, make ZIL entry
+ *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
+ *	rw_exit(...);			// drop locks
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
+ *	VN_RELE(...);			// release held vnodes
+ *	zil_commit(zilog, seq, ioflag);	// synchronous when necessary
+ *	ZFS_EXIT(zfsvfs);		// finished in zfs
+ *	return (error);			// done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+	return (0);	/* nothing is done at open time; always succeeds */
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+	pid_t pid = ddi_get_pid();
+
+	/* Drop the locks and shares this process holds on vp. */
+	cleanlocks(vp, pid, 0);
+	cleanshares(vp, pid);
+
+	return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and data
+ * (cmd == _FIO_SEEK_DATA).  "off" is in/out: the search starts at *off
+ * and the offset that was found is stored back there.
+ * Returns ENXIO if *off is at or beyond end of file, or if data was
+ * sought and none remains.  End of file counts as a (virtual) hole.
+ * Other dmu_offset_next() errors are passed through unchanged.
+ */
+static int
+zfs_holey(vnode_t *vp, int cmd, offset_t *off)
+{
+	znode_t	*zp = VTOZ(vp);
+	boolean_t want_hole = (cmd == _FIO_SEEK_HOLE) ? B_TRUE : B_FALSE;
+	uint64_t next = (uint64_t)*off;
+	uint64_t eof;
+	int err;
+
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	eof = zp->z_phys->zp_size;
+	if (next >= eof) {
+		rw_exit(&zp->z_grow_lock);
+		return (ENXIO);
+	}
+
+	err = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, want_hole, &next);
+	rw_exit(&zp->z_grow_lock);
+
+	if (err == ESRCH || next > eof) {
+		/*
+		 * Nothing more was found before end of file.  A hole
+		 * seeker gets the virtual hole at end of file; a data
+		 * seeker gets ENXIO.
+		 */
+		if (!want_hole)
+			return (ENXIO);
+		*off = eof;
+		return (0);
+	}
+
+	/* Leave *off alone rather than ever moving it backwards. */
+	if (next >= *off)
+		*off = next;
+
+	return (err);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
+    int *rvalp)
+{
+	zfsvfs_t *zfsvfs;
+	offset_t pos;
+	int err;
+
+	if (com == _FIOFFS)
+		return (zfs_sync(vp->v_vfsp, 0, cred));
+
+	if (com != _FIO_SEEK_DATA && com != _FIO_SEEK_HOLE)
+		return (ENOTTY);
+
+	/*
+	 * Hole/data seek: 'data' is the address of an offset_t that
+	 * is read as the starting point and rewritten with the result.
+	 */
+	if (ddi_copyin((void *)data, &pos, sizeof (pos), flag))
+		return (EFAULT);
+
+	zfsvfs = VTOZ(vp)->z_zfsvfs;
+	ZFS_ENTER(zfsvfs);
+	err = zfs_holey(vp, com, &pos);
+	ZFS_EXIT(zfsvfs);
+	if (err)
+		return (err);
+	if (ddi_copyout(&pos, (void *)data, sizeof (pos), flag))
+		return (EFAULT);
+	return (0);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  On write, if we
+ * find a memory mapped page we write to *both* the page and the dmu
+ * buffer; otherwise the data goes to the dmu alone.
+ *
+ * Writes nbytes from uio at file offset woff under tx, one piece of at
+ * most PAGESIZE at a time, stopping at the first error, which is
+ * returned (0 on success).
+ */
+static int
+mappedwrite(vnode_t *vp, uint64_t woff, int nbytes, uio_t *uio, dmu_tx_t *tx)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int64_t	start, off;
+	int len = nbytes;	/* bytes still to be written */
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;	/* offset within the first page */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
+		/*
+		 * We don't want a new page to "appear" in the middle of
+		 * the file update (because it may not get the write
+		 * update data), so we grab a lock to block
+		 * zfs_getpage().
+		 */
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			rw_exit(&zp->z_map_lock);
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va+off, bytes, UIO_WRITE, uio);
+			if (error == 0) {
+				dmu_write(zfsvfs->z_os, zp->z_id,
+				    woff, bytes, va+off, tx);
+			}
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, bytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		len -= bytes;
+		woff += bytes;
+		off = 0;	/* every later piece starts on a page boundary */
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages.  On read, we
+ * "read" preferentially from memory mapped pages, else we default to
+ * the dmu buffer data at addr.
+ *
+ * Copies nbytes out through uio in pieces of at most PAGESIZE,
+ * advancing addr in step, and stops at the first error, which is
+ * returned (0 on success).
+ */
+static int
+mappedread(vnode_t *vp, char *addr, int nbytes, uio_t *uio)
+{
+	int64_t	start, off, bytes;
+	int len = nbytes;	/* bytes still to be copied out */
+	int error = 0;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;	/* offset within the first page */
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+
+		bytes = MIN(PAGESIZE - off, len);
+		if (pp = page_lookup(vp, start, SE_SHARED)) {
+			caddr_t va;
+
+			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
+			error = uiomove(va + off, bytes, UIO_READ, uio);
+			ppmapout(va);
+			page_unlock(pp);
+		} else {
+			/* XXX use dmu_read here? */
+			error = uiomove(addr, bytes, UIO_READ, uio);
+		}
+		len -= bytes;
+		addr += bytes;
+		off = 0;	/* every later piece starts on a page boundary */
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+uint_t zfs_read_chunk_size = 1024 * 1024; /* Tunable: zfs_read() chunk cap */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ *	IN:	vp	- vnode of file to be read from.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
+ *		cr	- credentials of caller.
+ *		ct	- caller context, passed to chklock().
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Side Effects:
+ *	vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uint64_t	delta;
+	ssize_t		n, size, cnt, ndone;
+	int		error, i, numbufs;
+	dmu_buf_t	*dbp, **dbpp;
+
+	ZFS_ENTER(zfsvfs);
+
+	/* Validate file offset */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/* Fasttrack empty reads */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Check for region locks */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (error = chklock(vp, FREAD,
+		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	/*
+	 * If we're in FRSYNC mode, sync out this znode before reading it.
+	 */
+	zil_commit(zfsvfs->z_log, zp->z_last_itx, ioflag & FRSYNC);
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the read.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	/*
+	 * If we are reading past end-of-file we can skip
+	 * to the end; but we might still need to set atime.
+	 */
+	if (uio->uio_loffset >= zp->z_phys->zp_size) {
+		cnt = 0;
+		error = 0;
+		goto out;
+	}
+
+	cnt = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+
+	for (ndone = 0; ndone < cnt; ndone += zfs_read_chunk_size) {
+		ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
+		/*
+		 * Size this chunk by what is still owed to the caller
+		 * (cnt - ndone), so that the final chunk doesn't hold and
+		 * read dmu buffers beyond the requested range.
+		 */
+		n = MIN(zfs_read_chunk_size,
+		    zp->z_phys->zp_size - uio->uio_loffset);
+		n = MIN(n, cnt - ndone);
+		dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+		    uio->uio_loffset, n, &numbufs);
+		if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) {
+			dmu_buf_rele_array(dbpp, numbufs);
+			goto out;
+		}
+		/*
+		 * Compute the adjustment to align the dmu buffers
+		 * with the uio buffer.
+		 */
+		delta = uio->uio_loffset - dbpp[0]->db_offset;
+
+		for (i = 0; i < numbufs; i++) {
+			if (n < 0)
+				break;
+			dbp = dbpp[i];
+			size = dbp->db_size - delta;
+			/*
+			 * XXX -- this is correct, but may be suboptimal.
+			 * If the pages are all clean, we don't need to
+			 * go through mappedread().  Maybe the VMODSORT
+			 * stuff can help us here.
+			 */
+			if (vn_has_cached_data(vp)) {
+				error = mappedread(vp, (caddr_t)dbp->db_data +
+				    delta, (n < size ? n : size), uio);
+			} else {
+				error = uiomove((caddr_t)dbp->db_data + delta,
+				    (n < size ? n : size), UIO_READ, uio);
+			}
+			if (error) {
+				dmu_buf_rele_array(dbpp, numbufs);
+				goto out;
+			}
+			n -= dbp->db_size;
+			if (delta) {
+				n += delta;
+				delta = 0;
+			}
+		}
+		dmu_buf_rele_array(dbpp, numbufs);
+	}
+out:
+	rw_exit(&zp->z_grow_lock);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified.
+ * Any error will exit this routine as this is only a best
+ * attempt to get the pages resident. This is a copy of ufs_trans_touch().
+ */
+static void
+zfs_prefault_write(ssize_t n, struct uio *uio)
+{
+	struct iovec *iov;
+	ulong_t cnt, incr;	/* bytes left in this iovec; bytes to step */
+	caddr_t p;		/* address currently being touched */
+	uint8_t tmp;		/* scratch target; the value read is unused */
+
+	iov = uio->uio_iov;
+
+	while (n) {
+		cnt = MIN(iov->iov_len, n);
+		if (cnt == 0) {
+			/* empty iov entry */
+			iov++;
+			continue;
+		}
+		n -= cnt;
+		/*
+		 * touch each page in this segment.
+		 */
+		p = iov->iov_base;
+		while (cnt) {
+			switch (uio->uio_segflg) {
+			case UIO_USERSPACE:
+			case UIO_USERISPACE:
+				if (fuword8(p, &tmp))
+					return;	/* read failed: give up */
+				break;
+			case UIO_SYSSPACE:
+				if (kcopy(p, &tmp, 1))
+					return;	/* read failed: give up */
+				break;
+			}
+			incr = MIN(cnt, PAGESIZE);
+			p += incr;
+			cnt -= incr;
+		}
+		/*
+		 * touch the last byte in case it straddles a page.
+		 */
+		p--;
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+		case UIO_USERISPACE:
+			if (fuword8(p, &tmp))
+				return;
+			break;
+		case UIO_SYSSPACE:
+			if (kcopy(p, &tmp, 1))
+				return;
+			break;
+		}
+		iov++;	/* on to the next iovec */
+	}
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	vp	- vnode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- FAPPEND flag set if in append mode.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated if byte count > 0
+ *
+ * Note: zfs_write() holds z_append_lock across calls to txg_wait_open().
+ * It has to because of the semantics of FAPPEND.  The implication is that
+ * we must never grab z_append_lock while in an assigned tx.
+ */
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	rlim64_t	limit = uio->uio_llimit;
+	ssize_t		start_resid = uio->uio_resid;
+	ssize_t		tx_bytes;
+	uint64_t	end_size;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	offset_t	woff;
+	ssize_t		n, nbytes;
+	int		max_blksz = zfsvfs->z_max_blksz;
+	int		need_append_lock, error;
+	krw_t		grow_rw = RW_READER;
+
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	n = start_resid;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	if (n == 0)
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Pre-fault the pages to ensure slow (eg NFS) pages don't hold up txg
+	 */
+	zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio);
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	need_append_lock = ioflag & FAPPEND;
+	if (need_append_lock) {
+		rw_enter(&zp->z_append_lock, RW_WRITER);
+		woff = uio->uio_loffset = zp->z_phys->zp_size;
+	} else {
+		woff = uio->uio_loffset;
+		/*
+		 * Validate file offset
+		 */
+		if (woff < 0) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		/*
+		 * If this write could change the file length,
+		 * we need to synchronize with "appenders".
+		 */
+		if (woff < limit - n && woff + n > zp->z_phys->zp_size) {
+			need_append_lock = TRUE;
+			rw_enter(&zp->z_append_lock, RW_READER);
+		}
+	}
+
+	if (woff >= limit) {
+		error = EFBIG;
+		goto no_tx_done;
+	}
+
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/*
+	 * Check for region locks
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0)
+		goto no_tx_done;
+top:
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the write.
+	 */
+	rw_enter(&zp->z_grow_lock, grow_rw);
+
+	end_size = MAX(zp->z_phys->zp_size, woff + n);
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		goto no_tx_done;
+	}
+
+	if (end_size > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < max_blksz)) {
+		uint64_t new_blksz;
+		/*
+		 * This write will increase the file size beyond
+		 * the current block size so increase the block size.
+		 */
+		if (grow_rw == RW_READER && !rw_tryupgrade(&zp->z_grow_lock)) {
+			dmu_tx_commit(tx);
+			rw_exit(&zp->z_grow_lock);
+			grow_rw = RW_WRITER;
+			goto top;
+		}
+		if (zp->z_blksz > max_blksz) {
+			ASSERT(!ISP2(zp->z_blksz));
+			new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+		} else {
+			new_blksz = MIN(end_size, max_blksz);
+		}
+		error = zfs_grow_blocksize(zp, new_blksz, tx);
+		if (error) {
+			tx_bytes = 0;
+			goto tx_done;
+		}
+	}
+
+	if (grow_rw == RW_WRITER) {
+		rw_downgrade(&zp->z_grow_lock);
+		grow_rw = RW_READER;
+	}
+
+	/*
+	 * The file data does not fit in the znode "cache", so we
+	 * will be writing to the file block data buffers.
+	 * Each buffer will be written in a separate transaction;
+	 * this keeps the intent log records small and allows us
+	 * to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+		rw_enter(&zp->z_map_lock, RW_READER);
+
+		tx_bytes = uio->uio_resid;
+		if (vn_has_cached_data(vp)) {
+			rw_exit(&zp->z_map_lock);
+			error = mappedwrite(vp, woff, nbytes, uio, tx);
+		} else {
+			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
+			    woff, nbytes, uio, tx);
+			rw_exit(&zp->z_map_lock);
+		}
+		tx_bytes -= uio->uio_resid;
+
+		if (error) {
+			/* XXX - do we need to "clean up" the dmu buffer? */
+			break;
+		}
+
+		ASSERT(tx_bytes == nbytes);
+
+		n -= nbytes;
+		if (n <= 0)
+			break;
+
+		/*
+		 * We have more work ahead of us, so wrap up this transaction
+		 * and start another.  Exact same logic as tx_done below.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+		dmu_tx_commit(tx);
+
+		/* Pre-fault the next set of pages */
+		zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio);
+
+		/*
+		 * Start another transaction.
+		 */
+		woff = uio->uio_loffset;
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
+		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		if (error) {
+			dmu_tx_abort(tx);
+			rw_exit(&zp->z_grow_lock);
+			if (error == ERESTART &&
+			    zfsvfs->z_assign == TXG_NOWAIT) {
+				txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+				goto top;
+			}
+			goto no_tx_done;
+		}
+	}
+
+tx_done:
+
+	if (tx_bytes != 0) {
+		/*
+		 * Update the file size if it has changed; account
+		 * for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+			    uio->uio_loffset);
+		}
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, uio);
+	}
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_grow_lock);
+
+no_tx_done:
+
+	if (need_append_lock)
+		rw_exit(&zp->z_append_lock);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	zil_commit(zilog, seq, ioflag & (FSYNC | FDSYNC));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Fill in the payload of a TX_WRITE intent log record: either copy the
+ * user data in after the record header, or sync it and record a pointer.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr)
+{
+	zfsvfs_t	*zfsvfs = arg;
+	objset_t	*os = zfsvfs->z_os;
+	znode_t		*zp;
+	dmu_buf_t	*db;
+	uint64_t	off = lr->lr_offset;
+	int		dlen = lr->lr_length;	/* length of user data */
+	int		reclen = lr->lr_common.lrc_reclen;
+	int		err = 0;
+
+	ASSERT(dlen != 0);
+
+	/*
+	 * A file that is gone, being reaped, or truncated below the
+	 * logged offset has no data left to fetch.
+	 */
+	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+		return (ENOENT);
+	if (zp->z_reap || off >= zp->z_phys->zp_size) {
+		VN_RELE(ZTOV(zp));
+		return (ENOENT);
+	}
+
+	/*
+	 * Data too large for the record is synced and logged by block
+	 * pointer (indirect) so it is written only once; otherwise it
+	 * is copied in right after the lr_write_t header (immediate).
+	 */
+	if (sizeof (lr_write_t) + dlen > reclen) {	/* indirect write */
+		/*
+		 * z_grow_lock is taken as RW_WRITER because dmu_sync()
+		 * can't handle concurrent dbuf_dirty() (6313856).  A range
+		 * lock is expected to replace z_grow_lock and remove the
+		 * concurrency hit, but dmu_sync() should not have to rely
+		 * on its caller for MT safety in the first place.
+		 */
+		rw_enter(&zp->z_grow_lock, RW_WRITER);
+		txg_suspend(dmu_objset_pool(os));
+		err = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff,
+		    &lr->lr_blkptr, lr->lr_common.lrc_txg);
+		txg_resume(dmu_objset_pool(os));
+		rw_exit(&zp->z_grow_lock);
+	} else {					/* immediate write */
+		rw_enter(&zp->z_grow_lock, RW_READER);
+		db = dmu_buf_hold(os, lr->lr_foid, off);
+		dmu_buf_read(db);
+		bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen);
+		dmu_buf_rele(db);
+		rw_exit(&zp->z_grow_lock);
+	}
+	VN_RELE(ZTOV(zp));
+	return (err);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = VTOZ(vp)->z_zfsvfs;
+	int		rc;
+
+	/* Hand the requested mode to zfs_zaccess_rwx() for this znode. */
+	ZFS_ENTER(zfsvfs);
+	rc = zfs_zaccess_rwx(VTOZ(vp), mode, cr);
+	ZFS_EXIT(zfsvfs);
+	return (rc);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		pnp	- full pathname to lookup [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		rdir	- root directory vnode [UNUSED].
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vpp	- vnode of located entry; NULL if not found or if
+ *			  the hold taken here was dropped on error.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error;
+
+	ZFS_ENTER(zfsvfs);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 * If not, drop our hold and clear *vpp so the caller is
+		 * never handed a pointer to a vnode we already released.
+		 */
+		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
+			VN_RELE(*vpp);
+			*vpp = NULL;
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
+		/*
+		 * Convert device special files.  *vpp takes svp even when
+		 * specvp() fails, so it never names the released vnode.
+		 */
+		if (IS_DEVVP(*vpp)) {
+			vnode_t	*svp;
+
+			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+			VN_RELE(*vpp);
+			*vpp = svp;
+			if (svp == NULL)
+				error = ENOSYS;
+		}
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory.  If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error.  Return the vp of the created or trunc'd file.
+ *
+ *	IN:	dvp	- vnode of directory to put new file entry in.
+ *		name	- name of new file entry.
+ *		vap	- attributes of new file.
+ *		excl	- flag indicating exclusive or non-exclusive mode.
+ *		mode	- mode to open file with.
+ *		cr	- credentials of caller.
+ *		flag	- large file flag [UNUSED].
+ *
+ *	OUT:	vpp	- vnode of created or trunc'd entry.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated if new entry created
+ *	 vp - ctime|mtime always, atime if new
+ */
+/* ARGSUSED */
+static int
+zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
+    int mode, vnode_t **vpp, cred_t *cr, int flag)
+{
+	znode_t		*zp, *dzp = VTOZ(dvp);
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;	/* stays 0 unless something is logged */
+	objset_t	*os = zfsvfs->z_os;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		error;
+	uint64_t	zoid;
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	*vpp = NULL;
+
+	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
+		vap->va_mode &= ~VSVTX;
+
+	if (*name == '\0') {
+		/*
+		 * Null component name refers to the directory itself.
+		 */
+		VN_HOLD(dvp);
+		zp = dzp;
+		dl = NULL;
+		error = 0;
+	} else {
+		/* on success zp may come back held (NULL if no entry) */
+		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
+			if (strcmp(name, "..") == 0)
+				error = EISDIR;
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	zoid = zp ? zp->z_id : -1ULL;	/* -1ULL: no object yet */
+
+	if (zp == NULL) {
+		/*
+		 * Create a new file object and update the directory
+		 * to reference it.
+		 */
+		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+			goto out;
+		}
+
+		/*
+		 * We only support the creation of regular files in
+		 * extended attribute directories.
+		 */
+		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
+		    (vap->va_type != VREG)) {
+			error = EINVAL;
+			goto out;
+		}
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+		dmu_tx_hold_bonus(tx, dzp->z_id);
+		dmu_tx_hold_zap(tx, dzp->z_id, 1);
+		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, SPA_MAXBLOCKSIZE);
+		error = dmu_tx_assign(tx, zfsvfs->z_assign);
+		if (error) {
+			dmu_tx_abort(tx);
+			zfs_dirent_unlock(dl);
+			if (error == ERESTART &&
+			    zfsvfs->z_assign == TXG_NOWAIT) {
+				txg_wait_open(dmu_objset_pool(os), 0);
+				goto top;	/* redo from the lookup */
+			}
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+		ASSERT(zp->z_id == zoid);
+		(void) zfs_link_create(dl, zp, tx, ZNEW);
+		seq = zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
+		dmu_tx_commit(tx);
+	} else {
+		/*
+		 * A directory entry already exists for this name.
+		 */
+		/*
+		 * Can't truncate an existing file if in exclusive mode.
+		 */
+		if (excl == EXCL) {
+			error = EEXIST;
+			goto out;
+		}
+		/*
+		 * Can't open a directory for writing.
+		 */
+		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
+			error = EISDIR;
+			goto out;
+		}
+		/*
+		 * Verify requested access to file.
+		 */
+		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
+			goto out;
+		}
+		/*
+		 * Need to update dzp->z_seq?  (Question left open by the
+		 * original code; it is bumped whenever we get this far.)
+		 */
+		mutex_enter(&dzp->z_lock);
+		dzp->z_seq++;
+		mutex_exit(&dzp->z_lock);
+
+		/*
+		 * Truncate regular files if requested: only a non-empty
+		 * VREG with AT_SIZE set and va_size == 0 qualifies.
+		 */
+		if ((ZTOV(zp)->v_type == VREG) && (zp->z_phys->zp_size != 0) &&
+		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
+			/*
+			 * Truncate the file.
+			 */
+			tx = dmu_tx_create(os);
+			dmu_tx_hold_bonus(tx, zoid);
+			dmu_tx_hold_free(tx, zoid, 0, DMU_OBJECT_END);
+			error = dmu_tx_assign(tx, zfsvfs->z_assign);
+			if (error) {
+				dmu_tx_abort(tx);
+				if (dl)
+					zfs_dirent_unlock(dl);
+				VN_RELE(ZTOV(zp));
+				if (error == ERESTART &&
+				    zfsvfs->z_assign == TXG_NOWAIT) {
+					txg_wait_open(dmu_objset_pool(os), 0);
+					goto top;
+				}
+				ZFS_EXIT(zfsvfs);
+				return (error);
+			}
+			/*
+			 * Grab the grow_lock to serialize this change with
+			 * respect to other file manipulations.
+			 */
+			rw_enter(&zp->z_grow_lock, RW_WRITER);
+			error = zfs_freesp(zp, 0, 0, mode, tx, cr);
+			if (error == 0) {
+				zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+				seq = zfs_log_truncate(zilog, tx,
+				    TX_TRUNCATE, zp, 0, 0);
+			}
+			rw_exit(&zp->z_grow_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+out:
+
+	if (dl)				/* NULL when name was "" */
+		zfs_dirent_unlock(dl);
+
+	if (error) {
+		if (zp)
+			VN_RELE(ZTOV(zp));
+	} else {
+		*vpp = ZTOV(zp);
+		/*
+		 * If vnode is for a device return a specfs vnode instead.
+		 */
+		if (IS_DEVVP(*vpp)) {
+			struct vnode *svp;
+
+			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+			VN_RELE(*vpp);
+			if (svp == NULL) {
+				error = ENOSYS;
+			}
+			*vpp = svp;	/* NULL if specvp() failed */
+		}
+	}
+
+	zil_commit(zilog, seq, 0);	/* reached on error paths too */
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ *	IN:	dvp	- vnode of directory to remove entry from.
+ *		name	- name of entry to remove.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime
+ *	 vp - ctime (if nlink > 0)
+ */
+static int
+zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
+{
+	znode_t		*zp, *dzp = VTOZ(dvp);
+	znode_t		*xzp = NULL;
+	vnode_t		*vp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	uint64_t	acl_obj, xattr_obj;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	int		may_delete_now, delete_now = FALSE;
+	int		reaped;	/* filled in by zfs_link_destroy() */
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	/*
+	 * Attempt to lock directory; fail if entry doesn't exist.
+	 */
+	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	vp = ZTOV(zp);
+
+	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
+		goto out;
+	}
+
+	/*
+	 * Check the restrictions that apply on sticky directories.
+	 */
+	if (error = zfs_sticky_remove_access(dzp, zp, cr))
+		goto out;
+
+	/*
+	 * Need to use rmdir for removing directories.
+	 */
+	if (vp->v_type == VDIR) {
+		error = EPERM;
+		goto out;
+	}
+
+	vnevent_remove(vp);
+
+	mutex_enter(&vp->v_lock);
+	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
+	mutex_exit(&vp->v_lock);
+
+	/*
+	 * We may delete the znode now, or we may put it on the delete queue;
+	 * it depends on whether we're the last link, and on whether there are
+	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
+	 * allow for either case.
+	 */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, -1);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	if (may_delete_now)
+		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+
+	/* are there any extended attributes? */
+	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
+		/*
+		 * XXX - There is a possibility that the delete
+		 * of the parent file could succeed, but then we get
+		 * an ENOSPC when we try to delete the xattrs...
+		 * so we would need to re-try the deletes periodically
+		 */
+		/* XXX - do we need this if we are deleting? */
+		dmu_tx_hold_bonus(tx, xattr_obj);
+	}
+
+	/* are there any additional acls */
+	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
+	    may_delete_now)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	/* charge as an update -- would be nice not to charge at all */
+	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		VN_RELE(vp);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Remove the directory entry (reaped is an out-parameter).
+	 */
+	error = zfs_link_destroy(dl, zp, tx, 0, &reaped);
+
+	if (error) {
+		dmu_tx_commit(tx);	/* tx is assigned: commit, not abort */
+		goto out;
+	}
+
+	if (reaped) {		/* re-check now that the link is gone */
+		mutex_enter(&vp->v_lock);
+		delete_now = may_delete_now &&
+		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
+		    zp->z_phys->zp_xattr == xattr_obj &&
+		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
+		mutex_exit(&vp->v_lock);
+	}
+
+	if (delete_now) {
+		if (zp->z_phys->zp_xattr) {
+			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
+			ASSERT3U(error, ==, 0);
+			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
+			dmu_buf_will_dirty(xzp->z_dbuf, tx);
+			mutex_enter(&xzp->z_lock);
+			xzp->z_reap = 1;
+			xzp->z_phys->zp_links = 0;
+			mutex_exit(&xzp->z_lock);
+			zfs_dq_add(xzp, tx);
+			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
+		}
+		mutex_enter(&zp->z_lock);
+		mutex_enter(&vp->v_lock);
+		vp->v_count--;	/* drop our hold without VN_RELE() */
+		ASSERT3U(vp->v_count, ==, 0);
+		mutex_exit(&vp->v_lock);
+		zp->z_active = 0;
+		mutex_exit(&zp->z_lock);
+		zfs_znode_delete(zp, tx);
+		VFS_RELE(zfsvfs->z_vfs);
+	} else if (reaped) {
+		zfs_dq_add(zp, tx);
+	}
+
+	seq = zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
+
+	dmu_tx_commit(tx);
+out:
+	zfs_dirent_unlock(dl);
+
+	if (!delete_now) {	/* else hold already dropped above */
+		VN_RELE(vp);
+	} else if (xzp) {
+		/* this rele delayed to prevent nesting transactions */
+		VN_RELE(ZTOV(xzp));
+	}
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided.  Return a pointer to the inserted directory.
+ *
+ *	IN:	dvp	- vnode of directory to add subdir to.
+ *		dirname	- name of new directory.
+ *		vap	- attributes of new directory.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vpp	- vnode of created directory.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ *	 vp - ctime|mtime|atime updated
+ */
+static int
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*newzp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	uint64_t	seq = 0;
+	uint64_t	newid = 0;
+	int		error;
+
+	ASSERT(vap->va_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+
+	/* Directories may not be created inside an xattr directory. */
+	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+top:
+	*vpp = NULL;
+
+	/*
+	 * The caller needs permission to add a subdirectory, and the
+	 * name must not already exist (ZNEW) in the parent.
+	 */
+	error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr);
+	if (error == 0)
+		error = zfs_dirent_lock(&dl, dzp, dirname, &newzp, ZNEW);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Set up the transaction: holds on the parent's ZAP and on the
+	 * new object's ZAP, plus a write hold on the new object when the
+	 * parent has ZFS_INHERIT_ACE set.
+	 */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error != 0) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		if (error != ERESTART || zfsvfs->z_assign != TXG_NOWAIT) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		/* Wait for txg_wait_open() and redo the whole operation. */
+		txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+		goto top;
+	}
+
+	/*
+	 * Create the new node, link it into the parent under the locked
+	 * name, and log the operation in the same transaction.
+	 */
+	zfs_mknode(dzp, vap, &newid, tx, cr, 0, &newzp, 0);
+	(void) zfs_link_create(dl, newzp, tx, ZNEW);
+	seq = zfs_log_create(zilog, tx, TX_MKDIR, dzp, newzp, dirname);
+	dmu_tx_commit(tx);
+
+	*vpp = ZTOV(newzp);
+	zfs_dirent_unlock(dl);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Remove a directory subdir entry.  If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ *	IN:	dvp	- vnode of directory to remove from.
+ *		name	- name of directory to be removed.
+ *		cwd	- vnode of current working directory.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+static int
+zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(dvp);
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	znode_t		*zp;
+	vnode_t		*vp;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	uint64_t	seq = 0;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	zp = NULL;
+
+	/*
+	 * Lock the directory entry; ZEXISTS makes this fail when the
+	 * name is not present.
+	 */
+	error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	vp = ZTOV(zp);
+
+	/*
+	 * Run the checks in order, stopping at the first failure:
+	 * delete permission, sticky-directory restrictions, the target
+	 * must be a directory, and it must not be the caller's cwd.
+	 */
+	error = zfs_zaccess_delete(dzp, zp, cr);
+	if (error == 0)
+		error = zfs_sticky_remove_access(dzp, zp, cr);
+	if (error == 0 && vp->v_type != VDIR)
+		error = ENOTDIR;
+	if (error == 0 && vp == cwd)
+		error = EINVAL;
+	if (error != 0)
+		goto out;
+
+	vnevent_rmdir(vp);
+
+	/*
+	 * Grab a lock on the parent pointer to make sure we play well
+	 * with the treewalk and directory rename code.
+	 */
+	rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error != 0) {
+		/*
+		 * Undo everything acquired since "top" so the retry
+		 * (or the error return) starts from a clean slate.
+		 */
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_parent_lock);
+		zfs_dirent_unlock(dl);
+		VN_RELE(vp);
+		if (error != ERESTART || zfsvfs->z_assign != TXG_NOWAIT) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+		goto top;
+	}
+
+	/*
+	 * Unlink the entry; the removal is logged only if that worked.
+	 * The transaction is committed either way.
+	 */
+	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
+	if (error == 0)
+		seq = zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
+	dmu_tx_commit(tx);
+
+	rw_exit(&zp->z_parent_lock);
+out:
+	zfs_dirent_unlock(dl);
+	VN_RELE(vp);
+	zil_commit(zilog, seq, 0);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ *	IN:	vp	- vnode of directory to read.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *		eofp	- set to true if end-of-file detected.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp)
+{
+	znode_t		*zp = VTOZ(vp);
+	iovec_t		*iovp;
+	dirent64_t	*odp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	caddr_t		outbuf;	/* staging buffer (uiomove path only) */
+	size_t		bufsize;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	uint_t		bytes_wanted;
+	ushort_t	this_reclen;
+	uint64_t	offset; /* must be unsigned; checks for < 1 */
+	off64_t		*next;	/* d_off of the entry just emitted */
+	int		local_eof;
+	int		outcount = 0;
+	int		error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * If we are not given an eof variable,
+	 * use a local one.
+	 */
+	if (eofp == NULL)
+		eofp = &local_eof;
+
+	/*
+	 * Check for valid iov_len.
+	 */
+	if (uio->uio_iov->iov_len <= 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * Quit if directory has been removed (posix)
+	 */
+	if ((*eofp = zp->z_reap) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Initialize the iterator cursor.
+	 */
+	offset = uio->uio_loffset;
+	if (offset <= 3) {
+		/*
+		 * Offsets 0-3 are not ZAP cookies: start from the beginning.
+		 */
+		zap_cursor_init(&zc, zfsvfs->z_os, zp->z_id);
+	} else {
+		/*
+		 * The offset is a serialized cursor.
+		 */
+		zap_cursor_init_serialized(&zc, zfsvfs->z_os, zp->z_id,
+		    offset);
+	}
+
+	/*
+	 * Choose the buffer in which fs-independent entries are built.
+	 */
+	iovp = uio->uio_iov;
+	bytes_wanted = iovp->iov_len;
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+		bufsize = bytes_wanted;
+		outbuf = kmem_alloc(bufsize, KM_SLEEP);
+		odp = (struct dirent64 *)outbuf;
+	} else {
+		bufsize = bytes_wanted;
+		odp = (struct dirent64 *)iovp->iov_base;
+	}
+
+	/*
+	 * Transform to file-system independent format
+	 */
+	outcount = 0;
+	while (outcount < bytes_wanted) {
+		/*
+		 * Special case `.', `..', and `.zfs'.
+		 */
+		if (offset == 0) {
+			(void) strcpy(zap.za_name, ".");
+			zap.za_first_integer = zp->z_id;
+			this_reclen = DIRENT64_RECLEN(1);
+		} else if (offset == 1) {
+			(void) strcpy(zap.za_name, "..");
+			zap.za_first_integer = zp->z_phys->zp_parent;
+			this_reclen = DIRENT64_RECLEN(2);
+		} else if (offset == 2 && zfs_show_ctldir(zp)) {
+			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+			zap.za_first_integer = ZFSCTL_INO_ROOT;
+			this_reclen =
+			    DIRENT64_RECLEN(sizeof (ZFS_CTLDIR_NAME) - 1);
+		} else {
+			/*
+			 * Grab next entry.
+			 */
+			if (error = zap_cursor_retrieve(&zc, &zap)) {
+				if ((*eofp = (error == ENOENT)) != 0)
+					break;
+				else
+					goto update;
+			}
+
+			if (zap.za_integer_length != 8 ||
+			    zap.za_num_integers != 1) {
+				cmn_err(CE_WARN, "zap_readdir: bad directory "
+				    "entry, obj = %lld, offset = %lld\n",
+				    (u_longlong_t)zp->z_id,
+				    (u_longlong_t)offset);
+				error = ENXIO;
+				goto update;
+			}
+			this_reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+		}
+
+		/*
+		 * Will this entry fit in the buffer?
+		 */
+		if (outcount + this_reclen > bufsize) {
+			/*
+			 * Did we manage to fit anything in the buffer?
+			 */
+			if (!outcount) {
+				error = EINVAL;
+				goto update;
+			}
+			break;
+		}
+		/*
+		 * Add this entry:
+		 */
+		odp->d_ino = (ino64_t)zap.za_first_integer;
+		odp->d_reclen = (ushort_t)this_reclen;
+		/* NOTE: d_off is the offset for the *next* entry */
+		next = &(odp->d_off);
+		(void) strncpy(odp->d_name, zap.za_name,
+		    DIRENT64_NAMELEN(this_reclen));
+		outcount += this_reclen;
+		odp = (dirent64_t *)((intptr_t)odp + this_reclen);
+
+		ASSERT(outcount <= bufsize);
+
+		/* Prefetch znode */
+		dmu_prefetch(zfsvfs->z_os, zap.za_first_integer, 0, 0);
+
+		/*
+		 * Move to the next entry, fill in the previous offset.
+		 */
+		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+			zap_cursor_advance(&zc);
+			offset = zap_cursor_serialize(&zc);
+		} else {
+			offset += 1;
+		}
+		*next = offset;
+	}
+
+	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+		iovp->iov_base += outcount;
+		iovp->iov_len -= outcount;
+		uio->uio_resid -= outcount;
+	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+		/*
+		 * Copy-out failed: hand back uio_loffset as it stands.
+		 */
+		offset = uio->uio_loffset;
+	}
+
+update:
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+		kmem_free(outbuf, bufsize);
+
+	if (error == ENOENT)	/* end of directory, not a failure */
+		error = 0;
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+	uio->uio_loffset = offset;
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
+{
+	zfsvfs_t	*zfsvfs = VTOZ(vp)->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	/* Commit the intent log through this znode's z_last_itx (FSYNC). */
+	zil_commit(zfsvfs->z_log, VTOZ(vp)->z_last_itx, FSYNC);
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Report a file's attributes through the supplied vattr structure.
+ *
+ *	IN:	vp	- vnode of file.
+ *		vap	- va_mask identifies requested attributes.
+ *		flags	- [UNUSED]
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vap	- attribute values.
+ *
+ *	RETURN:	0 if success
+ *		error code if the ACE_READ_ATTRIBUTES check fails for
+ *		a caller who is not the file's owner
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	znode_phys_t	*pzp = zp->z_phys;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Every attribute is filled in regardless of va_mask; that is
+	 * cheaper than working out which ones were actually requested.
+	 */
+	mutex_enter(&zp->z_lock);
+
+	vap->va_type = vp->v_type;
+	vap->va_mode = pzp->zp_mode & MODEMASK;
+	vap->va_nodeid = zp->z_id;
+	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
+	vap->va_uid = zp->z_phys->zp_uid;
+	vap->va_gid = zp->z_phys->zp_gid;
+	vap->va_rdev = pzp->zp_rdev;
+	vap->va_size = pzp->zp_size;
+	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
+	vap->va_seq = zp->z_seq;
+
+	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
+	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
+	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
+
+	/*
+	 * A failed ACE_READ_ATTRIBUTES check is fatal only for callers
+	 * other than the owner, who may always read the attributes.
+	 */
+	error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr);
+	if (error != 0 && zp->z_phys->zp_uid != crgetuid(cr)) {
+		mutex_exit(&zp->z_lock);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	mutex_exit(&zp->z_lock);
+
+	dmu_object_size_from_db(zp->z_dbuf, &vap->va_blksize, &vap->va_nblocks);
+
+	/*
+	 * With no block size set yet, suggest maximal I/O transfers.
+	 */
+	if (zp->z_blksz == 0)
+		vap->va_blksize = zfsvfs->z_max_blksz;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ *	IN:	vp	- vnode of file to be modified.
+ *		vap	- new attribute values.
+ *		flags	- ATTR_UTIME set if non-default time values provided.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+static int
+zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+	caller_context_t *ct)
+{
+	struct znode	*zp = VTOZ(vp);
+	znode_phys_t	*pzp = zp->z_phys;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	dmu_tx_t	*tx;
+	uint_t		mask = vap->va_mask;
+	uint_t		mask_applied = 0;
+	vattr_t		oldva;
+	uint64_t	new_mode;
+	int		have_grow_lock;
+	int		need_policy = FALSE;
+	int		err;
+
+	if (mask == 0)
+		return (0);
+
+	if (mask & AT_NOSET)
+		return (EINVAL);
+
+	if (mask & AT_SIZE && vp->v_type == VDIR)
+		return (EISDIR);
+
+	ZFS_ENTER(zfsvfs);
+
+top:
+	have_grow_lock = FALSE;
+
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+		ZFS_EXIT(zfsvfs);
+		return (EROFS);
+	}
+
+	/*
+	 * First validate permissions
+	 */
+
+	if (mask & AT_SIZE) {
+		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
+		if (err) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+	}
+
+	if (mask & (AT_ATIME|AT_MTIME))
+		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
+
+	if (mask & (AT_UID|AT_GID)) {
+		int	idmask = (mask & (AT_UID|AT_GID));
+		int	take_owner;
+		int	take_group;
+
+		/*
+		 * Take ownership or chgrp to group we are a member of
+		 */
+
+		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
+
+		/*
+		 * If both AT_UID and AT_GID are set then take_owner and
+		 * take_group must both be set in order to allow taking
+		 * ownership.
+		 *
+		 * Otherwise, send the check through secpolicy_vnode_setattr()
+		 *
+		 */
+
+		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+		    ((idmask == AT_UID) && take_owner) ||
+		    ((idmask == AT_GID) && take_group)) {
+			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
+				/*
+				 * Remove setuid/setgid for non-privileged users
+				 */
+				if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+				    secpolicy_vnode_setid_retain(cr,
+				    (vap->va_mode & S_ISUID) != 0 &&
+				    (mask & AT_UID) != 0 &&
+				    vap->va_uid == 0) != 0) {
+					vap->va_mode = pzp->zp_mode;
+					vap->va_mask |= AT_MODE;
+					vap->va_mode &= ~(S_ISUID|S_ISGID);
+				}
+			} else {
+				need_policy =  TRUE;
+			}
+		} else {
+			need_policy =  TRUE;
+		}
+	}
+
+	if (mask & AT_MODE)
+		need_policy = TRUE;
+
+	if (need_policy) {
+		mutex_enter(&zp->z_lock);
+		oldva.va_mode = pzp->zp_mode;
+		oldva.va_uid = zp->z_phys->zp_uid;
+		oldva.va_gid = zp->z_phys->zp_gid;
+		mutex_exit(&zp->z_lock);
+		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
+		if (err) {
+			ZFS_EXIT(zfsvfs);
+			return (err);
+		}
+	}
+
+	/*
+	 * secpolicy_vnode_setattr, or take ownership may have
+	 * changed va_mask
+	 */
+	mask = vap->va_mask;
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+
+	if (mask & AT_MODE) {
+
+		new_mode = (pzp->zp_mode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+		if (zp->z_phys->zp_acl.z_acl_extern_obj)
+			dmu_tx_hold_write(tx,
+			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
+		else
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
+	}
+
+	if (mask & AT_SIZE) {
+		uint64_t off = vap->va_size;
+		/*
+		 * Grab the grow_lock to serialize this change with
+		 * respect to other file manipulations.
+		 */
+		rw_enter(&zp->z_grow_lock, RW_WRITER);
+		have_grow_lock = TRUE;
+		if (off < zp->z_phys->zp_size)
+			dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END);
+		else if (zp->z_phys->zp_size &&
+		    zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
+			/* we will rewrite this block if we grow */
+			dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size);
+	}
+
+	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (err) {
+		dmu_tx_abort(tx);
+		if (have_grow_lock)
+			rw_exit(&zp->z_grow_lock);
+		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (err);
+	}
+
+	dmu_buf_will_dirty(zp->z_dbuf, tx);
+
+	/*
+	 * Set each attribute requested.
+	 * We group settings according to the locks they need to acquire.
+	 *
+	 * Note: you cannot set ctime directly, although it will be
+	 * updated as a side-effect of calling this function.
+	 */
+	if (mask & AT_SIZE) {
+		/*
+		 * XXX - Note, we are not providing any open
+		 * mode flags here (like FNDELAY), so we may
+		 * block if there are locks present... this
+		 * should be addressed in openat().
+		 */
+		err = zfs_freesp(zp, vap->va_size, 0, 0, tx, cr);
+		if (err) {
+			mutex_enter(&zp->z_lock);
+			goto out;
+		}
+		mask_applied |= AT_SIZE;
+	}
+
+	mask_applied = mask;	/* no errors after this point */
+
+	mutex_enter(&zp->z_lock);
+
+	if (mask & AT_MODE) {
+		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
+		ASSERT3U(err, ==, 0);
+	}
+
+	if ((mask & AT_UID) && vap->va_uid != oldva.va_uid)
+		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
+
+	if ((mask & AT_GID) && vap->va_gid != oldva.va_gid)
+		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
+
+	if (mask & AT_ATIME)
+		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+
+	if (mask & AT_MTIME)
+		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+
+	if (mask_applied & AT_SIZE)
+		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
+	else if (mask_applied != 0)
+		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+
+out:
+	if (mask_applied != 0)
+		seq = zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap,
+		    mask_applied);
+
+	mutex_exit(&zp->z_lock);
+
+	if (have_grow_lock)
+		rw_exit(&zp->z_grow_lock);
+
+	dmu_tx_commit(tx);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+typedef struct zfs_zlock {
+	krwlock_t	*zl_rwlock;	/* z_parent_lock we acquired */
+	znode_t		*zl_znode;	/* znode held by zfs_zget, or NULL */
+	struct zfs_zlock *zl_next;	/* next (older) entry in list */
+} zfs_zlock_t;
+
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+	zfs_zlock_t	*zl;
+	znode_t 	*zp = tdzp;
+	uint64_t	rootid = zp->z_zfsvfs->z_root;
+	uint64_t	*oidp = &zp->z_id;
+	krwlock_t	*rwlp = &szp->z_parent_lock;
+	krw_t		rw = RW_WRITER;
+	/*
+	 * First pass write-locks szp and compares to zp->z_id.
+	 * Later passes read-lock zp and compare to zp->z_phys->zp_parent.
+	 * Each lock taken is pushed on *zlpp, also when we return an error.
+	 */
+	do {
+		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+		zl->zl_rwlock = rwlp;
+		zl->zl_znode = NULL;
+		zl->zl_next = *zlpp;
+		*zlpp = zl;
+
+		rw_enter(rwlp, rw);
+
+		if (*oidp == szp->z_id)		/* We're a descendant of szp */
+			return (EINVAL);
+
+		if (*oidp == rootid)		/* We've hit the top */
+			return (0);
+
+		if (rw == RW_READER) {		/* i.e. not the first pass */
+			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
+			if (error)
+				return (error);
+			zl->zl_znode = zp;
+		}
+		oidp = &zp->z_phys->zp_parent;
+		rwlp = &zp->z_parent_lock;
+		rw = RW_READER;
+		/* the walk ends once it reaches the source directory */
+	} while (zp->z_id != sdzp->z_id);
+
+	return (0);
+}
+
+/*
+ * Undo zfs_rename_lock(): release each held znode, exit and free each lock.
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+	zfs_zlock_t *cur;
+
+	for (cur = *zlpp; cur != NULL; cur = *zlpp) {
+		if (cur->zl_znode != NULL)
+			VN_RELE(ZTOV(cur->zl_znode));
+		rw_exit(cur->zl_rwlock);
+		*zlpp = cur->zl_next;
+		kmem_free(cur, sizeof (*cur));
+	}
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory.  Change the entry name as indicated.
+ *
+ *	IN:	sdvp	- Source directory containing the "old entry".
+ *		snm	- Old entry name.
+ *		tdvp	- Target directory to contain the "new entry".
+ *		tnm	- New entry name.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	sdvp,tdvp - ctime|mtime updated
+ */
+static int
+zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
+{
+	znode_t		*tdzp, *szp, *tzp;
+	znode_t		*sdzp = VTOZ(sdvp);
+	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	vnode_t		*realvp;
+	zfs_dirlock_t	*sdl, *tdl;
+	dmu_tx_t	*tx;
+	zfs_zlock_t	*zl;
+	int		cmp, serr, terr, error;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * Make sure we have the real vp for the target directory.
+	 */
+	if (VOP_REALVP(tdvp, &realvp) == 0)
+		tdvp = realvp;
+
+	if (tdvp->v_vfsp != sdvp->v_vfsp) {
+		ZFS_EXIT(zfsvfs);
+		return (EXDEV);
+	}
+
+	tdzp = VTOZ(tdvp);
+top:	/* restart point: the ERESTART path below jumps back here */
+	szp = NULL;
+	tzp = NULL;
+	zl = NULL;
+
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/outof an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
+	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * Lock source and target directory entries.  To prevent deadlock,
+	 * a lock ordering must be defined.  We lock the directory with
+	 * the smallest object id first, or if it's a tie, the one with
+	 * the lexically first name.
+	 */
+	if (sdzp->z_id < tdzp->z_id) {
+		cmp = -1;
+	} else if (sdzp->z_id > tdzp->z_id) {
+		cmp = 1;
+	} else {
+		cmp = strcmp(snm, tnm);
+		if (cmp == 0) {
+			/*
+			 * POSIX: "If the old argument and the new argument
+			 * both refer to links to the same existing file,
+			 * the rename() function shall return successfully
+			 * and perform no other action."
+			 */
+			ZFS_EXIT(zfsvfs);
+			return (0);
+		}
+	}
+	if (cmp < 0) {
+		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+	} else {
+		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
+		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
+	}
+
+	if (serr) {
+		/*
+		 * Source entry invalid or not there.
+		 */
+		if (!terr) {
+			zfs_dirent_unlock(tdl);
+			if (tzp)
+				VN_RELE(ZTOV(tzp));
+		}
+		if (strcmp(snm, "..") == 0)
+			serr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (serr);
+	}
+	if (terr) {
+		zfs_dirent_unlock(sdl);
+		VN_RELE(ZTOV(szp));
+		if (strcmp(tnm, "..") == 0)
+			terr = EINVAL;
+		ZFS_EXIT(zfsvfs);
+		return (terr);
+	}
+
+	/*
+	 * Must have write access at the source to remove the old entry
+	 * and write access at the target to create the new entry.
+	 * Note that if target and source are the same, this can be
+	 * done in a single check.
+	 */
+
+	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
+		goto out;
+
+	if (ZTOV(szp)->v_type == VDIR) {
+		/*
+		 * Check to make sure rename is valid.
+		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+		 */
+		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
+			goto out;
+	}
+
+	/*
+	 * Does target exist?
+	 */
+	if (tzp) {
+		/*
+		 * Source and target must be the same type.
+		 */
+		if (ZTOV(szp)->v_type == VDIR) {
+			if (ZTOV(tzp)->v_type != VDIR) {
+				error = ENOTDIR;
+				goto out;
+			}
+		} else {
+			if (ZTOV(tzp)->v_type == VDIR) {
+				error = EISDIR;
+				goto out;
+			}
+		}
+		/*
+		 * POSIX dictates that when the source and target
+		 * entries refer to the same file object, rename
+		 * must do nothing and exit without error.
+		 */
+		if (szp->z_id == tzp->z_id) {
+			error = 0;
+			goto out;
+		}
+	}
+
+	vnevent_rename_src(ZTOV(szp));
+	if (tzp)
+		vnevent_rename_dest(ZTOV(tzp));
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
+	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
+	if (sdzp != tdzp) {
+		dmu_tx_hold_zap(tx, sdzp->z_id, 1);
+		dmu_tx_hold_zap(tx, tdzp->z_id, 1);
+		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
+	} else {
+		dmu_tx_hold_zap(tx, sdzp->z_id, 2);
+	}
+	if (tzp) {
+		dmu_tx_hold_bonus(tx, tzp->z_id);	/* nlink changes */
+	}
+	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		if (zl != NULL)
+			zfs_rename_unlock(&zl);
+		zfs_dirent_unlock(sdl);
+		zfs_dirent_unlock(tdl);
+		VN_RELE(ZTOV(szp));
+		if (tzp)
+			VN_RELE(ZTOV(tzp));
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/* error == 0 here; the steps below stop at the first failure */
+	if (tzp)	/* Attempt to remove the existing target */
+		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
+
+	if (error == 0) {
+		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+		if (error == 0) {
+			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+			ASSERT(error == 0);
+			seq = zfs_log_rename(zilog, tx, TX_RENAME,
+			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+		}
+	}
+
+	dmu_tx_commit(tx);
+out:	/* also reached by the pre-transaction gotos above (no tx held) */
+	if (zl != NULL)
+		zfs_rename_unlock(&zl);
+
+	zfs_dirent_unlock(sdl);
+	zfs_dirent_unlock(tdl);
+
+	VN_RELE(ZTOV(szp));
+	if (tzp)
+		VN_RELE(ZTOV(tzp));
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ *	IN:	dvp	- Directory to contain new symbolic link.
+ *		name	- Name for new symlink entry.
+ *		vap	- Attributes of new entry.
+ *		link	- Target path of new symlink.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+static int
+zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr)
+{
+	znode_t		*zp, *dzp = VTOZ(dvp);
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	uint64_t	zoid;
+	int		len = strlen(link);
+	int		error;
+
+	ASSERT(vap->va_type == VLNK);
+
+	ZFS_ENTER(zfsvfs);
+top:
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (ENAMETOOLONG);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_bonus(tx, dzp->z_id);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	dmu_buf_will_dirty(dzp->z_dbuf, tx);
+
+	/*
+	 * Create a new object for the symlink.
+	 * Put the link content into bonus buffer if it will fit;
+	 * otherwise, store it just like any other file data.
+	 */
+	zoid = 0;
+	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
+		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
+		if (len != 0)
+			bcopy(link, zp->z_phys + 1, len);
+	} else {
+		dmu_buf_t *dbp;
+		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
+
+		rw_enter(&zp->z_grow_lock, RW_WRITER);
+		error = zfs_grow_blocksize(zp, len, tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error)
+			goto out;
+
+		dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
+		dmu_buf_will_dirty(dbp, tx);
+
+		ASSERT3U(len, <=, dbp->db_size);
+		bcopy(link, dbp->db_data, len);
+		dmu_buf_rele(dbp);
+	}
+	zp->z_phys->zp_size = len;
+
+	/*
+	 * Insert the new object into the directory.
+	 */
+	(void) zfs_link_create(dl, zp, tx, ZNEW);
+out:	/* error != 0 only via the zfs_grow_blocksize() failure above */
+	if (error == 0)
+		seq = zfs_log_symlink(zilog, tx, TX_SYMLINK,
+		    dzp, zp, name, link);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	VN_RELE(ZTOV(zp));
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ *	IN:	vp	- vnode of symbolic link.
+ *		uio	- structure to contain the link path.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	uio	- structure to contain the link path.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	size_t		bufsz;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	/* a link that fits sits in z_dbuf right after the znode_phys_t */
+	bufsz = (size_t)zp->z_phys->zp_size;
+	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
+		error = uiomove(zp->z_phys + 1,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+	} else {
+		dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
+		if ((error = dmu_buf_read_canfail(dbp)) != 0) {
+			dmu_buf_rele(dbp);
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		error = uiomove(dbp->db_data,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+		dmu_buf_rele(dbp);
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ *	IN:	tdvp	- Directory to contain new entry.
+ *		svp	- vnode of new entry.
+ *		name	- name of new entry.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	tdvp - ctime|mtime updated
+ *	 svp - ctime updated
+ */
+/* ARGSUSED */
+static int
+zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(tdvp);
+	znode_t		*tzp, *szp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	zfs_dirlock_t	*dl;
+	dmu_tx_t	*tx;
+	vnode_t		*realvp;
+	int		error;
+
+	ASSERT(tdvp->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+
+	if (VOP_REALVP(svp, &realvp) == 0)
+		svp = realvp;
+
+	if (svp->v_vfsp != tdvp->v_vfsp) {
+		ZFS_EXIT(zfsvfs);
+		return (EXDEV);
+	}
+
+	szp = VTOZ(svp);
+top:	/* retried from here when dmu_tx_assign() returns ERESTART */
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
+	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (svp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
+	    secpolicy_basic_link(cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EPERM);
+	}
+
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/* the tx holds szp's bonus buffer and the zap object of dzp */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_bonus(tx, szp->z_id);
+	dmu_tx_hold_zap(tx, dzp->z_id, 1);
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0)
+		seq = zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Page push routine used once the file system has been force unmounted:
+ * nothing is written, the page is simply dropped.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+    int flags, cred_t *cr)
+{
+	pvn_write_done(pp, B_ERROR | B_FORCE | B_INVAL);
+	return (0);
+}
+
+/*
+ * Write one dirty page of vp to the DMU in its own transaction.  The page
+ * is always released via pvn_write_done() (with B_ERROR if the tx cannot
+ * be assigned) and *offp / *lenp are always set.  Returns 0 or an errno.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
+		size_t *lenp, int flags, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	dmu_tx_t	*tx;
+	u_offset_t	off;
+	ssize_t		len;
+	caddr_t		va;
+	int		err;
+
+top:
+	rw_enter(&zp->z_grow_lock, RW_READER);
+	off = pp->p_offset;
+	len = MIN(PAGESIZE, zp->z_phys->zp_size - off);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		goto out;
+	}
+
+	va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+	dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+	ppmapout(va);
+	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+	seq = zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
+	dmu_tx_commit(tx);
+	rw_exit(&zp->z_grow_lock);
+out:
+	/* Release the page on every path; a failed push is marked B_ERROR. */
+	pvn_write_done(pp, (err ? B_ERROR : 0) | B_WRITE | flags);
+	if (offp)
+		*offp = off;
+	if (lenp)
+		*lenp = len;
+	if (err == 0)
+		zil_commit(zilog, seq, 0);
+	return (err);
+}
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the files vnode.
+ *
+ *	IN:	vp	- vnode of file to push page data to.
+ *		off	- position in file to put data.
+ *		len	- amount of data to write.
+ *		flags	- flags to control the operation.
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated
+ */
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	page_t		*pp;
+	size_t		io_len;
+	u_offset_t	io_off;
+	int		error = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	if (len == 0) {
+		/*
+		 * Search the entire vp list for pages >= off.
+		 */
+		error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
+		    flags, cr);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (off > zp->z_phys->zp_size) {
+		/* past end of file */
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	len = MIN(len, zp->z_phys->zp_size - off);
+
+	io_off = off;
+	while (io_off < off + len) {
+		/* one-page default keeps io_len defined if the push fails */
+		io_len = PAGESIZE;
+		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+			pp  = page_lookup(vp, io_off,
+				(flags & (B_INVAL | B_FREE)) ?
+					SE_EXCL : SE_SHARED);
+		} else {
+			pp = page_lookup_nowait(vp, io_off,
+				(flags & B_FREE) ? SE_EXCL : SE_SHARED);
+		}
+
+		if (pp != NULL && pvn_getdirty(pp, flags)) {
+			int err;
+
+			/*
+			 * Found a dirty page to push
+			 */
+			if (err =
+			    zfs_putapage(vp, pp, &io_off, &io_len, flags, cr))
+				error = err;
+		}
+		io_off += io_len;
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+void
+zfs_inactive(vnode_t *vp, cred_t *cr)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_um_lock, RW_READER);
+	if (zfsvfs->z_unmounted2) {
+		ASSERT(zp->z_dbuf_held == 0);
+		/* forced unmount: discard cached pages without writing them */
+		if (vn_has_cached_data(vp)) {
+			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
+			    B_INVAL, cr);
+		}
+
+		vp->v_count = 0; /* count arrives as 1 */
+		zfs_znode_free(zp);
+		rw_exit(&zfsvfs->z_um_lock);
+		VFS_RELE(zfsvfs->z_vfs);
+		return;
+	}
+
+	/*
+	 * Attempt to push any data in the page cache.  If this fails
+	 * we will get kicked out later in zfs_zinactive().
+	 */
+	if (vn_has_cached_data(vp))
+		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL, cr);
+	/* push out a deferred atime update; skipped when z_reap is set */
+	if (zp->z_atime_dirty && zp->z_reap == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_bonus(tx, zp->z_id);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {	/* best effort: the atime update is dropped */
+			dmu_tx_abort(tx);
+		} else {
+			dmu_buf_will_dirty(zp->z_dbuf, tx);
+			mutex_enter(&zp->z_lock);
+			zp->z_atime_dirty = 0;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+
+	zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_um_lock);
+}
+
+/*
+ * Validate a seek target.  Directories accept any offset; for every other
+ * vnode type the new offset (*noffp) must lie within [0, MAXOFFSET_T].
+ * The old offset (ooff) is not consulted.
+ *
+ *	RETURN:	0 if the offset is acceptable, EINVAL otherwise
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
+{
+	if (vp->v_type != VDIR) {
+		offset_t noff = *noffp;
+		if (noff < 0 || noff > MAXOFFSET_T)
+			return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * Pre-filter fs_frlock() to trap attempts to place a mandatory lock on a
+ * memory mapped file.  Returns EAGAIN if mapped, else fs_frlock()'s result.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+    flk_callback_t *flk_cbp, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint_t cnt = 1;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	/*
+	 * If file is being mapped, disallow frlock.  We set the mapcnt to -1
+	 * to signal that a lock is being set; this guards against zfs_map().
+	 * XXX - well, sort of; since zfs_map() does not change z_mapcnt, we
+	 * could be in the middle of zfs_map() and still call fs_frlock().
+	 * Also, zfs_addmap() (where z_mapcnt *is* manipulated) does no checks.
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	    (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EAGAIN);
+	}
+	error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr);
+	if (cnt == 0) {		/* clear our -1 marker, even when !DEBUG */
+		cnt = atomic_cas_32(&zp->z_mapcnt, -1, 0);
+		ASSERT((int)cnt == -1);
+	}
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data.  For efficiency, we may try to fill
+ * multiple pages at once (klustering).
+ * Returns 0 (with *pl == NULL if another thread won) or a read error.
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+	znode_t *zp = VTOZ(vp);
+	page_t *pp, *cur_pp;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	caddr_t va;
+	u_offset_t io_off, total;
+	uint64_t oid = zp->z_id;
+	size_t io_len;
+	int err;
+
+	/*
+	 * If we are only asking for a single page don't bother klustering.
+	 */
+	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE ||
+	    off > zp->z_phys->zp_size) {
+		io_off = off;
+		io_len = PAGESIZE;
+		pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+	} else {
+		/*
+		 * Try to fill a kluster of pages (a block's worth).
+		 */
+		size_t klen;
+		u_offset_t koff;
+
+		if (!ISP2(zp->z_blksz)) {
+			/* Only one block in the file. */
+			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+			koff = 0;
+		} else {
+			klen = plsz;
+			koff = P2ALIGN(off, (u_offset_t)klen);
+		}
+		if (klen > zp->z_phys->zp_size)
+			klen = P2ROUNDUP(zp->z_phys->zp_size,
+			    (uint64_t)PAGESIZE);
+		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+			    &io_len, koff, klen, 0);
+	}
+	if (pp == NULL) {
+		/*
+		 * Some other thread entered the page before us.
+		 * Return to zfs_getpage to retry the lookup.
+		 */
+		*pl = NULL;
+		return (0);
+	}
+
+	/*
+	 * Fill the pages in the kluster.
+	 */
+	cur_pp = pp;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		ASSERT(io_off == cur_pp->p_offset);
+		va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
+		err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va);
+		ppmapout(va);
+		if (err) {
+			/* On error, toss the entire kluster */
+			pvn_read_done(pp, B_ERROR);
+			return (err);
+		}
+		cur_pp = cur_pp->p_next;
+	}
+	/*
+	 * Fill in the page list array from the kluster.  If
+	 * there are too many pages in the kluster, return
+	 * as many pages as possible starting from the desired
+	 * offset `off'.
+	 * NOTE: the page list will always be null terminated.
+	 */
+	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+
+	return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array.  If plsz is greater than len, this function may
+ * also return page pointers from before or after the specified
+ * region (i.e. some region [off', off' + plsz]).  These additional
+ * pages are only returned if they are already in the cache, or were
+ * created as part of a klustered read.
+ *
+ *	IN:	vp	- vnode of file to get data from.
+ *		off	- position in file to get data from.
+ *		len	- amount of data to retrieve.
+ *		plsz	- length of provided page list.
+ *		seg	- segment to obtain pages for.
+ *		addr	- virtual address of fault.
+ *		rw	- mode of created pages.
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	protp	- protection mode of created pages.
+ *		pl	- list of pages created.
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+	enum seg_rw rw, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	page_t		*pp, **pl0 = pl;
+	int		need_unlock = 0, err = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (protp)
+		*protp = PROT_ALL;
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	/* no faultahead (for now) */
+	if (pl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* can't fault past EOF */
+	if (off >= zp->z_phys->zp_size) {
+		ZFS_EXIT(zfsvfs);
+		return (EFAULT);
+	}
+
+	/*
+	 * Make sure nobody restructures the file (changes block size)
+	 * in the middle of the getpage.
+	 */
+	rw_enter(&zp->z_grow_lock, RW_READER);
+
+	/*
+	 * If we already own the lock, then we must be page faulting
+	 * in the middle of a write to this file (i.e., we are writing
+	 * to this file using data from a mapped region of the file).
+	 */
+	if (!rw_owner(&zp->z_map_lock)) {
+		rw_enter(&zp->z_map_lock, RW_WRITER);
+		need_unlock = TRUE;
+	}
+
+	/*
+	 * Loop through the requested range [off, off + len] looking
+	 * for pages.  If we don't find a page, we will need to create
+	 * a new page and fill it with data from the file.
+	 */
+	while (len > 0) {
+		if (plsz < PAGESIZE)
+			break;
+		if (pp = page_lookup(vp, off, SE_SHARED)) {
+			*pl++ = pp;
+			off += PAGESIZE;
+			addr += PAGESIZE;
+			len -= PAGESIZE;
+			plsz -= PAGESIZE;
+		} else {
+			err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw);
+			/* on failure *pl was never filled in: don't read it */
+			if (err)
+				goto out;
+			/*
+			 * klustering may have changed our region
+			 * to be block aligned.
+			 */
+			if (((pp = *pl) != 0) && (off != pp->p_offset)) {
+				int delta = off - pp->p_offset;
+				len += delta;
+				off -= delta;
+				addr -= delta;
+			}
+			while (*pl) {
+				pl++;
+				off += PAGESIZE;
+				addr += PAGESIZE;
+				plsz -= PAGESIZE;
+				if (len > PAGESIZE)
+					len -= PAGESIZE;
+				else
+					len = 0;
+			}
+		}
+	}
+
+	/*
+	 * Fill out the page array with any pages already in the cache.
+	 */
+	while (plsz > 0) {
+		pp = page_lookup_nowait(vp, off, SE_SHARED);
+		if (pp == NULL)
+			break;
+		*pl++ = pp;
+		off += PAGESIZE;
+		plsz -= PAGESIZE;
+	}
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+out:
+	if (err) {
+		/*
+		 * Release any pages we have locked.
+		 */
+		while (pl > pl0)
+			page_unlock(*--pl);
+	}
+	*pl = NULL;
+
+	if (need_unlock)
+		rw_exit(&zp->z_map_lock);
+	rw_exit(&zp->z_grow_lock);
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	segvn_crargs_t	vn_a;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (vp->v_flag & VNOMAP) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOSYS);
+	}
+
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (ENXIO);
+	}
+
+	if (vp->v_type != VREG) {
+		ZFS_EXIT(zfsvfs);
+		return (ENODEV);
+	}
+
+	/*
+	 * If file is locked, disallow mapping.
+	 * XXX - since we don't modify z_mapcnt here, there is nothing
+	 * to stop a file lock being placed immediately after we complete
+	 * this check.
+	 */
+	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+		if (vn_has_flocks(vp) || zp->z_mapcnt == -1) {
+			ZFS_EXIT(zfsvfs);
+			return (EAGAIN);
+		}
+	}
+	/* choose or clear the target range under the as range lock */
+	as_rangelock(as);
+	if ((flags & MAP_FIXED) == 0) {
+		map_addr(addrp, len, off, 1, flags);
+		if (*addrp == NULL) {
+			as_rangeunlock(as);
+			ZFS_EXIT(zfsvfs);
+			return (ENOMEM);
+		}
+	} else {
+		/*
+		 * User specified address - blow away any previous mappings
+		 */
+		(void) as_unmap(as, *addrp, len);
+	}
+	/* describe the mapping for segvn_create(), invoked via as_map() */
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)off;
+	vn_a.type = flags & MAP_TYPE;
+	vn_a.prot = prot;
+	vn_a.maxprot = maxprot;
+	vn_a.cred = cr;
+	vn_a.amp = NULL;
+	vn_a.flags = flags & ~MAP_TYPE;
+
+	error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+
+	/* XXX - shouldn't we be checking for file locks here? */
+	ASSERT3U(zp->z_mapcnt, >=, 0);
+	atomic_add_32(&zp->z_mapcnt, btopr(len));
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
+{	/* mirror of zfs_addmap(): subtract btopr(len) from z_mapcnt */
+	atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len));
+	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0);
+	return (0);	/* unconditional success */
+}
+
+/*
+ * Free or allocate space in a file.  Currently, this function only
+ * supports the `F_FREESP' command.  However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ *	IN:	vp	- vnode of file to free data in.
+ *		cmd	- action to take (only F_FREESP supported).
+ *		bfp	- section of file to free/alloc.
+ *		flag	- current file open mode flags.
+ *		offset	- current file offset.
+ *		cr	- credentials of caller (passed to zfs_freesp()).
+ *
+ *	RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated
+ *
+ * NOTE: This function is limited in that it will only permit space to
+ *   be freed at the end of a file.  In essence, this function simply
+ *   allows one to set the file size.
+ */
+/* ARGSUSED */
+static int
+zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
+    offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+	dmu_tx_t	*tx;
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog = zfsvfs->z_log;
+	uint64_t	seq = 0;
+	uint64_t	off, len;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+
+top:	/* retried from here when dmu_tx_assign() returns ERESTART */
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	if (error = convoff(vp, bfp, 0, offset)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len;
+	tx = dmu_tx_create(zfsvfs->z_os);
+	/*
+	 * Grab the grow_lock to serialize this change with
+	 * respect to other file size changes.
+	 */
+	dmu_tx_hold_bonus(tx, zp->z_id);
+	rw_enter(&zp->z_grow_lock, RW_WRITER);
+	if (off + len > zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz &&
+	    off >= zp->z_phys->zp_size) {
+		/*
+		 * We are increasing the length of the file,
+		 * and this may mean a block size increase.
+		 */
+		dmu_tx_hold_write(tx, zp->z_id, 0,
+		    MIN(off + len, zfsvfs->z_max_blksz));
+	} else if (off < zp->z_phys->zp_size) {
+		/*
+		 * If len == 0, we are truncating the file.
+		 */
+		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
+	}
+
+	error = dmu_tx_assign(tx, zfsvfs->z_assign);
+	if (error) {
+		dmu_tx_abort(tx);
+		rw_exit(&zp->z_grow_lock);
+		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
+			txg_wait_open(dmu_objset_pool(zfsvfs->z_os), 0);
+			goto top;
+		}
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	/* change the size; stamp and log the change only if that worked */
+	error = zfs_freesp(zp, off, len, flag, tx, cr);
+
+	if (error == 0) {
+		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+	}
+
+	rw_exit(&zp->z_grow_lock);
+
+	dmu_tx_commit(tx);
+
+	zil_commit(zilog, seq, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Encode vp's object number and generation into *fidp (low byte first),
+ * adding the objset id when this filesystem is not its own z_parent.
+ * Sets fid_len and returns ENOSPC if the caller's fid is too small.
+ */
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		size, i;
+
+	ZFS_ENTER(zfsvfs);
+	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+	if (fidp->fid_len < size) {
+		fidp->fid_len = size;
+		ZFS_EXIT(zfsvfs);	/* balance ZFS_ENTER on error too */
+		return (ENOSPC);
+	}
+
+	zfid = (zfid_short_t *)fidp;
+	zfid->zf_len = size;
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	if (size == LONG_FID_LEN) {
+		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
+		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+		/* XXX - this should be the generation number for the objset */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			zlfid->zf_setgen[i] = 0;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Answer pathconf-style queries for vp; any command not handled here
+ * is passed on to fs_pathconf().
+ */
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
+{
+	znode_t		*zp, *xzp;
+	zfsvfs_t	*zfsvfs;
+	zfs_dirlock_t	*dl;
+	int		error = 0;
+
+	switch (cmd) {
+	case _PC_LINK_MAX:
+		*valp = ULONG_MAX;
+		break;
+	case _PC_FILESIZEBITS:
+		*valp = 64;
+		break;
+	case _PC_ACL_ENABLED:
+		*valp = _ACL_ACE_ENABLED;
+		break;
+	case _PC_MIN_HOLE_SIZE:
+		*valp = (ulong_t)SPA_MINBLOCKSIZE;
+		break;
+
+	case _PC_XATTR_EXISTS:
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		*valp = 0;
+		error = zfs_dirent_lock(&dl, zp, "", &xzp,
+		    ZXATTR | ZEXISTS | ZSHARED);
+		if (error == ENOENT) {
+			/* No attribute directory is the same as no xattrs. */
+			error = 0;
+		} else if (error == 0) {
+			zfs_dirent_unlock(dl);
+			if (!zfs_dirempty(xzp))
+				*valp = 1;
+			VN_RELE(ZTOV(xzp));
+		}
+		ZFS_EXIT(zfsvfs);
+		break;
+
+	default:
+		error = fs_pathconf(vp, cmd, valp, cr);
+		break;
+	}
+	return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t		*znode = VTOZ(vp);
+	zfsvfs_t	*fs = znode->z_zfsvfs;
+	int		rc;
+
+	/* zfs_getacl() does the work, bracketed by ZFS_ENTER/ZFS_EXIT. */
+	ZFS_ENTER(fs);
+	rc = zfs_getacl(znode, vsecp, cr);
+	ZFS_EXIT(fs);
+	return (rc);
+}
+
+/*ARGSUSED*/
+static int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	znode_t		*znode = VTOZ(vp);
+	zfsvfs_t	*fs = znode->z_zfsvfs;
+	int		rc;
+
+	ZFS_ENTER(fs);
+	rc = zfs_setacl(znode, vsecp, cr);	/* zfs_setacl() does the work */
+	ZFS_EXIT(fs);
+	return (rc);
+}
+
+/*
+ * Predeclare these stubs with empty parameter lists so that the
+ * compiler treats them as "old style" function declarations that
+ * do not include arguments => we won't get type mismatch errors
+ * in the operation template initializations that follow.
+ */
+static int zfs_inval();
+static int zfs_isdir();
+/* Always fails with EINVAL; installed for operations a template forbids. */
+static int
+zfs_inval()
+{
+	return (EINVAL);
+}
+/* Always fails with EISDIR; installed for read/write on directories. */
+static int
+zfs_isdir()
+{
+	return (EISDIR);
+}
+/*
+ * Directory vnode operations template (read/write fail with EISDIR)
+ */
+vnodeops_t *zfs_dvnodeops;
+const fs_operation_def_t zfs_dvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_READ, zfs_isdir,
+	VOPNAME_WRITE, zfs_isdir,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_MKDIR, zfs_mkdir,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_symlink,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	NULL, NULL
+};
+
+/*
+ * Vnode ops for regular files; also given to VBLK/VCHR/VFIFO/VSOCK/VDOOR.
+ */
+vnodeops_t *zfs_fvnodeops;
+const fs_operation_def_t zfs_fvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_READ, zfs_read,
+	VOPNAME_WRITE, zfs_write,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p)zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_FRLOCK, zfs_frlock,
+	VOPNAME_SPACE, zfs_space,
+	VOPNAME_GETPAGE, zfs_getpage,
+	VOPNAME_PUTPAGE, zfs_putpage,
+	VOPNAME_MAP, (fs_generic_func_p) zfs_map,
+	VOPNAME_ADDMAP, (fs_generic_func_p) zfs_addmap,
+	VOPNAME_DELMAP, zfs_delmap,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Symbolic link vnode operations template (no open/read/write entries)
+ */
+vnodeops_t *zfs_symvnodeops;
+const fs_operation_def_t zfs_symvnodeops_template[] = {
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_READLINK, zfs_readlink,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Extended attribute directory vnode operations template
+ *	This template follows the directory vnode operations template,
+ *	but VOP_MKDIR() and VOP_SYMLINK() fail with EINVAL (zfs_inval),
+ *	no VOP_READ()/VOP_WRITE() entries are supplied, and VOP_VNEVENT()
+ *	is set to fs_vnevent_support.
+ * Note that there are other restrictions embedded in:
+ *	zfs_create()	- restrict type to VREG
+ *	zfs_link()	- no links into/out of attribute space
+ *	zfs_rename()	- no moves into/out of attribute space
+ */
+vnodeops_t *zfs_xdvnodeops;
+const fs_operation_def_t zfs_xdvnodeops_template[] = {
+	VOPNAME_OPEN, zfs_open,
+	VOPNAME_CLOSE, zfs_close,
+	VOPNAME_IOCTL, zfs_ioctl,
+	VOPNAME_GETATTR, zfs_getattr,
+	VOPNAME_SETATTR, zfs_setattr,
+	VOPNAME_ACCESS, zfs_access,
+	VOPNAME_LOOKUP, zfs_lookup,
+	VOPNAME_CREATE, zfs_create,
+	VOPNAME_REMOVE, zfs_remove,
+	VOPNAME_LINK, zfs_link,
+	VOPNAME_RENAME, zfs_rename,
+	VOPNAME_MKDIR, zfs_inval,
+	VOPNAME_RMDIR, zfs_rmdir,
+	VOPNAME_READDIR, zfs_readdir,
+	VOPNAME_SYMLINK, zfs_inval,
+	VOPNAME_FSYNC, zfs_fsync,
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_FID, zfs_fid,
+	VOPNAME_SEEK, zfs_seek,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	VOPNAME_GETSECATTR, zfs_getsecattr,
+	VOPNAME_SETSECATTR, zfs_setsecattr,
+	VOPNAME_VNEVENT, fs_vnevent_support,
+	NULL, NULL
+};
+
+/*
+ * Error vnode ops: zfs_znode_alloc()'s fallback for other vnode types
+ */
+vnodeops_t *zfs_evnodeops;
+const fs_operation_def_t zfs_evnodeops_template[] = {
+	VOPNAME_INACTIVE, (fs_generic_func_p) zfs_inactive,
+	VOPNAME_PATHCONF, zfs_pathconf,
+	NULL, NULL
+};
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
new file mode 100644
index 000000000000..1ff11e29b88d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -0,0 +1,1286 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/atomic.h>
+#include <vm/pvn.h>
+#include "fs/fs_subr.h"
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/fs/zfs.h>
+
+struct kmem_cache *znode_cache = NULL;	/* created in zfs_znode_init() */
+
+/*
+ * Note that znodes can be in one of 2 states:
+ *	ZCACHE_mru	- recently used, currently cached
+ *	ZCACHE_mfu	- frequently used, currently cached
+ * When there are no active references to the znode, they
+ * are linked onto one of the lists in zcache.  These are the
+ * only znodes that can be evicted.
+ */
+
+typedef struct zcache_state {
+	list_t	list;	/* linked list of evictable znodes in state */
+	uint64_t lcnt;	/* total number of znodes in the linked list */
+	uint64_t cnt;	/* total number of all znodes in this state */
+	uint64_t hits;	/* accesses that found a znode in this state */
+	kmutex_t mtx;	/* held while list and lcnt are changed */
+} zcache_state_t;
+
+/* The 2 states: */
+static zcache_state_t ZCACHE_mru;
+static zcache_state_t ZCACHE_mfu;
+
+static struct zcache {
+	zcache_state_t	*mru;
+	zcache_state_t	*mfu;
+	uint64_t	p;		/* Target size of mru */
+	uint64_t	c;		/* Target size of cache */
+	uint64_t	c_max;		/* Maximum target cache size */
+
+	/* performance stats */
+	uint64_t	missed;		/* accesses to uncached znodes */
+	uint64_t	evicted;	/* znodes evicted from the lists */
+	uint64_t	skipped;	/* evictions skipped: mutex was busy */
+} zcache;
+
+void zcache_kmem_reclaim(void);
+
+#define	ZCACHE_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * Re-home zp in new_state: drop it from the count of its current state,
+ * if it has one, and add it to new_state's.  Caller must hold zp's mutex.
+ */
+static void
+zcache_change_state(zcache_state_t *new_state, znode_t *zp)
+{
+	zcache_state_t *old_state = zp->z_zcache_state;
+
+	ASSERT(zp->z_active);
+	if (old_state != NULL) {
+		ASSERT3U(old_state->cnt, >=, 1);
+		atomic_add_64(&old_state->cnt, -1);
+	}
+	atomic_add_64(&new_state->cnt, 1);
+	zp->z_zcache_state = new_state;
+}
+
+static void
+zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	/* Entered with zp->z_lock and hash_mtx held; both are dropped here. */
+	ASSERT(zp->z_phys);
+	ASSERT(zp->z_dbuf_held);
+	/* Release the dbuf hold and then the hold on the vfs. */
+	zp->z_dbuf_held = 0;
+	mutex_exit(&zp->z_lock);
+	dmu_buf_rele(zp->z_dbuf);
+	mutex_exit(hash_mtx);
+	VFS_RELE(zfsvfs->z_vfs);
+}
+
+/*
+ * Evict up to cnt znodes from state's list, starting at the tail.  Only
+ * znodes of zfsvfs are considered unless zfsvfs is NULL.  A znode whose
+ * object mutex cannot be taken without blocking is skipped.
+ */
+static void
+zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
+{
+	int znodes_evicted = 0;
+	znode_t *zp, *zp_prev;
+	kmutex_t *hash_mtx;
+
+	ASSERT(state == zcache.mru || state == zcache.mfu);
+	mutex_enter(&state->mtx);
+	for (zp = list_tail(&state->list); zp; zp = zp_prev) {
+		zp_prev = list_prev(&state->list, zp);
+		if (zfsvfs && zp->z_zfsvfs != zfsvfs)
+			continue;
+		hash_mtx = ZFS_OBJ_MUTEX(zp);
+		if (mutex_tryenter(hash_mtx)) {
+			mutex_enter(&zp->z_lock);
+			list_remove(&zp->z_zcache_state->list, zp);
+			zp->z_zcache_state->lcnt -= 1;
+			ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+			atomic_add_64(&zp->z_zcache_state->cnt, -1);
+			zp->z_zcache_state = NULL;
+			zp->z_zcache_access = 0;
+			/* drops z_lock and hash_mtx */
+			zfs_zcache_evict(zp, hash_mtx);
+			znodes_evicted += 1;
+			atomic_add_64(&zcache.evicted, 1);
+			if (znodes_evicted >= cnt)
+				break;
+		} else {
+			atomic_add_64(&zcache.skipped, 1);
+		}
+	}
+	mutex_exit(&state->mtx);
+
+	if (znodes_evicted < cnt)
+		dprintf("only evicted %lld znodes from %p",
+		    (longlong_t)znodes_evicted, (void *)state);
+}
+
+/* Trim the mru list to zcache.p and the two lists together to zcache.c. */
+static void
+zcache_adjust(void)
+{
+	uint64_t mru_len = zcache.mru->lcnt, mfu_len = zcache.mfu->lcnt;
+	uint64_t mru_target = zcache.p, total_target = zcache.c;
+	uint64_t total = mru_len + mfu_len;	/* as sampled on entry */
+
+	if (mru_len > mru_target)
+		zcache_evict_state(zcache.mru, mru_len - mru_target, NULL);
+
+	if (mfu_len == 0 || total <= total_target)
+		return;
+	zcache_evict_state(zcache.mfu,
+	    (int64_t)MIN(mfu_len, total - total_target), NULL);
+}
+
+/*
+ * Evict all idle (listed) znodes of zfsvfs, or of every fs if it is NULL.
+ */
+void
+zfs_zcache_flush(zfsvfs_t *zfsvfs)
+{
+	zcache_state_t *mru = zcache.mru, *mfu = zcache.mfu;
+	zcache_evict_state(mru, mru->lcnt, zfsvfs);
+	zcache_evict_state(mfu, mfu->lcnt, zfsvfs);
+}
+
+/*
+ * When the lists hold zcache.c or zcache.c - 1 znodes, raise the target
+ * by cnt (capped at c_max) and let zcache.p follow while it stays below.
+ */
+static void
+zcache_try_grow(int64_t cnt)
+{
+	int64_t size = zcache.mru->lcnt + zcache.mfu->lcnt;
+
+	if ((zcache.c - size) > 1)
+		return;
+	atomic_add_64(&zcache.c, cnt);
+	if (zcache.c > zcache.c_max)
+		zcache.c = zcache.c_max;
+	else if (zcache.p + cnt < zcache.c)
+		atomic_add_64(&zcache.p, cnt);
+}
+
+/*
+ * Called on each znode access with hash_mtx held; always drops hash_mtx.
+ */
+static void
+zcache_access(znode_t *zp, kmutex_t *hash_mtx)
+{
+	ASSERT(MUTEX_HELD(hash_mtx));
+
+	if (zp->z_zcache_state == NULL) {
+		/*
+		 * This znode is not in the cache.
+		 * Add the new znode to the MRU state.
+		 */
+
+		zcache_try_grow(1);
+
+		ASSERT(zp->z_zcache_access == 0);
+		zp->z_zcache_access = lbolt;
+		zcache_change_state(zcache.mru, zp);
+		mutex_exit(hash_mtx);
+
+		/*
+		 * If we are using less than 2/3 of our total target
+		 * cache size, bump up the target size for the MRU
+		 * list.
+		 */
+		if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
+			zcache.p = zcache.mru->lcnt + zcache.c/6;
+		}
+
+		zcache_adjust();
+
+		atomic_add_64(&zcache.missed, 1);
+	} else if (zp->z_zcache_state == zcache.mru) {
+		/*
+		 * This znode is in the MRU state.  Promote it to the
+		 * MFU state only if enough time has passed (see below).
+		 */
+		if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
+			/*
+			 * More than ZCACHE_MINTIME ticks have passed
+			 * since the access time recorded for this znode.
+			 * Move it to the most frequently used state.
+			 */
+			zp->z_zcache_access = lbolt;
+			zcache_change_state(zcache.mfu, zp);
+		}
+		atomic_add_64(&zcache.mru->hits, 1);
+		mutex_exit(hash_mtx);
+	} else {
+		ASSERT(zp->z_zcache_state == zcache.mfu);
+		/*
+		 * This znode is already in the MFU state.
+		 * Keep it there and just count the hit.
+		 */
+		atomic_add_64(&zcache.mfu->hits, 1);
+		mutex_exit(hash_mtx);
+	}
+}
+
+static void
+zcache_init(void)
+{
+	zcache.c = 20;			/* initial target cache size */
+	zcache.c_max = 50;		/* limit for zcache_try_grow() */
+	zcache.mru = &ZCACHE_mru;
+	zcache.mfu = &ZCACHE_mfu;
+	mutex_init(&zcache.mru->mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&zcache.mfu->mtx, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zcache.mru->list, sizeof (znode_t),
+	    offsetof(znode_t, z_zcache_node));
+	list_create(&zcache.mfu->list, sizeof (znode_t),
+	    offsetof(znode_t, z_zcache_node));
+}
+
+static void
+zcache_fini(void)
+{
+	/* Passing NULL evicts the idle znodes of every filesystem. */
+	zfs_zcache_flush(NULL);
+	list_destroy(&zcache.mfu->list);
+	list_destroy(&zcache.mru->list);
+}
+
+/*ARGSUSED*/
+static void
+znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
+{
+	znode_t *zp = user_ptr;
+
+	/* Free the znode only if its vnode has no holds left. */
+	if (ZTOV(zp)->v_count != 0)
+		return;
+	vn_invalid(ZTOV(zp));
+	zfs_znode_free(zp);
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	znode_t *zp = buf;
+	if ((zp->z_vnode = vn_alloc(kmflags)) == NULL)
+		return (-1);	/* honor kmflags; nothing to undo yet */
+	zp->z_vnode->v_data = (caddr_t)zp;
+	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+	zp->z_dbuf_held = 0;
+	zp->z_dirlocks = 0;
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *cdarg)
+{
+	znode_t *zp = buf;
+
+	/* A znode being destroyed must be completely idle. */
+	ASSERT(zp->z_dirlocks == 0);
+	ASSERT(zp->z_dbuf_held == 0);
+	ASSERT(ZTOV(zp)->v_count == 0);
+	mutex_destroy(&zp->z_acl_lock);
+	rw_destroy(&zp->z_append_lock);
+	rw_destroy(&zp->z_grow_lock);
+	rw_destroy(&zp->z_map_lock);
+	mutex_destroy(&zp->z_lock);
+	vn_free(ZTOV(zp));
+}
+
+/* Create the kmem cache that znodes come from and set up the zcache. */
+void
+zfs_znode_init(void)
+{
+	ASSERT(znode_cache == NULL);
+
+	znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t),
+	    0,				/* align */
+	    zfs_znode_cache_constructor, zfs_znode_cache_destructor,
+	    NULL, NULL, NULL,		/* reclaim, private, vmp */
+	    0);
+	zcache_init();
+}
+
+/*
+ * Tear down what zfs_znode_init() set up: flush and destroy the
+ * zcache lists, free the vfs and vnode operations tables, and
+ * destroy the znode kmem cache.
+ */
+void
+zfs_znode_fini(void)
+{
+	zcache_fini();
+
+	zfs_remove_op_tables();
+
+	/* Tolerate a znode cache that was never created. */
+	if (znode_cache != NULL)
+		kmem_cache_destroy(znode_cache);
+	znode_cache = NULL;
+}
+
+struct vnodeops *zfs_dvnodeops;		/* directories */
+struct vnodeops *zfs_fvnodeops;		/* regular files, devices, ... */
+struct vnodeops *zfs_symvnodeops;	/* symbolic links */
+struct vnodeops *zfs_xdvnodeops;	/* extended attribute directories */
+struct vnodeops *zfs_evnodeops;		/* any other vnode type */
+
+/*
+ * Release the zfs vfs operations and every vnode operations table that
+ * was built, leaving zfsfstype and the table pointers cleared.
+ */
+void
+zfs_remove_op_tables()
+{
+	struct vnodeops **tables[] = {
+		&zfs_dvnodeops,
+		&zfs_fvnodeops,
+		&zfs_symvnodeops,
+		&zfs_xdvnodeops,
+		&zfs_evnodeops
+	};
+	int i;
+
+	/* vfs ops */
+	ASSERT(zfsfstype);
+	(void) vfs_freevfsops_by_type(zfsfstype);
+	zfsfstype = 0;
+
+	/* vnode ops: free whichever tables exist, then forget them all */
+	for (i = 0; i < sizeof (tables) / sizeof (tables[0]); i++) {
+		if (*tables[i] != NULL)
+			vn_freevnodeops(*tables[i]);
+	}
+
+	for (i = 0; i < sizeof (tables) / sizeof (tables[0]); i++)
+		*tables[i] = NULL;
+}
+
+extern const fs_operation_def_t zfs_dvnodeops_template[];
+extern const fs_operation_def_t zfs_fvnodeops_template[];
+extern const fs_operation_def_t zfs_xdvnodeops_template[];
+extern const fs_operation_def_t zfs_symvnodeops_template[];
+extern const fs_operation_def_t zfs_evnodeops_template[];
+
+/*
+ * Build the five zfs vnode operations tables from their templates.
+ * Returns 0 on success, or the error from the first vn_make_ops() call
+ * that fails; tables built before the failure are left in place.
+ */
+int
+zfs_create_op_tables()
+{
+	const struct {
+		const fs_operation_def_t	*tmpl;
+		struct vnodeops			**opsp;
+	} tables[] = {
+		{ zfs_dvnodeops_template,	&zfs_dvnodeops },
+		{ zfs_fvnodeops_template,	&zfs_fvnodeops },
+		{ zfs_symvnodeops_template,	&zfs_symvnodeops },
+		{ zfs_xdvnodeops_template,	&zfs_xdvnodeops },
+		{ zfs_evnodeops_template,	&zfs_evnodeops }
+	};
+	int error = 0;
+	int i;
+
+	/*
+	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
+	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
+	 * In this case the ops vectors are already set up.
+	 */
+	if (zfs_dvnodeops)
+		return (0);
+
+	/* Stop at the first table that cannot be built. */
+	for (i = 0; error == 0 && i < sizeof (tables) / sizeof (tables[0]);
+	    i++) {
+		error = vn_make_ops(MNTTYPE_ZFS, tables[i].tmpl,
+		    tables[i].opsp);
+	}
+
+	return (error);
+}
+
+/*
+ * zfs_init_fs - Initialize the zfsvfs struct and the file system
+ *	incore "master" object.  Verify version compatibility.
+ */
+int
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+{
+	extern int zfsfstype;
+
+	objset_t	*os = zfsvfs->z_os;
+	uint64_t	zoid;
+	uint64_t	version = ZFS_VERSION;
+	int		i, error;
+	dmu_object_info_t doi;
+	dmu_objset_stats_t *stats;
+
+	*zpp = NULL;
+
+	/*
+	 * XXX - hack to auto-create the pool root filesystem at
+	 * the first attempted mount.
+	 */
+	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
+		dmu_tx_t *tx = dmu_tx_create(os);
+
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		ASSERT3U(error, ==, 0);
+		zfs_create_fs(os, cr, tx);
+		dmu_tx_commit(tx);
+	}
+
+	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
+		return (EINVAL);
+	} else if (version != ZFS_VERSION) {
+		(void) printf("Mismatched versions:  File system "
+		    "is version %lld on-disk format, which is "
+		    "incompatible with this software version %lld!",
+		    (u_longlong_t)version, ZFS_VERSION);
+		return (ENOTSUP);
+	}
+
+	/*
+	 * The fsid is 64 bits, composed of an 8-bit fs type, which
+	 * separates our fsid from any other filesystem types, and a
+	 * 56-bit objset unique ID.  The objset unique ID is unique to
+	 * all objsets open on this system, provided by unique_create().
+	 * The 8-bit fs type must be put in the low bits of fsid[1]
+	 * because that's where other Solaris filesystems put it.
+	 */
+	stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
+	dmu_objset_stats(os, stats);
+	ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
+	zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
+	zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
+	    zfsfstype & 0xFF;
+	kmem_free(stats, sizeof (dmu_objset_stats_t));
+	stats = NULL;
+	/* Look up the root directory's object number in the master node. */
+	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
+		return (EINVAL);
+	}
+	ASSERT(zoid != 0);
+	zfsvfs->z_root = zoid;
+
+	/*
+	 * Create the per mount vop tables.  NOTE(review): no code follows.
+	 */
+
+	/*
+	 * Initialize zget mutex's
+	 */
+	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+	/* Instantiate the root znode for the caller. */
+	error = zfs_zget(zfsvfs, zoid, zpp);
+	if (error)
+		return (error);
+	ASSERT3U((*zpp)->z_id, ==, zoid);
+	/* NOTE(review): a failure here returns with *zpp already set. */
+	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
+		return (EINVAL);
+	}
+
+	zfsvfs->z_dqueue = zoid;
+
+	/*
+	 * Initialize delete head structure
+	 * Thread(s) will be started/stopped via
+	 * readonly_changed_cb() depending
+	 * on whether this is rw/ro mount.
+	 */
+	list_create(&zfsvfs->z_delete_head.z_znodes,
+	    sizeof (znode_t), offsetof(znode_t, z_list_node));
+
+	return (0);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not call dmu_buf_set_user(); that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
+{
+	znode_t	*zp;
+	vnode_t *vp;
+
+	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+	ASSERT(zp->z_dirlocks == NULL);
+
+	zp->z_phys = db->db_data;
+	zp->z_zfsvfs = zfsvfs;
+	zp->z_active = 1;
+	zp->z_reap = 0;
+	zp->z_atime_dirty = 0;
+	zp->z_dbuf_held = 0;
+	zp->z_mapcnt = 0;
+	zp->z_last_itx = 0;
+	zp->z_dbuf = db;
+	zp->z_id = obj_num;
+	zp->z_blksz = blksz;
+	zp->z_seq = 0x7A4653;
+
+	bzero(&zp->z_zcache_node, sizeof (list_node_t));
+	/* Track the znode on the filesystem's list of all znodes. */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, zp);
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	vp = ZTOV(zp);
+	vn_reinit(vp);
+
+	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
+	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
+	/* The vnode type selects which operations table is installed. */
+	switch (vp->v_type) {
+	case VDIR:
+		if (zp->z_phys->zp_flags & ZFS_XATTR) {
+			vn_setops(vp, zfs_xdvnodeops);
+			vp->v_flag |= V_XATTRDIR;
+		} else
+			vn_setops(vp, zfs_dvnodeops);
+		break;
+	case VBLK:
+	case VCHR:
+		vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
+		/*FALLTHROUGH*/
+	case VFIFO:
+	case VSOCK:
+	case VDOOR:
+		vn_setops(vp, zfs_fvnodeops);
+		break;
+	case VREG:
+		vp->v_flag |= VMODSORT;
+		vn_setops(vp, zfs_fvnodeops);
+		break;
+	case VLNK:
+		vn_setops(vp, zfs_symvnodeops);
+		break;
+	default:
+		vn_setops(vp, zfs_evnodeops);
+		break;
+	}
+
+	return (zp);
+}
+
+static void
+zfs_znode_dmu_init(znode_t *zp)
+{
+	znode_t		*nzp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	dmu_buf_t	*db = zp->z_dbuf;
+	/* Register zp as the dbuf's user and mark the dbuf (and vfs) held. */
+	mutex_enter(&zp->z_lock);
+
+	nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);
+
+	/*
+	 * there should be no
+	 * concurrent zgets on this object.
+	 */
+	ASSERT3P(nzp, ==, NULL);
+
+	/*
+	 * Slap on VROOT if we are the root znode
+	 */
+	if (zp->z_id == zfsvfs->z_root) {
+		ZTOV(zp)->v_flag |= VROOT;
+	}
+	/* A new znode starts outside the zcache. */
+	zp->z_zcache_state = NULL;
+	zp->z_zcache_access = 0;
+
+	ASSERT(zp->z_dbuf_held == 0);
+	zp->z_dbuf_held = 1;
+	VFS_HOLD(zfsvfs->z_vfs);
+	mutex_exit(&zp->z_lock);
+	vn_exists(ZTOV(zp));
+}
+
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ *	IN:	dzp	- parent directory for new znode
+ *		vap	- file attributes for new znode
+ *		tx	- dmu transaction id for zap operations
+ *		cr	- credentials of caller
+ *		flag	- flags:
+ *			  IS_ROOT_NODE	- new object will be root
+ *			  IS_XATTR	- new object is an attribute
+ *			  IS_REPLAY	- intent log replay
+ *		bonuslen - bonus bytes wanted beyond the znode_phys_t
+ *	OUT:	oid	- ID of created object
+ *		zpp	- if non-NULL, set to the new znode
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
+	uint_t flag, znode_t **zpp, int bonuslen)
+{
+	dmu_buf_t	*dbp;
+	znode_phys_t	*pzp;
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	timestruc_t	now;
+	uint64_t	gen;
+	int		err;
+
+	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
+
+	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
+		*oid = vap->va_nodeid;
+		flag |= IS_REPLAY;
+		now = vap->va_ctime;		/* see zfs_replay_create() */
+		gen = vap->va_nblocks;		/* ditto */
+	} else {
+		*oid = 0;
+		gethrestime(&now);
+		gen = dmu_tx_get_txg(tx);
+	}
+
+	/*
+	 * Create a new DMU object.  During replay the object number
+	 * given in *oid is claimed instead of allocating a new one.
+	 */
+	if (vap->va_type == VDIR) {
+		if (flag & IS_REPLAY) {
+			err = zap_create_claim(zfsvfs->z_os, *oid,
+			    DMU_OT_DIRECTORY_CONTENTS,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			ASSERT3U(err, ==, 0);
+		} else {
+			*oid = zap_create(zfsvfs->z_os,
+			    DMU_OT_DIRECTORY_CONTENTS,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+		}
+	} else {
+		if (flag & IS_REPLAY) {
+			err = dmu_object_claim(zfsvfs->z_os, *oid,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			ASSERT3U(err, ==, 0);
+		} else {
+			*oid = dmu_object_alloc(zfsvfs->z_os,
+			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
+			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+		}
+	}
+	dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
+	dmu_buf_will_dirty(dbp, tx);
+
+	/*
+	 * Initialize the znode physical data to zero.
+	 */
+	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
+	bzero(dbp->db_data, dbp->db_size);
+	pzp = dbp->db_data;
+
+	/*
+	 * If this is the root, fix up the half-initialized parent pointer
+	 * to reference the just-allocated physical data area.
+	 */
+	if (flag & IS_ROOT_NODE) {
+		dzp->z_phys = pzp;
+		dzp->z_id = *oid;
+	}
+
+	/*
+	 * If parent is an xattr, so am I.
+	 */
+	if (dzp->z_phys->zp_flags & ZFS_XATTR)
+		flag |= IS_XATTR;
+
+	if (vap->va_type == VBLK || vap->va_type == VCHR) {
+		pzp->zp_rdev = vap->va_rdev;
+	}
+
+	if (vap->va_type == VDIR) {
+		pzp->zp_size = 2;		/* contents ("." and "..") */
+		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+	}
+
+	pzp->zp_parent = dzp->z_id;
+	if (flag & IS_XATTR)
+		pzp->zp_flags |= ZFS_XATTR;
+
+	pzp->zp_gen = gen;
+	/* crtime and ctime are always "now"; atime/mtime may come from vap. */
+	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
+	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+
+	if (vap->va_mask & AT_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+	} else {
+		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+	}
+
+	if (vap->va_mask & AT_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+	} else {
+		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+	}
+
+	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
+
+	zfs_perm_init(zp, dzp, flag, vap, tx, cr);
+	/* Hand the znode to the caller, or discard the in-core copy again. */
+	if (zpp) {
+		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
+
+		mutex_enter(hash_mtx);
+		zfs_znode_dmu_init(zp);
+		zcache_access(zp, hash_mtx);
+		*zpp = zp;
+	} else {
+		ZTOV(zp)->v_count = 0;
+		dmu_buf_rele(dbp);
+		zfs_znode_free(zp);
+	}
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t doi;
+	dmu_buf_t	*db;
+	znode_t		*zp;
+	/* Find or create the in-core znode for obj_num and return it in *zpp. */
+	*zpp = NULL;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
+	if (db == NULL) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (ENOENT);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
+		dmu_buf_rele(db);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EINVAL);
+	}
+	dmu_buf_read(db);
+
+	ASSERT(db->db_object == obj_num);
+	ASSERT(db->db_offset == -1);
+	ASSERT(db->db_data != NULL);
+
+	zp = dmu_buf_get_user(db);
+	/* A non-NULL user pointer means the znode already exists in core. */
+	if (zp != NULL) {
+		mutex_enter(&zp->z_lock);
+
+		ASSERT3U(zp->z_id, ==, obj_num);
+		if (zp->z_reap) {
+			dmu_buf_rele(db);
+			mutex_exit(&zp->z_lock);
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+			return (ENOENT);
+		} else if (zp->z_dbuf_held) {
+			dmu_buf_rele(db);
+		} else {
+			zp->z_dbuf_held = 1;
+			VFS_HOLD(zfsvfs->z_vfs);
+		}
+		/* Reactivating: take the znode off its evictable list. */
+		if (zp->z_active == 0) {
+			zp->z_active = 1;
+			if (list_link_active(&zp->z_zcache_node)) {
+				mutex_enter(&zp->z_zcache_state->mtx);
+				list_remove(&zp->z_zcache_state->list, zp);
+				zp->z_zcache_state->lcnt -= 1;
+				mutex_exit(&zp->z_zcache_state->mtx);
+			}
+		}
+		VN_HOLD(ZTOV(zp));
+		mutex_exit(&zp->z_lock);
+		zcache_access(zp, ZFS_OBJ_MUTEX(zp));	/* exits that mutex */
+		*zpp = zp;
+		return (0);
+	}
+
+	/*
+	 * Not found create new znode/vnode
+	 */
+	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
+	ASSERT3U(zp->z_id, ==, obj_num);
+	zfs_znode_dmu_init(zp);
+	zcache_access(zp, ZFS_OBJ_MUTEX(zp));	/* exits that mutex */
+	*zpp = zp;
+	return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	/* Free the external ACL object, if any, then the object itself. */
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
+	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
+		error = dmu_object_free(zfsvfs->z_os,
+		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
+		ASSERT3U(error, ==, 0);
+	}
+	if (zp->z_zcache_state) {
+		ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+		atomic_add_64(&zp->z_zcache_state->cnt, -1);
+	}
+	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
+	ASSERT3U(error, ==, 0);
+	zp->z_dbuf_held = 0;
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
+	dmu_buf_rele(zp->z_dbuf);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+	vnode_t	*vp = ZTOV(zp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t z_id = zp->z_id;
+
+	ASSERT(zp->z_dbuf_held && zp->z_phys);
+
+	/*
+	 * Don't allow a zfs_zget() while we're trying to release this znode
+	 */
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+	mutex_enter(&zp->z_lock);
+	mutex_enter(&vp->v_lock);
+	vp->v_count--;
+	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
+		/*
+		 * If the hold count is greater than zero, somebody has
+		 * obtained a new reference on this znode while we were
+		 * processing it here, so we are done.  If we still have
+		 * mapped pages then we are also done, since we don't
+		 * want to inactivate the znode until the pages get pushed.
+		 *
+		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
+		 * this seems like it would leave the znode hanging with
+		 * no chance to go inactive...
+		 */
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&zp->z_lock);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+		return;
+	}
+	mutex_exit(&vp->v_lock);
+	zp->z_active = 0;
+
+	/*
+	 * If this was the last reference to a file with no links,
+	 * remove the file from the file system.
+	 */
+	if (zp->z_reap) {
+		mutex_exit(&zp->z_lock);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+		ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
+		atomic_add_64(&zp->z_zcache_state->cnt, -1);
+		zp->z_zcache_state = NULL;
+		/* XATTR files are not put on the delete queue */
+		if (zp->z_phys->zp_flags & ZFS_XATTR) {
+			zfs_rmnode(zp);
+		} else {
+			mutex_enter(&zfsvfs->z_delete_head.z_mutex);
+			list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
+			zfsvfs->z_delete_head.z_znode_count++;
+			cv_broadcast(&zfsvfs->z_delete_head.z_cv);
+			mutex_exit(&zfsvfs->z_delete_head.z_mutex);
+		}
+		VFS_RELE(zfsvfs->z_vfs);
+		return;
+	}
+
+	/*
+	 * If the file system is no longer mounted, evict the znode now rather
+	 * than caching it; the evict drops z_lock and the mutex passed to it.
+	 */
+	if (zfsvfs->z_unmounted1) {
+		zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
+		return;
+	}
+
+	/* put znode on evictable list */
+	mutex_enter(&zp->z_zcache_state->mtx);
+	list_insert_head(&zp->z_zcache_state->list, zp);
+	zp->z_zcache_state->lcnt += 1;
+	mutex_exit(&zp->z_zcache_state->mtx);
+	mutex_exit(&zp->z_lock);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+	zfsvfs_t *zfv = zp->z_zfsvfs;
+	/* Unlink zp from the per-filesystem list of all znodes ... */
+	mutex_enter(&zfv->z_znodes_lock);
+	list_remove(&zfv->z_all_znodes, zp);
+	mutex_exit(&zfv->z_znodes_lock);
+	/* ... then hand its memory back to the znode kmem cache. */
+	kmem_cache_free(znode_cache, zp);
+}
+
+void
+zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+	timestruc_t	ts;
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	gethrestime(&ts);
+
+	if (tx == NULL) {
+		/* No transaction: just remember that the times are stale. */
+		zp->z_atime_dirty = 1;
+	} else {
+		/* In a transaction: mark z_dbuf dirty in that tx. */
+		dmu_buf_will_dirty(zp->z_dbuf, tx);
+		zp->z_atime_dirty = 0;
+		zp->z_seq++;
+	}
+
+	/* Stamp each requested field with the same sampled time. */
+	if ((flag & AT_ATIME) != 0)
+		ZFS_TIME_ENCODE(&ts, zp->z_phys->zp_atime);
+	if ((flag & AT_MTIME) != 0)
+		ZFS_TIME_ENCODE(&ts, zp->z_phys->zp_mtime);
+	if ((flag & AT_CTIME) != 0)
+		ZFS_TIME_ENCODE(&ts, zp->z_phys->zp_ctime);
+}
+
+/*
+ * Update the requested znode timestamps with the current time, taking
+ * zp->z_lock around the call to zfs_time_stamper_locked().
+ * If we are in a transaction (tx != NULL), go ahead and mark the znode
+ * dirty in the transaction so the timestamps will go to disk.
+ * Otherwise, we will get pushed next time the znode is updated
+ * in a transaction, or when this znode eventually goes inactive.
+ * Why is this OK?
+ *  1 - Only the ACCESS time is ever updated outside of a transaction.
+ *  2 - Multiple consecutive updates will be collapsed into a single
+ *	znode update by the transaction grouping semantics of the DMU.
+ */
+void
+zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+{
+	mutex_enter(&zp->z_lock);
+	zfs_time_stamper_locked(zp, flag, tx);
+	mutex_exit(&zp->z_lock);
+}
+
+/*
+ * Grow the block size for a file.  This may involve migrating data
+ * from the bonus buffer into a data block (when we grow beyond the
+ * bonus buffer data area).
+ *
+ *	IN:	zp	- znode of file whose block size is to be grown.
+ *		size	- requested block size
+ *		tx	- open transaction.
+ *
+ * 	RETURN:	always 0 as written; a DMU failure other than ENOTSUP
+ *		trips an ASSERT instead of being returned.
+ *
+ * NOTE: the caller must hold zp->z_grow_lock as writer (asserted below).
+ */
+int
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+	int		error;
+	u_longlong_t	dummy;	/* second output, unused here */
+
+	ASSERT(rw_write_held(&zp->z_grow_lock));
+
+	if (size <= zp->z_blksz)	/* this routine never shrinks */
+		return (0);
+	/*
+	 * If the file size is already greater than the current blocksize,
+	 * we will not grow.  If there is more than one block in a file,
+	 * the blocksize cannot change.
+	 */
+	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+		return (0);
+
+	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+	    size, 0, tx);
+	if (error == ENOTSUP)	/* not treated as a failure */
+		return (0);
+	ASSERT3U(error, ==, 0);
+
+	/* What blocksize did we actually get?  Refresh the cached z_blksz. */
+	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+
+	return (0);
+}
+
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage().  E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+    int flags, cred_t *cr)
+{
+	ASSERT(0);	/* reaching this callback is a bug */
+	return (0);
+}
+
+/*
+ * Free space in a file.  Currently, this function only
+ * supports freeing space at the end of the file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		from	- start of section to free.
+ *		len	- length of section to free (0 => to EOF).
+ *		flag	- current file open mode flags.
+ *		tx	- open transaction.
+ *		cr	- credentials passed to pvn_vplist_dirty().
+ * 	RETURN:	0 if success
+ *		error code if failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
+	cred_t *cr)
+{
+	vnode_t *vp = ZTOV(zp);
+	uint64_t size = zp->z_phys->zp_size;
+	uint64_t end = from + len;	/* equals from when len is 0 */
+	int have_grow_lock, error;
+
+	have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (len == 0 && size == from) {
+		return (0);
+	}
+
+	/*
+	 * Check for any locks in the region to be freed.
+	 */
+	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
+		uint64_t	start;
+
+		if (size > from)
+			start = from;
+		else
+			start = size;
+		if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
+			return (error);
+	}
+
+	if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+		uint64_t new_blksz;
+		/*
+		 * We are growing the file past the current block size.
+		 */
+		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+			ASSERT(!ISP2(zp->z_blksz));
+			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
+		} else {
+			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+		}
+		error = zfs_grow_blocksize(zp, new_blksz, tx);
+		ASSERT(error == 0);
+	}
+	if (end > size || len == 0)	/* extend, or truncate to from */
+		zp->z_phys->zp_size = end;
+	if (from > size)	/* start is past old EOF: nothing to free */
+		return (0);
+
+	if (have_grow_lock)
+		rw_downgrade(&zp->z_grow_lock);
+	/*
+	 * Clear any mapped pages in the truncated region.
+	 */
+	rw_enter(&zp->z_map_lock, RW_WRITER);
+	if (vn_has_cached_data(vp)) {
+		page_t *pp;
+		uint64_t start = from & PAGEMASK;
+		int off = from & PAGEOFFSET;
+
+		if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
+			/*
+			 * We need to zero a partial page.
+			 */
+			pagezero(pp, off, PAGESIZE - off);
+			start += PAGESIZE;
+			page_unlock(pp);
+		}
+		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
+		    B_INVAL | B_TRUNC, cr);
+		ASSERT(error == 0);
+	}
+	rw_exit(&zp->z_map_lock);
+
+	if (!have_grow_lock)
+		rw_enter(&zp->z_grow_lock, RW_READER);
+
+	if (len == 0)
+		len = -1;	/* all-ones length: free through EOF */
+	else if (end > size)
+		len = size - from;	/* clamp to the old file size */
+	dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);
+
+	if (!have_grow_lock)
+		rw_exit(&zp->z_grow_lock);
+
+	return (0);
+}
+
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
+{
+	zfsvfs_t	zfsvfs;	/* scratch, stack-only zfsvfs */
+	uint64_t	moid, doid, roid = 0;
+	uint64_t	version = ZFS_VERSION;
+	int		error;
+	znode_t		*rootzp = NULL;
+	vnode_t		*vp;
+	vattr_t		vattr;
+
+	/*
+	 * First, create the master node ZAP at its fixed object number.
+	 */
+	moid = MASTER_NODE_OBJ;
+	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Set starting attributes: record the ZFS version in the master node.
+	 */
+
+	error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Create a delete queue and record its object number in the master.
+	 */
+	doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);
+
+	error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
+	ASSERT(error == 0);
+
+	/*
+	 * Create root znode.  Create minimal znode/vnode/zfsvfs
+	 * to allow zfs_mknode to work.
+	 */
+	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+	vattr.va_type = VDIR;
+	vattr.va_mode = S_IFDIR|0755;
+	vattr.va_uid = 0;
+	vattr.va_gid = 3;	/* NOTE(review): presumably gid 'sys' */
+
+	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	rootzp->z_zfsvfs = &zfsvfs;
+	rootzp->z_active = 1;
+	rootzp->z_reap = 0;
+	rootzp->z_atime_dirty = 0;
+	rootzp->z_dbuf_held = 0;
+
+	vp = ZTOV(rootzp);
+	vn_reinit(vp);
+	vp->v_type = VDIR;
+
+	bzero(&zfsvfs, sizeof (zfsvfs_t));
+
+	zfsvfs.z_os = os;
+	zfsvfs.z_assign = TXG_NOWAIT;
+	zfsvfs.z_parent = &zfsvfs;
+
+	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+	    offsetof(znode_t, z_link_node));
+
+	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
+	ASSERT3U(rootzp->z_id, ==, roid);
+	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
+	ASSERT(error == 0);
+
+	ZTOV(rootzp)->v_count = 0;
+	kmem_cache_free(znode_cache, rootzp);	/* temporary root znode */
+}
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
new file mode 100644
index 000000000000..1adc8ca3dfe2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev.h>
+
+
+/*
+ * The zfs intent log (ZIL) saves transaction records of system calls
+ * that change the file system in memory with enough information
+ * to be able to replay them. These are stored in memory until
+ * either the DMU transaction group (txg) commits them to the stable pool
+ * and they can be discarded, or they are flushed to the stable log
+ * (also in the pool) due to a fsync, O_DSYNC or other synchronous
+ * requirement. In the event of a panic or power failure, those log
+ * records (transactions) are replayed.
+ *
+ * There is one ZIL per file system. Its on-disk (pool) format consists
+ * of 3 parts:
+ *
+ * 	- ZIL header
+ * 	- ZIL blocks
+ * 	- ZIL records
+ *
+ * A log record holds a system call transaction. Log blocks can
+ * hold many log records and the blocks are chained together.
+ * Each ZIL block contains a block pointer (blkptr_t) to the next
+ * ZIL block in the chain. The ZIL header points to the first
+ * block in the chain. Note there is not a fixed place in the pool
+ * to hold blocks. They are dynamically allocated and freed as
+ * needed from the blocks available. Figure X shows the ZIL structure:
+ */
+
+/*
+ * These global ZIL switches affect all pools.
+ */
+int zil_disable = 0;	/* disable intent logging */
+int zil_always = 0;	/* make every transaction synchronous */
+int zil_purge = 0;	/* at pool open, just throw everything away */
+int zil_noflush = 0;	/* don't flush write cache buffers on disks */
+
+static kmem_cache_t *zil_lwb_cache;	/* created by zil_init() */
+
+/* Three-way (-1, 0, 1) ordering of two DVAs for the ZIL DVA AVL tree. */
+static int
+zil_dva_compare(const void *x1, const void *x2)
+{
+	const dva_t *a = x1;
+	const dva_t *b = x2;
+
+	/* Primary key: the vdev id. */
+	if (DVA_GET_VDEV(a) != DVA_GET_VDEV(b))
+		return (DVA_GET_VDEV(a) < DVA_GET_VDEV(b) ? -1 : 1);
+
+	/* Secondary key: the offset. */
+	if (DVA_GET_OFFSET(a) != DVA_GET_OFFSET(b))
+		return (DVA_GET_OFFSET(a) < DVA_GET_OFFSET(b) ? -1 : 1);
+
+	/* Same vdev and same offset. */
+	return (0);
+}
+
+static void
+zil_dva_tree_init(avl_tree_t *t)
+{
+	avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
+	    offsetof(zil_dva_node_t, zn_node));	/* link field in node */
+}
+
+static void
+zil_dva_tree_fini(avl_tree_t *t)
+{
+	void *cookie = NULL;
+	zil_dva_node_t *node;
+	/* Pull the nodes out one at a time, free each, then the tree. */
+	for (node = avl_destroy_nodes(t, &cookie); node != NULL;
+	    node = avl_destroy_nodes(t, &cookie))
+		kmem_free(node, sizeof (zil_dva_node_t));
+	avl_destroy(t);
+}
+
+static int
+zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+{
+	avl_index_t slot;
+	zil_dva_node_t *node;
+
+	/* A DVA already in the tree is reported with EEXIST, not re-added. */
+	if (avl_find(t, dva, &slot) == NULL) {
+		node = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+		node->zn_dva = *dva;
+		avl_insert(t, node, slot);
+		return (0);
+	}
+	return (EEXIST);
+}
+
+/*
+ * Read a log block into buf, byteswap it if necessary, and validate it.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
+{
+	uint64_t blksz = BP_GET_LSIZE(bp);
+	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
+	zio_cksum_t cksum;	/* expected verifier of the next block */
+	int error;
+
+	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
+	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+	if (error) {
+		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
+		    zilog, bp, error);
+		return (error);
+	}
+
+	if (BP_SHOULD_BYTESWAP(bp))
+		byteswap_uint64_array(buf, blksz);
+
+	/*
+	 * Sequence numbers should be... sequential.  The checksum verifier for
+	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
+	 */
+	cksum = bp->blk_cksum;
+	cksum.zc_word[3]++;	/* word 3 is the sequence number */
+	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
+		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
+		return (ESTALE);
+	}
+
+	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
+		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
+		return (ENOENT);
+	}
+
+	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
+		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
+		return (EOVERFLOW);
+	}
+
+	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
+
+	return (0);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ */
+void
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
+{
+	blkptr_t blk;
+	char *lrbuf, *lrp;
+	zil_trailer_t *ztp;
+	int reclen, error;
+
+	blk = zilog->zl_header->zh_log;
+	if (BP_IS_HOLE(&blk))
+		return;
+
+	/*
+	 * Starting at zh_log, read the log chain, strongly checking each
+	 * block and stopping at the first invalid one.  parse_blk_func() is
+	 * called for every block pointer in the chain (even the one that
+	 * fails the check); parse_lr_func() for each record in a valid block.
+	 * NOTE(review): lrc_reclen is only ASSERTed, not validated -- confirm.
+	 */
+	zil_dva_tree_init(&zilog->zl_dva_tree);
+	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+	for (;;) {
+		error = zil_read_log_block(zilog, &blk, lrbuf);
+
+		if (parse_blk_func != NULL)
+			parse_blk_func(zilog, &blk, arg, txg);
+
+		if (error)
+			break;
+
+		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+		blk = ztp->zit_next_blk;	/* advance to the next block */
+
+		if (parse_lr_func == NULL)
+			continue;
+
+		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+			lr_t *lr = (lr_t *)lrp;
+			reclen = lr->lrc_reclen;
+			ASSERT3U(reclen, >=, sizeof (lr_t));
+			parse_lr_func(zilog, lr, arg, txg);
+		}
+	}
+	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+	zil_dva_tree_fini(&zilog->zl_dva_tree);
+}
+
+/* ARGSUSED */
+static void
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+	spa_t *spa = zilog->zl_spa;
+	int rc;
+
+	dprintf_bp(bp, "first_txg %llu: ", first_txg);
+
+	/* Born before first_txg: already committed, nothing to claim. */
+	if (bp->blk_birth < first_txg)
+		return;
+	/* Already in the DVA tree: this block was claimed earlier. */
+	if (zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) != 0)
+		return;
+	rc = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL));
+	ASSERT(rc == 0);
+}
+
+static void
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+	lr_write_t *lrw = (lr_write_t *)lrc;	/* only used for TX_WRITE */
+
+	if (lrc->lrc_txtype == TX_WRITE)
+		zil_claim_log_block(zilog, &lrw->lr_blkptr, tx, first_txg);
+}
+
+/* ARGSUSED */
+static void
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
+{	/* frees bp in the txg of tx; claim_txg is not used here */
+	zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
+}
+
+static void
+zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
+{
+	blkptr_t *wbp;
+
+	/* Nothing was ever claimed, or this is not a write record. */
+	if (claim_txg == 0 || lrc->lrc_txtype != TX_WRITE)
+		return;
+	/* If we previously claimed the block, free it (once per DVA). */
+	wbp = &((lr_write_t *)lrc)->lr_blkptr;
+	if (wbp->blk_birth < claim_txg ||
+	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(wbp)) != 0)
+		return;
+	(void) arc_free(NULL, zilog->zl_spa, dmu_tx_get_txg(tx), wbp,
+	    NULL, NULL, ARC_WAIT);
+}
+
+/*
+ * Create an on-disk intent log: allocate its first block and its lwb.
+ */
+static void
+zil_create(zilog_t *zilog)
+{
+	lwb_t *lwb;
+	uint64_t txg;
+	dmu_tx_t *tx;
+	blkptr_t blk;
+	int error;
+
+	ASSERT(zilog->zl_header->zh_claim_txg == 0);
+	ASSERT(zilog->zl_header->zh_replay_seq == 0);
+
+	/*
+	 * Initialize the log header block: open a tx and dirty the dataset.
+	 */
+	tx = dmu_tx_create(zilog->zl_os);
+	(void) dmu_tx_assign(tx, TXG_WAIT);
+	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+	txg = dmu_tx_get_txg(tx);
+
+	/*
+	 * Allocate the first log block and assign its checksum verifier.
+	 */
+	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
+	    ZIL_MIN_BLKSZ, &blk, txg);
+	if (error == 0) {	/* on failure no lwb is queued */
+		ZIO_SET_CHECKSUM(&blk.blk_cksum,
+		    spa_get_random(-1ULL), spa_get_random(-1ULL),
+		    dmu_objset_id(zilog->zl_os), 1ULL);
+
+		/*
+		 * Allocate a log write buffer (lwb) for the first log block.
+		 */
+		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+		lwb->lwb_zilog = zilog;
+		lwb->lwb_blk = blk;
+		lwb->lwb_nused = 0;
+		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
+		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
+		lwb->lwb_max_txg = txg;
+		lwb->lwb_seq = 0;	/* no records in it yet */
+		lwb->lwb_state = UNWRITTEN;
+		mutex_enter(&zilog->zl_lock);
+		list_insert_tail(&zilog->zl_lwb_list, lwb);
+		mutex_exit(&zilog->zl_lock);
+	}
+
+	dmu_tx_commit(tx);
+	txg_wait_synced(zilog->zl_dmu_pool, txg);
+}
+
+/*
+ * In a single tx, free all log blocks; zl_destroy_txg records the txg in
+ * which zil_sync() then clears the log header.
+ */
+void
+zil_destroy(zilog_t *zilog)
+{
+	dmu_tx_t *dtx;
+	uint64_t destroy_txg;
+
+	mutex_enter(&zilog->zl_destroy_lock);
+
+	if (!BP_IS_HOLE(&zilog->zl_header->zh_log)) {
+		/* There is a log on disk: walk the chain and free it. */
+		dtx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(dtx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), dtx);
+		destroy_txg = dmu_tx_get_txg(dtx);
+
+		zil_parse(zilog, zil_free_log_block, zil_free_log_record, dtx,
+		    zilog->zl_header->zh_claim_txg);
+		zilog->zl_destroy_txg = destroy_txg;
+
+		/* Commit, then wait for that txg to sync. */
+		dmu_tx_commit(dtx);
+		txg_wait_synced(zilog->zl_dmu_pool, destroy_txg);
+	}
+
+	mutex_exit(&zilog->zl_destroy_lock);
+}
+
+void
+zil_claim(char *osname, void *txarg)
+{
+	dmu_tx_t *tx = txarg;
+	uint64_t first_txg = dmu_tx_get_txg(tx);
+	objset_t *os;
+	zilog_t *zilog;
+	zil_header_t *hdr;
+
+	/* Without the objset there is no log to look at: warn and give up. */
+	if (dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_STANDARD, &os) != 0) {
+		cmn_err(CE_WARN, "can't process intent log for %s", osname);
+		return;
+	}
+
+	zilog = dmu_objset_zil(os);
+	hdr = zilog->zl_header;
+	ASSERT3U(hdr->zh_claim_txg, <=, first_txg);
+
+	/*
+	 * A zero zh_claim_txg on a non-empty log means the blocks have not
+	 * been claimed yet: record the claim txg, then claim every block.
+	 */
+	if (hdr->zh_claim_txg == 0 && !BP_IS_HOLE(&hdr->zh_log)) {
+		hdr->zh_claim_txg = first_txg;
+		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
+		    tx, first_txg);
+		dsl_dataset_dirty(dmu_objset_ds(os), tx);
+	}
+	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+	dmu_objset_close(os);
+}
+
+void
+zil_add_vdev(zilog_t *zilog, uint64_t vdev, uint64_t seq)
+{
+	zil_vdev_t *entry;
+
+	if (zil_noflush != 0)	/* flushing disabled: nothing to record */
+		return;
+	ASSERT(MUTEX_HELD(&zilog->zl_lock));
+	/* Queue a (vdev, seq) entry for zil_flush_vdevs() to pick up. */
+	entry = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+	entry->seq = seq;
+	entry->vdev = vdev;
+	list_insert_tail(&zilog->zl_vdev_list, entry);
+}
+
+
+void
+zil_flush_vdevs(zilog_t *zilog, uint64_t seq)
+{
+	vdev_t *vd;
+	zil_vdev_t *zv, *zv2;
+	zio_t *zio;	/* root zio, created on first flush */
+	spa_t *spa;
+	uint64_t vdev;
+
+	if (zil_noflush)
+		return;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+	spa = zilog->zl_spa;
+	zio = NULL;
+
+	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL &&
+	    zv->seq <= seq) {
+		vdev = zv->vdev;
+		list_remove(&zilog->zl_vdev_list, zv);
+		kmem_free(zv, sizeof (zil_vdev_t));
+
+		/*
+		 * remove all chained entries <= seq with same vdev
+		 */
+		zv = list_head(&zilog->zl_vdev_list);
+		while (zv && zv->seq <= seq) {
+			zv2 = list_next(&zilog->zl_vdev_list, zv);
+			if (zv->vdev == vdev) {
+				list_remove(&zilog->zl_vdev_list, zv);
+				kmem_free(zv, sizeof (zil_vdev_t));
+			}
+			zv = zv2;
+		}
+
+		/* flush the write cache for this vdev (zl_lock is dropped) */
+		mutex_exit(&zilog->zl_lock);
+		if (zio == NULL)
+			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+		vd = vdev_lookup_top(spa, vdev);
+		ASSERT(vd);
+		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
+		    NULL, NULL, ZIO_PRIORITY_NOW,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
+		mutex_enter(&zilog->zl_lock);
+	}
+
+	/*
+	 * Wait for all the flushes to complete.  Not all devices actually
+	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
+	 */
+	if (zio != NULL)
+		(void) zio_wait(zio);
+}
+
+/*
+ * Function called when a log block write completes (zio done callback).
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+	lwb_t *prev;
+	lwb_t *lwb = zio->io_private;
+	zilog_t *zilog = lwb->lwb_zilog;
+	uint64_t max_seq;
+
+	/*
+	 * Now that we've written this log block, we have a stable pointer
+	 * to the next block in the chain, so it's OK to let the txg in
+	 * which we allocated the next block sync.
+	 */
+	txg_rele_to_sync(&lwb->lwb_txgh);
+
+	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+	mutex_enter(&zilog->zl_lock);
+	lwb->lwb_buf = NULL;	/* cleared on success and on error */
+	if (zio->io_error) {
+		zilog->zl_log_error = B_TRUE;
+		mutex_exit(&zilog->zl_lock);
+		cv_broadcast(&zilog->zl_cv_seq);
+		return;
+	}
+
+	prev = list_prev(&zilog->zl_lwb_list, lwb);
+	if (prev && prev->lwb_state != SEQ_COMPLETE) {
+		/* There's an unwritten buffer in the chain before this one */
+		lwb->lwb_state = SEQ_INCOMPLETE;
+		mutex_exit(&zilog->zl_lock);
+		return;
+	}
+
+	max_seq = lwb->lwb_seq;
+	lwb->lwb_state = SEQ_COMPLETE;
+	/*
+	 * We must also follow up the chain for already written buffers
+	 * to see if we can set zl_ss_seq even higher.
+	 */
+	while (lwb = list_next(&zilog->zl_lwb_list, lwb)) {
+		if (lwb->lwb_state != SEQ_INCOMPLETE)
+			break;
+		lwb->lwb_state = SEQ_COMPLETE;
+		/* lwb_seq will be zero if we've written an empty buffer */
+		if (lwb->lwb_seq) {
+			ASSERT3U(max_seq, <, lwb->lwb_seq);
+			max_seq = lwb->lwb_seq;
+		}
+	}
+	zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
+	mutex_exit(&zilog->zl_lock);
+	cv_broadcast(&zilog->zl_cv_seq);	/* wake zl_cv_seq waiters */
+}
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.  Returns the new lwb, or NULL if allocation fails.
+ */
+static lwb_t *
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+{
+	lwb_t *nlwb;
+	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+	uint64_t txg;
+	uint64_t zil_blksz;
+	int error;
+
+	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+
+	/*
+	 * Allocate the next block and save its address in this block
+	 * before writing it in order to establish the log chain.
+	 * Note that if the allocation of nlwb synced before we wrote
+	 * the block that points at it (lwb), we'd leak it if we crashed.
+	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+	 */
+	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
+	txg_rele_to_quiesce(&lwb->lwb_txgh);
+
+	/*
+	 * Pick a ZIL blocksize based upon the size of the outstanding
+	 * in-memory transactions, or if none the same size as the
+	 * last block.
+	 */
+	if (zilog->zl_itx_list_sz) {
+		zil_blksz = zilog->zl_itx_list_sz + sizeof (*ztp);
+		zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
+		if (zil_blksz > ZIL_MAX_BLKSZ)
+			zil_blksz = ZIL_MAX_BLKSZ;
+		zilog->zl_prev_blk_sz = zil_blksz;
+	} else {
+		zil_blksz = zilog->zl_prev_blk_sz;
+	}
+
+	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
+	    zil_blksz, &ztp->zit_next_blk, txg);
+	if (error) {
+		txg_rele_to_sync(&lwb->lwb_txgh);
+		return (NULL);	/* lwb itself is not written */
+	}
+
+	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+	ztp->zit_nused = lwb->lwb_nused;
+	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
+	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
+	ztp->zit_next_blk.blk_cksum.zc_word[3]++;	/* next verifier */
+
+	/*
+	 * Allocate a new log write buffer (lwb).
+	 */
+	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+
+	nlwb->lwb_zilog = zilog;
+	nlwb->lwb_blk = ztp->zit_next_blk;
+	nlwb->lwb_nused = 0;
+	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
+	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
+	nlwb->lwb_max_txg = txg;
+	nlwb->lwb_seq = 0;
+	nlwb->lwb_state = UNWRITTEN;
+
+	/*
+	 * Put new lwb at the end of the log chain,
+	 * and record the vdev for later flushing
+	 */
+	mutex_enter(&zilog->zl_lock);
+	list_insert_tail(&zilog->zl_lwb_list, nlwb);
+	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))),
+	    lwb->lwb_seq);
+	mutex_exit(&zilog->zl_lock);
+
+	/*
+	 * write the old log block; zil_lwb_write_done() runs on completion
+	 */
+	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
+	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
+	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));
+
+	return (nlwb);
+}
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+	lr_t *lrc = &itx->itx_lr; /* common log record */
+	uint64_t seq = lrc->lrc_seq;
+	uint64_t txg = lrc->lrc_txg;
+	uint64_t reclen = lrc->lrc_reclen;
+	int error;
+
+	if (lwb == NULL)	/* no log block to fill */
+		return (NULL);
+	ASSERT(lwb->lwb_buf != NULL);
+
+	/*
+	 * If it's a write, fetch the data or get its blkptr as appropriate.
+	 */
+	if (lrc->lrc_txtype == TX_WRITE) {
+		lr_write_t *lr = (lr_write_t *)lrc;
+		if (txg > spa_freeze_txg(zilog->zl_spa))
+			txg_wait_synced(zilog->zl_dmu_pool, txg);
+
+		if (!itx->itx_data_copied &&
+		    (error = zilog->zl_get_data(itx->itx_private, lr)) != 0) {
+			if (error != ENOENT && error != EALREADY) {
+				txg_wait_synced(zilog->zl_dmu_pool, txg);
+				mutex_enter(&zilog->zl_lock);
+				zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
+				zil_add_vdev(zilog,
+				    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))),
+				    seq);
+				mutex_exit(&zilog->zl_lock);
+				return (lwb);	/* record not copied to log */
+			}
+			mutex_enter(&zilog->zl_lock);
+			zil_add_vdev(zilog,
+			    DVA_GET_VDEV(BP_IDENTITY(&(lr->lr_blkptr))), seq);
+			mutex_exit(&zilog->zl_lock);
+			return (lwb);	/* ENOENT/EALREADY: skip record */
+		}
+	}
+
+	/*
+	 * If this record won't fit in the current log block, start a new one.
+	 */
+	if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
+		lwb = zil_lwb_write_start(zilog, lwb);
+		if (lwb == NULL)
+			return (NULL);
+		if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
+			txg_wait_synced(zilog->zl_dmu_pool, txg);
+			mutex_enter(&zilog->zl_lock);
+			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
+			mutex_exit(&zilog->zl_lock);
+			return (lwb);	/* too big even for a fresh block */
+		}
+	}
+
+	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+	lwb->lwb_nused += reclen;
+	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+	ASSERT3U(lwb->lwb_seq, <, seq);
+	lwb->lwb_seq = seq;	/* highest seq stored in this block */
+	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
+
+	return (lwb);
+}
+
+itx_t *
+zil_itx_create(int txtype, size_t lrsize)
+{
+	size_t reclen = P2ROUNDUP(lrsize, sizeof (uint64_t));
+	itx_t *itx = kmem_alloc(offsetof(itx_t, itx_lr) + reclen, KM_SLEEP);
+	lr_t *lrc = &itx->itx_lr;
+
+	/* Fill in the record type, rounded length and a zero sequence. */
+	lrc->lrc_txtype = txtype;
+	lrc->lrc_reclen = reclen;
+	lrc->lrc_seq = 0;	/* defensive */
+
+	return (itx);
+}
+
+uint64_t
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+	lr_t *lrc = &itx->itx_lr;
+	uint64_t assigned;
+
+	ASSERT(lrc->lrc_seq == 0);
+	/* Queue the itx, then stamp it with its txg and the next sequence. */
+	mutex_enter(&zilog->zl_lock);
+	list_insert_tail(&zilog->zl_itx_list, itx);
+	zilog->zl_itx_list_sz += lrc->lrc_reclen;
+	lrc->lrc_txg = dmu_tx_get_txg(tx);
+	lrc->lrc_seq = assigned = ++zilog->zl_itx_seq;
+	mutex_exit(&zilog->zl_lock);
+	return (assigned);
+}
+
+/* Discard in-memory itxs whose txg has synced; advance zl_ss_seq to match. */
+static void
+zil_itx_clean(zilog_t *zilog)
+{
+	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
+	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+	uint64_t clean_txg = MIN(synced_txg, freeze_txg);
+	uint64_t last_seq = 0;
+	itx_t *itx;
+
+	mutex_enter(&zilog->zl_lock);
+	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
+	    itx->itx_lr.lrc_txg <= clean_txg) {
+		list_remove(&zilog->zl_itx_list, itx);
+		zilog->zl_itx_list_sz -= itx->itx_lr.lrc_reclen;
+		ASSERT3U(last_seq, <, itx->itx_lr.lrc_seq);
+		last_seq = itx->itx_lr.lrc_seq;
+		kmem_free(itx, offsetof(itx_t, itx_lr)
+		    + itx->itx_lr.lrc_reclen);
+	}
+	/* Raise zl_ss_seq past what was just freed and wake any waiters. */
+	if (last_seq > zilog->zl_ss_seq) {
+		zilog->zl_ss_seq = last_seq;
+		cv_broadcast(&zilog->zl_cv_seq);
+	}
+	mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_clean(zilog_t *zilog)
+{
+	/*
+	 * If any in-memory itxs are queued, hand zil_itx_clean() to the
+	 * clean taskq.  The dispatch is TQ_NOSLEEP and its result is
+	 * ignored, so a failed dispatch is not retried here.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	if (list_head(&zilog->zl_itx_list) != NULL)
+		(void) taskq_dispatch(zilog->zl_clean_taskq,
+		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+	mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Push zfs transactions to stable storage up to the supplied sequence number.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t seq, int ioflag)
+{
+	uint64_t txg;
+	uint64_t max_seq;	/* last seq seen whose txg already synced */
+	uint64_t reclen;
+	itx_t *itx;
+	lwb_t *lwb;
+	spa_t *spa;
+
+	if (zilog == NULL || seq == 0 ||
+	    ((ioflag & (FSYNC | FDSYNC | FRSYNC)) == 0 && !zil_always))
+		return;
+
+	spa = zilog->zl_spa;
+	mutex_enter(&zilog->zl_lock);
+
+	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
+
+	for (;;) {
+		if (zilog->zl_ss_seq >= seq) {	/* already on stable storage */
+			cv_signal(&zilog->zl_cv_write);
+			mutex_exit(&zilog->zl_lock);
+			return;
+		}
+
+		if (zilog->zl_writer == B_FALSE) /* no one writing, do it */
+			break;
+
+		cv_wait(&zilog->zl_cv_write, &zilog->zl_lock);
+	}
+
+	zilog->zl_writer = B_TRUE;	/* we are the writer from here on */
+	max_seq = 0;
+
+	if (zilog->zl_suspend) {
+		lwb = NULL;
+	} else {
+		lwb = list_tail(&zilog->zl_lwb_list);
+		if (lwb == NULL) {	/* no log block yet: create the log */
+			mutex_exit(&zilog->zl_lock);
+			zil_create(zilog);
+			mutex_enter(&zilog->zl_lock);
+			lwb = list_tail(&zilog->zl_lwb_list);
+		}
+	}
+
+	/*
+	 * Loop through in-memory log transactions filling log blocks,
+	 * until we reach the given sequence number and there's no more
+	 * room in the write buffer.
+	 */
+	for (;;) {
+		itx = list_head(&zilog->zl_itx_list);
+		if (itx == NULL)
+			break;
+
+		reclen = itx->itx_lr.lrc_reclen;
+		if ((itx->itx_lr.lrc_seq > seq) &&
+		    ((lwb == NULL) || (lwb->lwb_nused + reclen >
+		    ZIL_BLK_DATA_SZ(lwb))))
+			break;
+
+		list_remove(&zilog->zl_itx_list, itx);
+		txg = itx->itx_lr.lrc_txg;
+		ASSERT(txg);
+
+		mutex_exit(&zilog->zl_lock);
+		if (txg > spa_last_synced_txg(spa) ||
+		    txg > spa_freeze_txg(spa))
+			lwb = zil_lwb_commit(zilog, itx, lwb);
+		else
+			max_seq = itx->itx_lr.lrc_seq;
+		kmem_free(itx, offsetof(itx_t, itx_lr)
+		    + itx->itx_lr.lrc_reclen);
+		mutex_enter(&zilog->zl_lock);
+		zilog->zl_itx_list_sz -= reclen;
+	}
+
+	mutex_exit(&zilog->zl_lock);
+
+	/* write the last block out */
+	if (lwb != NULL && lwb->lwb_nused != 0)
+		lwb = zil_lwb_write_start(zilog, lwb);
+
+	/* wake up others waiting to start a write */
+	mutex_enter(&zilog->zl_lock);
+	zilog->zl_writer = B_FALSE;
+	cv_signal(&zilog->zl_cv_write);
+
+	if (max_seq > zilog->zl_ss_seq) {
+		zilog->zl_ss_seq = max_seq;
+		cv_broadcast(&zilog->zl_cv_seq);
+	}
+	/*
+	 * Wait if necessary for our seq to be committed.
+	 */
+	if (lwb) {
+		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
+			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
+		zil_flush_vdevs(zilog, seq);
+	}
+	if (zilog->zl_log_error || lwb == NULL) {	/* log unusable */
+		zilog->zl_log_error = 0;
+		max_seq = zilog->zl_itx_seq;
+		mutex_exit(&zilog->zl_lock);
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+		mutex_enter(&zilog->zl_lock);
+		zilog->zl_ss_seq = MAX(max_seq, zilog->zl_ss_seq);
+		cv_broadcast(&zilog->zl_cv_seq);
+	}
+	mutex_exit(&zilog->zl_lock);
+}
+
+/*
+ * Called in syncing context for tx's txg.  Publishes that txg's replay seq in
+ * the log header, zeroes the header when zl_destroy_txg matches, then frees
+ * each leading lwb that has no buffer and whose lwb_max_txg is <= the txg;
+ * the first lwb left on the list (if any) becomes the header's zh_log.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+	zil_header_t *zh = zilog->zl_header;
+	uint64_t txg = dmu_tx_get_txg(tx);
+	lwb_t *lwb;
+
+	ASSERT(zilog->zl_stop_sync == 0);
+
+	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+
+	if (zilog->zl_destroy_txg == txg) {
+		bzero(zh, sizeof (zil_header_t));
+		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+		zilog->zl_destroy_txg = 0;
+	}
+
+	mutex_enter(&zilog->zl_lock);
+	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL &&
+	    lwb->lwb_buf == NULL && lwb->lwb_max_txg <= txg) {
+		list_remove(&zilog->zl_lwb_list, lwb);
+		zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+		kmem_cache_free(zil_lwb_cache, lwb);
+	}
+
+	/* An emptied list leaves zh_log exactly as it was. */
+	if (lwb != NULL)
+		zh->zh_log = lwb->lwb_blk;
+	mutex_exit(&zilog->zl_lock);
+}
+
+void
+zil_init(void)		/* create the kmem cache for struct lwb objects */
+{
+	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+	    sizeof (struct lwb), NULL, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zil_fini(void)		/* destroy the cache set up by zil_init() */
+{
+	kmem_cache_destroy(zil_lwb_cache);
+}
+
+/*
+ * Allocate a zeroed zilog_t for os, wire up its fields, create its lists.
+ */
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+	zilog_t *zl = kmem_zalloc(sizeof (*zl), KM_SLEEP);
+
+	zl->zl_os = os;
+	zl->zl_header = zh_phys;
+	zl->zl_spa = dmu_objset_spa(os);
+	zl->zl_dmu_pool = dmu_objset_pool(os);
+	zl->zl_prev_blk_sz = ZIL_MIN_BLKSZ;
+
+	/* All three lists start out empty. */
+	list_create(&zl->zl_itx_list, sizeof (itx_t),
+	    offsetof(itx_t, itx_node));
+	list_create(&zl->zl_lwb_list, sizeof (lwb_t),
+	    offsetof(lwb_t, lwb_node));
+	list_create(&zl->zl_vdev_list, sizeof (zil_vdev_t),
+	    offsetof(zil_vdev_t, vdev_seq_node));
+
+	return (zl);
+}
+
+/* Tear down a zilog_t: free its queued lwbs and vdev records, then itself. */
+void
+zil_free(zilog_t *zilog)
+{
+	list_t *lwbs = &zilog->zl_lwb_list;
+	list_t *vdevs = &zilog->zl_vdev_list;
+	zil_vdev_t *zv;
+	lwb_t *lwb;
+
+	zilog->zl_stop_sync = 1;
+	for (lwb = list_head(lwbs); lwb != NULL; lwb = list_head(lwbs)) {
+		list_remove(lwbs, lwb);
+		if (lwb->lwb_buf != NULL)
+			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		kmem_cache_free(zil_lwb_cache, lwb);
+	}
+	list_destroy(lwbs);
+
+	for (zv = list_head(vdevs); zv != NULL; zv = list_head(vdevs)) {
+		list_remove(vdevs, zv);
+		kmem_free(zv, sizeof (*zv));
+	}
+	list_destroy(vdevs);
+	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+	list_destroy(&zilog->zl_itx_list);
+	kmem_free(zilog, sizeof (*zilog));
+}
+
+/*
+ * Open an intent log: record the get_data callback and create the clean taskq.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+	zilog_t *zilog = dmu_objset_zil(os);
+
+	zilog->zl_get_data = get_data;
+	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
+	    2, 2, TASKQ_PREPOPULATE);
+
+	return (zilog);
+}
+
+/*
+ * Close an intent log: wait for a txg sync, then undo what zil_open() set up.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+	taskq_destroy(zilog->zl_clean_taskq);
+	zilog->zl_clean_taskq = NULL;
+	zilog->zl_get_data = NULL;
+
+	zil_itx_clean(zilog);	/* must leave the itx list empty */
+	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+}
+
+/*
+ * Suspend an intent log.  While suspended we still honor synchronous
+ * semantics, but rely on txg_wait_synced() to do it.  We suspend briefly when
+ * taking a snapshot so it holds all the data it should and has an empty log.
+ * Returns EBUSY if the log is unplayed (zh_claim_txg != 0), otherwise 0.
+ */
+int
+zil_suspend(zilog_t *zilog)
+{
+	lwb_t *lwb;
+
+	mutex_enter(&zilog->zl_lock);
+	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
+		mutex_exit(&zilog->zl_lock);
+		return (EBUSY);
+	}
+	zilog->zl_suspend++;		/* a count; zil_resume() drops it */
+	mutex_exit(&zilog->zl_lock);
+
+	zil_commit(zilog, UINT64_MAX, FSYNC);	/* seq is capped at the max */
+
+	mutex_enter(&zilog->zl_lock);
+	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+		if (lwb->lwb_buf != NULL) {
+			/*
+			 * Wait for the buffer if it's in the process of
+			 * being written.
+			 */
+			if ((lwb->lwb_seq != 0) &&
+			    (lwb->lwb_state != SEQ_COMPLETE)) {
+				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
+				continue;	/* re-examine the list head */
+			}
+			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		}
+		list_remove(&zilog->zl_lwb_list, lwb);
+		kmem_cache_free(zil_lwb_cache, lwb);
+	}
+	mutex_exit(&zilog->zl_lock);
+
+	zil_destroy(zilog);
+
+	return (0);
+}
+
+void
+zil_resume(zilog_t *zilog)	/* drop one zil_suspend() count */
+{
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zilog->zl_suspend != 0);
+	zilog->zl_suspend--;
+	mutex_exit(&zilog->zl_lock);
+}
+
+typedef struct zil_replay_arg {
+	objset_t	*zr_os;		/* objset being replayed */
+	zil_replay_func_t **zr_replay;	/* vectors indexed by txtype */
+	void		*zr_arg;	/* passed to vectors and zr_rm_sync */
+	void		(*zr_rm_sync)(void *arg);	/* may be NULL */
+	uint64_t	*zr_txgp;	/* where the target txg is posted */
+	boolean_t	zr_byteswap;	/* log's bp says to byteswap */
+	char		*zr_lrbuf;	/* scratch copy of current record */
+} zil_replay_arg_t;
+
+static void	/* zil_parse() callback: replay one log record */
+zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
+{
+	zil_replay_arg_t *zr = zra;
+	zil_header_t *zh = zilog->zl_header;
+	uint64_t reclen = lr->lrc_reclen;
+	uint64_t txtype = lr->lrc_txtype;
+	int pass, error;
+
+	if (zilog->zl_stop_replay)		/* an earlier record failed */
+		return;
+
+	if (lr->lrc_txg < claim_txg)		/* already committed */
+		return;
+
+	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
+		return;
+
+	/*
+	 * Make a copy of the data so we can revise and extend it.
+	 */
+	bcopy(lr, zr->zr_lrbuf, reclen);
+
+	/*
+	 * The log block containing this lr may have been byteswapped
+	 * so that we can easily examine common fields like lrc_txtype.
+	 * However, the log is a mix of different data types, and only the
+	 * replay vectors know how to byteswap their records.  Therefore, if
+	 * the lr was byteswapped, undo it before invoking the replay vector.
+	 */
+	if (zr->zr_byteswap)
+		byteswap_uint64_array(zr->zr_lrbuf, reclen);
+
+	/*
+	 * If this is a TX_WRITE with a blkptr, suck in the data.
+	 */
+	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+		lr_write_t *lrw = (lr_write_t *)lr;
+		blkptr_t *wbp = &lrw->lr_blkptr;
+		uint64_t wlen = lrw->lr_length;
+		char *wbuf = zr->zr_lrbuf + reclen;	/* just past the lr */
+
+		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
+			bzero(wbuf, wlen);
+		} else {
+			/*
+			 * A subsequent write may have overwritten this block,
+			 * in which case wbp may have been freed and
+			 * reallocated, and our read of wbp may fail with a
+			 * checksum error.  We can safely ignore this because
+			 * the later write will provide the correct data.
+			 */
+			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
+			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
+			    ZIO_PRIORITY_SYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
+		}
+	}
+
+	/*
+	 * We must now do two things atomically: replay this log record,
+	 * and update the log header to reflect the fact that we did so.
+	 * We use the DMU's ability to assign into a specific txg to do this.
+	 */
+	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
+		uint64_t replay_txg;
+		dmu_tx_t *replay_tx;
+
+		replay_tx = dmu_tx_create(zr->zr_os);
+		error = dmu_tx_assign(replay_tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(replay_tx);
+			break;
+		}
+
+		replay_txg = dmu_tx_get_txg(replay_tx);
+
+		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+			error = EINVAL;
+		} else {
+			/*
+			 * On the first pass, arrange for the replay vector
+			 * to fail its dmu_tx_assign().  That's the only way
+			 * to ensure that those code paths remain well tested.
+			 */
+			*zr->zr_txgp = replay_txg - (pass == 1);
+			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+			    zr->zr_byteswap);
+			*zr->zr_txgp = TXG_NOWAIT;
+		}
+
+		if (error == 0) {
+			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
+			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
+			    lr->lrc_seq;	/* zil_sync() publishes this */
+		}
+
+		dmu_tx_commit(replay_tx);
+
+		if (error != ERESTART)	/* only ERESTART is retried */
+			break;
+
+		if (pass != 1)
+			txg_wait_open(spa_get_dsl(zilog->zl_spa),
+			    replay_txg + 1);
+
+		dprintf("pass %d, retrying\n", pass);
+	}
+
+	if (error) {
+		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+		dmu_objset_name(zr->zr_os, name);
+		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+		    "dataset %s, seq 0x%llx, txtype %llu\n",
+		    error, name,
+		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
+		zilog->zl_stop_replay = 1;	/* skip all later records */
+		kmem_free(name, MAXNAMELEN);
+	}
+
+	/*
+	 * The DMU's dnode layer doesn't see removes until the txg commits,
+	 * so a subsequent claim can spuriously fail with EEXIST.
+	 * To prevent this, if we might have removed an object,
+	 * wait for the delete thread to delete it, and then
+	 * wait for the transaction group to sync.
+	 */
+	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
+		if (zr->zr_rm_sync != NULL)
+			zr->zr_rm_sync(zr->zr_arg);
+		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+	}
+}
+
+/*
+ * Replay this dataset's intent log through replay_func[], then destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, uint64_t *txgp,
+	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
+{
+	zilog_t *zilog = dmu_objset_zil(os);
+	zil_replay_arg_t zr;
+
+	zr.zr_os = os;
+	zr.zr_replay = replay_func;
+	zr.zr_arg = arg;
+	zr.zr_rm_sync = rm_sync;
+	zr.zr_txgp = txgp;
+	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
+	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+	/*
+	 * Wait for in-progress removes to sync before starting replay.
+	 */
+	if (rm_sync != NULL)
+		rm_sync(arg);
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+	zilog->zl_stop_replay = 0;	/* set again if a record fails */
+	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+	    zilog->zl_header->zh_claim_txg);
+	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+
+	zil_destroy(zilog);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
new file mode 100644
index 000000000000..7323292859bb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -0,0 +1,1698 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+
+static void zio_vdev_io_enter(zio_t *zio);
+static void zio_vdev_io_exit(zio_t *zio);
+
+/*
+ * ==========================================================================
+ * I/O priority table
+ * ==========================================================================
+ */
+uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
+	0,	/* ZIO_PRIORITY_NOW		*/
+	0,	/* ZIO_PRIORITY_SYNC_READ	*/
+	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
+	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
+	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
+	4,	/* ZIO_PRIORITY_FREE		*/
+	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
+	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
+	10,	/* ZIO_PRIORITY_RESILVER	*/
+	20,	/* ZIO_PRIORITY_SCRUB		*/
+};
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+char *zio_type_name[ZIO_TYPES] = {	/* indexed by zio->io_type */
+	"null", "read", "write", "free", "claim", "ioctl" };
+
+/* At or above this size, force gang blocking - for testing */
+uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
+
+typedef struct zio_sync_pass {
+	int	zp_defer_free;		/* defer frees after this pass */
+	int	zp_dontcompress;	/* don't compress after this pass */
+	int	zp_rewrite;		/* rewrite new bps after this pass */
+} zio_sync_pass_t;
+
+zio_sync_pass_t zio_sync_pass = {
+	1,	/* zp_defer_free */
+	4,	/* zp_dontcompress */
+	1,	/* zp_rewrite */
+};
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+void
+zio_init(void)
+{
+	size_t idx;
+
+	/*
+	 * Cache sizing policy: every multiple of SPA_MINBLOCKSIZE up to four
+	 * of them gets its own cache; beyond that a size gets a cache only
+	 * if it is a multiple of PAGESIZE or of a quarter of the largest
+	 * power of 2 not exceeding it.
+	 */
+	for (idx = 0; idx < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; idx++) {
+		size_t size = (idx + 1) << SPA_MINBLOCKSHIFT;
+		size_t pow2, align;
+		char name[30];
+
+		/* Strip low set bits until only the highest one remains. */
+		for (pow2 = size; pow2 & (pow2 - 1); pow2 &= pow2 - 1)
+			continue;
+
+		if (size <= 4 * SPA_MINBLOCKSIZE)
+			align = SPA_MINBLOCKSIZE;
+		else if (P2PHASE(size, PAGESIZE) == 0)
+			align = PAGESIZE;
+		else if (P2PHASE(size, pow2 >> 2) == 0)
+			align = pow2 >> 2;
+		else
+			continue;	/* served by the next larger cache */
+
+		(void) sprintf(name, "zio_buf_%lu", size);
+		zio_buf_cache[idx] = kmem_cache_create(name, size,
+		    align, NULL, NULL, NULL, NULL, NULL, 0);
+		dprintf("creating cache for size %5lx align %5lx\n",
+		    size, align);
+	}
+
+	/* Point each empty slot at the nearest larger size's cache. */
+	for (idx--; idx != 0; idx--) {
+		ASSERT(zio_buf_cache[idx] != NULL);
+		if (zio_buf_cache[idx - 1] == NULL)
+			zio_buf_cache[idx - 1] = zio_buf_cache[idx];
+	}
+}
+
+void
+zio_fini(void)
+{
+	kmem_cache_t *cur, *prev = NULL;
+	size_t i;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		cur = zio_buf_cache[i];
+		zio_buf_cache[i] = NULL;
+		if (cur != prev)	/* adjacent slots may share a cache */
+			kmem_cache_destroy(cur);
+		prev = cur;
+	}
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+	size_t slot = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	/* size == 0 wraps slot to a huge value and trips this assertion. */
+	ASSERT(slot < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	return (kmem_cache_alloc(zio_buf_cache[slot], KM_SLEEP));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+	size_t slot = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	/* size picks the cache; it must map to the one buf came from. */
+	ASSERT(slot < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	kmem_cache_free(zio_buf_cache[slot], buf);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+/* Push (data, size, bufsize) on zio's transform stack and make it current. */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
+{
+	zio_transform_t *top = kmem_alloc(sizeof (*top), KM_SLEEP);
+
+	top->zt_next = zio->io_transform_stack;
+	top->zt_bufsize = bufsize;
+	top->zt_size = size;
+	top->zt_data = data;
+	zio->io_transform_stack = top;
+
+	zio->io_size = size;
+	zio->io_data = data;
+}
+
+/* Pop the top transform into the out-params; expose the next one, if any. */
+static void
+zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
+{
+	zio_transform_t *top = zio->io_transform_stack;
+	zio_transform_t *below = top->zt_next;
+
+	*bufsize = top->zt_bufsize;
+	*size = top->zt_size;
+	*data = top->zt_data;
+	zio->io_transform_stack = below;
+	kmem_free(top, sizeof (*top));
+	if (below == NULL)
+		return;		/* stack now empty: io_data/io_size untouched */
+	zio->io_data = below->zt_data;
+	zio->io_size = below->zt_size;
+}
+
+static void
+zio_clear_transform_stack(zio_t *zio)
+{
+	uint64_t size, bufsize;
+	void *data;
+
+	ASSERT(zio->io_transform_stack != NULL);
+	for (;;) {
+		zio_pop_transform(zio, &data, &size, &bufsize);
+		if (zio->io_transform_stack == NULL)
+			break;	/* bottom entry's buffer is not freed here */
+		zio_buf_free(data, bufsize);
+	}
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free)
+ * ==========================================================================
+ */
+static zio_t *	/* allocate a zio and link it under pio (if any) */
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
+{
+	zio_t *zio;
+
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+
+	zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+	zio->io_parent = pio;
+	zio->io_spa = spa;
+	zio->io_txg = txg;
+	if (bp != NULL) {
+		zio->io_bp = bp;
+		zio->io_bp_copy = *bp;
+		zio->io_bp_orig = *bp;
+		/* XXBP - Need to inherit this when it matters */
+		zio->io_dva_index = 0;
+	}
+	zio->io_done = done;
+	zio->io_private = private;
+	zio->io_type = type;
+	zio->io_priority = priority;
+	zio->io_stage = stage;
+	zio->io_pipeline = pipeline;
+	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
+	zio->io_timestamp = lbolt64;
+	zio->io_flags = flags;
+	zio_push_transform(zio, data, size, size);	/* bottom stack entry */
+
+	if (pio == NULL) {
+		if (!(flags & ZIO_FLAG_CONFIG_HELD))
+			spa_config_enter(zio->io_spa, RW_READER);
+		zio->io_root = zio;	/* a parentless zio is its own root */
+	} else {
+		zio->io_root = pio->io_root;
+
+		mutex_enter(&pio->io_lock);
+		if (stage < ZIO_STAGE_READY)
+			pio->io_children_notready++;
+		pio->io_children_notdone++;
+		zio->io_sibling_next = pio->io_child;
+		zio->io_sibling_prev = NULL;
+		if (pio->io_child != NULL)
+			pio->io_child->io_sibling_prev = zio;
+		pio->io_child = zio;	/* link at head of pio's child list */
+		mutex_exit(&pio->io_lock);
+	}
+
+	return (zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
+	int flags)
+{
+	/*
+	 * A null zio carries no bp or data (size 0, txg 0); its pipeline
+	 * is ZIO_WAIT_FOR_CHILDREN_PIPELINE.
+	 */
+	return (zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
+	    ZIO_WAIT_FOR_CHILDREN_PIPELINE));
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+{
+	return (zio_null(NULL, spa, done, private, flags)); /* no parent */
+}
+
+zio_t *		/* create a read of bp into data (size == lsize of bp) */
+zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+    uint64_t size, zio_done_func_t *done, void *private,
+    int priority, int flags)
+{
+	zio_t *zio;
+	dva_t *dva;
+
+	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
+
+	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+	    ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+
+	/*
+	 * Work off our copy of the bp so the caller can free it.
+	 */
+	zio->io_bp = &zio->io_bp_copy;
+
+	bp = zio->io_bp;	/* from here on bp means our private copy */
+	dva = ZIO_GET_DVA(zio);
+
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+		uint64_t csize = BP_GET_PSIZE(bp);
+		void *cbuf = zio_buf_alloc(csize);	/* psize-sized buf */
+
+		zio_push_transform(zio, cbuf, csize, csize);
+		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
+	}
+
+	if (DVA_GET_GANG(dva)) {
+		uint64_t gsize = SPA_GANGBLOCKSIZE;
+		void *gbuf = zio_buf_alloc(gsize);
+
+		zio_push_transform(zio, gbuf, gsize, gsize);
+		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
+	}
+
+	return (zio);
+}
+
+zio_t *		/* create a write zio for bp in txg */
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+
+	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
+	    checksum < ZIO_CHECKSUM_FUNCTIONS);
+
+	ASSERT(compress >= ZIO_COMPRESS_OFF &&
+	    compress < ZIO_COMPRESS_FUNCTIONS);
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+
+	zio->io_checksum = checksum;
+	zio->io_compress = compress;
+
+	if (compress != ZIO_COMPRESS_OFF)
+		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
+
+	if (bp->blk_birth != txg) {	/* bp was not born in this txg */
+		/* XXX the bp usually (always?) gets re-zeroed later */
+		BP_ZERO(bp);
+		BP_SET_LSIZE(bp, size);
+		BP_SET_PSIZE(bp, size);
+	}
+
+	return (zio);
+}
+
+zio_t *		/* like zio_write(), but REWRITE pipeline, no compression */
+zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+
+	/* XXBP - We need to re-evaluate when to insert pipeline stages */
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags,
+	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+	zio->io_checksum = checksum;
+	zio->io_compress = ZIO_COMPRESS_OFF;
+
+	return (zio);
+}
+
+static zio_t *	/* uncompressed write through the WRITE_ALLOCATE pipeline */
+zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
+    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+
+	BP_ZERO(bp);	/* bp is reset before zio_create() copies it */
+	BP_SET_LSIZE(bp, size);
+	BP_SET_PSIZE(bp, size);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
+
+	zio->io_checksum = checksum;
+	zio->io_compress = ZIO_COMPRESS_OFF;
+
+	return (zio);
+}
+
+zio_t *		/* free bp in txg, or defer it late in a sync */
+zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private)
+{
+	zio_t *zio;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (txg == spa->spa_syncing_txg &&
+	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
+		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
+		return (zio_null(pio, spa, NULL, NULL, 0)); /* done unused */
+	}
+
+	/* XXBP - We need to re-evaluate when to insert pipeline stages */
+	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
+	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+
+	zio->io_bp = &zio->io_bp_copy;	/* work off our private copy */
+
+	return (zio);
+}
+
+zio_t *		/* create a claim zio for bp in txg */
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    zio_done_func_t *done, void *private)
+{
+	zio_t *zio;
+
+	/*
+	 * A claim is an allocation of a specific block.  Claims are needed
+	 * to support immediate writes in the intent log.  The issue is that
+	 * immediate writes contain committed data, but in a txg that was
+	 * *not* committed.  Upon opening the pool after an unclean shutdown,
+	 * the intent log claims all blocks that contain immediate write data
+	 * so that the SPA knows they're in use.
+	 *
+	 * All claims *must* be resolved in the first txg -- before the SPA
+	 * starts allocating blocks -- so that nothing is allocated twice.
+	 */
+	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+	ASSERT3U(spa_first_txg(spa), <=, txg);
+
+	/* XXBP - We need to re-evaluate when to insert pipeline stages */
+	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
+	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
+	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+	zio->io_bp = &zio->io_bp_copy;	/* work off our private copy */
+
+	return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+    zio_done_func_t *done, void *private, int priority, int flags)
+{
+	zio_t *zio;
+	int i;
+
+	/* Interior vdev: fan the ioctl out to each child under a null zio. */
+	if (vd->vdev_children != 0) {
+		zio = zio_null(pio, spa, NULL, NULL, flags);
+		for (i = 0; i < vd->vdev_children; i++)
+			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[i], cmd,
+			    done, private, priority, flags));
+		return (zio);
+	}
+
+	/* Childless vdev: a single ioctl zio carries the command. */
+	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+	    ZIO_TYPE_IOCTL, priority, flags,
+	    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+	zio->io_vd = vd;
+	zio->io_cmd = cmd;
+	return (zio);
+}
+
+static void	/* build a bp describing a raw I/O on childless vdev vd */
+zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
+    int checksum)
+{
+	ASSERT(vd->vdev_children == 0);
+
+	ASSERT(size <= SPA_MAXBLOCKSIZE);
+	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+	ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+	ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+	BP_ZERO(bp);
+
+	BP_SET_LSIZE(bp, size);
+	BP_SET_PSIZE(bp, size);
+
+	BP_SET_CHECKSUM(bp, checksum);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	if (checksum != ZIO_CHECKSUM_OFF)
+		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
+}
+
+zio_t *		/* read size bytes at offset on vd, bypassing the bp layer */
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    int priority, int flags)
+{
+	zio_t *zio;
+	blkptr_t blk;	/* on our stack; zio_create() copies it */
+
+	zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
+	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+	zio->io_vd = vd;
+	zio->io_offset = offset;
+
+	/*
+	 * Work off our copy of the bp so the caller can free it.
+	 */
+	zio->io_bp = &zio->io_bp_copy;
+
+	return (zio);
+}
+
+zio_t *		/* write size bytes at offset on vd, bypassing the bp layer */
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    int priority, int flags)
+{
+	zio_block_tail_t *zbt;
+	void *wbuf;
+	zio_t *zio;
+	blkptr_t blk;	/* on our stack; zio_create() copies it */
+
+	zio_phys_bp_init(vd, &blk, offset, size, checksum);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
+	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+	zio->io_vd = vd;
+	zio->io_offset = offset;
+
+	zio->io_bp = &zio->io_bp_copy;	/* blk goes away when we return */
+	zio->io_checksum = checksum;
+
+	if (zio_checksum_table[checksum].ci_zbt) {
+		/*
+		 * zbt checksums are necessarily destructive -- they modify
+		 * one word of the write buffer to hold the verifier/checksum.
+		 * Therefore, we must make a local copy in case the data is
+		 * being written to multiple places.
+		 */
+		wbuf = zio_buf_alloc(size);
+		bcopy(data, wbuf, size);
+		zio_push_transform(zio, wbuf, size, size);
+
+		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
+		zbt->zbt_cksum = blk.blk_cksum;
+	}
+
+	return (zio);
+}
+
+/*
+ * Create a child I/O on vd to do some work for us; bp may be NULL.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+	void *data, uint64_t size, int type, int priority, int flags,
+	zio_done_func_t *done, void *private)
+{
+	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+	zio_t *cio;
+
+	if (type == ZIO_TYPE_READ && bp != NULL) {
+		/*
+		 * If we have the bp, then the child should perform the
+		 * checksum and the parent need not.  This pushes error
+		 * detection as close to the leaves as possible and
+		 * eliminates redundant checksums in the interior nodes.
+		 */
+		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
+		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+	}
+
+	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
+	    done, private, type, priority,
+	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
+	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+
+	cio->io_vd = vd;
+	cio->io_offset = offset;
+
+	return (cio);
+}
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int		/* start zio, block until it is done, return its io_error */
+zio_wait(zio_t *zio)
+{
+	int error;
+
+	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+
+	zio->io_waiter = curthread;	/* zio_done() then won't free zio */
+
+	zio_next_stage_async(zio);
+
+	mutex_enter(&zio->io_lock);
+	while (zio->io_stalled != ZIO_STAGE_DONE)	/* set by zio_done() */
+		cv_wait(&zio->io_cv, &zio->io_lock);
+	mutex_exit(&zio->io_lock);
+
+	error = zio->io_error;
+
+	kmem_free(zio, sizeof (zio_t));	/* the waiter frees the zio */
+
+	return (error);
+}
+
+void
+zio_nowait(zio_t *zio)		/* start zio without waiting for it */
+{
+	zio_next_stage_async(zio);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline interlocks: parent/child dependency scoreboarding
+ * ==========================================================================
+ */
+static void	/* advance if *countp is 0, else stall zio at stage */
+zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+	mutex_enter(&zio->io_lock);
+	if (*countp == 0) {
+		ASSERT(zio->io_stalled == 0);
+		mutex_exit(&zio->io_lock);
+		zio_next_stage(zio);
+	} else {
+		if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
+			zio_vdev_io_exit(zio);
+		zio->io_stalled = stage;	/* zio_notify_parent() resumes */
+		mutex_exit(&zio->io_lock);
+	}
+}
+
+static void	/* child side: drop *countp, restart a stalled parent */
+zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
+{
+	zio_t *pio = zio->io_parent;
+
+	mutex_enter(&pio->io_lock);
+	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+		pio->io_error = zio->io_error;	/* first error sticks */
+	if (--*countp == 0 && pio->io_stalled == stage) {
+		if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
+			zio_vdev_io_enter(pio);
+		pio->io_stalled = 0;
+		mutex_exit(&pio->io_lock);
+		zio_next_stage_async(pio);
+	} else {
+		mutex_exit(&pio->io_lock);
+	}
+}
+
+static void	/* stall until io_children_notready reaches 0 */
+zio_wait_children_ready(zio_t *zio)
+{
+	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+	    &zio->io_children_notready);
+}
+
+void		/* stall until io_children_notdone reaches 0 */
+zio_wait_children_done(zio_t *zio)
+{
+	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+	    &zio->io_children_notdone);
+}
+
+static void	/* tell the parent we are ready, then snapshot the bp */
+zio_ready(zio_t *zio)
+{
+	zio_t *pio = zio->io_parent;
+
+	if (pio != NULL)
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+		    &pio->io_children_notready);
+
+	if (zio->io_bp)
+		zio->io_bp_copy = *zio->io_bp;	/* zio_done() asserts equal */
+
+	zio_next_stage(zio);
+}
+
+static void	/* report errors, run io_done, notify parent, release zio */
+zio_done(zio_t *zio)
+{
+	zio_t *pio = zio->io_parent;
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	vdev_t *vd = zio->io_vd;
+	char blkbuf[300];	/* text form of the bp for messages */
+
+	ASSERT(zio->io_children_notready == 0);
+	ASSERT(zio->io_children_notdone == 0);
+
+	if (bp != NULL) {
+		ASSERT(bp->blk_pad[0] == 0);
+		ASSERT(bp->blk_pad[1] == 0);
+		ASSERT(bp->blk_pad[2] == 0);
+		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+	}
+
+	if (vd != NULL)
+		vdev_stat_update(zio);
+
+	if (zio->io_error) {
+		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
+		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+		    zio_type_name[zio->io_type],
+		    vdev_description(vd),
+		    (u_longlong_t)zio->io_offset,
+		    zio, blkbuf, zio->io_error);
+	}
+
+	if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
+		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+		dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
+		    "partial write",
+		    zio_type_name[zio->io_type],
+		    vdev_description(vd),
+		    (u_longlong_t)zio->io_offset,
+		    zio, blkbuf, zio->io_numerrors);
+	}
+
+	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+		sprintf_blkptr(blkbuf, bp ? bp : &zio->io_bp_copy);
+		panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
+		    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
+		    zio_type_name[zio->io_type],
+		    vdev_description(vd),
+		    (u_longlong_t)zio->io_offset,
+		    zio, blkbuf, zio->io_error);
+	}
+
+	zio_clear_transform_stack(zio);	/* frees all but the bottom buffer */
+
+	if (zio->io_done)
+		zio->io_done(zio);
+
+	ASSERT(zio->io_delegate_list == NULL);
+	ASSERT(zio->io_delegate_next == NULL);
+
+	if (pio != NULL) {
+		zio_t *next, *prev;
+
+		mutex_enter(&pio->io_lock);	/* unlink from sibling list */
+		next = zio->io_sibling_next;
+		prev = zio->io_sibling_prev;
+		if (next != NULL)
+			next->io_sibling_prev = prev;
+		if (prev != NULL)
+			prev->io_sibling_next = next;
+		if (pio->io_child == zio)
+			pio->io_child = next;
+		mutex_exit(&pio->io_lock);
+
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+		    &pio->io_children_notdone);
+	}
+
+	if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
+		spa_config_exit(spa);	/* pairs with zio_create()'s enter */
+
+	if (zio->io_waiter != NULL) {
+		mutex_enter(&zio->io_lock);
+		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+		zio->io_stalled = zio->io_stage;	/* wakes zio_wait() */
+		cv_broadcast(&zio->io_cv);
+		mutex_exit(&zio->io_lock);
+	} else {
+		kmem_free(zio, sizeof (zio_t));	/* nobody waits: free here */
+	}
+}
+
+/*
+ * ==========================================================================
+ * Compression support
+ * ==========================================================================
+ */
+static void
+zio_write_compress(zio_t *zio)
+{
+	int compress = zio->io_compress;
+	blkptr_t *bp = zio->io_bp;
+	void *cbuf;
+	uint64_t lsize = zio->io_size;
+	uint64_t csize = lsize;
+	uint64_t cbufsize = 0;
+	int pass;
+
+	if (bp->blk_birth == zio->io_txg) {
+		/*
+		 * We're rewriting an existing block, which means we're
+		 * working on behalf of spa_sync().  For spa_sync() to
+		 * converge, it must eventually be the case that we don't
+		 * have to allocate new blocks.  But compression changes
+		 * the blocksize, which forces a reallocate, and makes
+		 * convergence take longer.  Therefore, after the first
+		 * few passes, stop compressing to ensure convergence.
+		 */
+		pass = spa_sync_pass(zio->io_spa);
+		if (pass > zio_sync_pass.zp_dontcompress)
+			compress = ZIO_COMPRESS_OFF;
+	} else {
+		ASSERT(BP_IS_HOLE(bp));
+		pass = 1;
+	}
+	/* Compress; if that returns 0, fall back to writing uncompressed. */
+	if (compress != ZIO_COMPRESS_OFF)
+		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
+		    &cbuf, &csize, &cbufsize))
+			compress = ZIO_COMPRESS_OFF;
+	/* Push the compressed buffer, unless the data was all zeroes. */
+	if (compress != ZIO_COMPRESS_OFF && csize != 0)
+		zio_push_transform(zio, cbuf, csize, cbufsize);
+
+	/*
+	 * The final pass of spa_sync() must be all rewrites, but the first
+	 * few passes offer a trade-off: allocating blocks defers convergence,
+	 * but newly allocated blocks are sequential, so they can be written
+	 * to disk faster.  Therefore, we allow the first few passes of
+	 * spa_sync() to reallocate new blocks, but force rewrites after that.
+	 * There should only be a handful of blocks after pass 1 in any case.
+	 */
+	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+	    pass > zio_sync_pass.zp_rewrite) {
+		ASSERT(csize != 0);
+		ASSERT3U(BP_GET_COMPRESS(bp), ==, compress);
+		ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+
+		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+	} else {
+		if (bp->blk_birth == zio->io_txg) {
+			ASSERT3U(BP_GET_LSIZE(bp), ==, lsize);
+			bzero(bp, sizeof (blkptr_t));
+		}
+		if (csize == 0) {
+			BP_ZERO(bp);	/* nothing to write: zero the bp */
+			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
+		} else {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_PSIZE(bp, csize);
+			BP_SET_COMPRESS(bp, compress);
+			zio->io_pipeline = ZIO_WRITE_ALLOCATE_PIPELINE;
+		}
+	}
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Pipeline stage: expand the compressed buffer popped off the transform
+ * stack into io_data; a decompression failure becomes EIO on the zio.
+ */
+static void
+zio_read_decompress(zio_t *zio)
+{
+	int cpfunc = BP_GET_COMPRESS(zio->io_bp);
+	uint64_t csize, cbufsize;
+	void *cbuf;
+
+	ASSERT(cpfunc != ZIO_COMPRESS_OFF);
+
+	zio_pop_transform(zio, &cbuf, &csize, &cbufsize);
+	if (zio_decompress_data(cpfunc, cbuf, csize,
+	    zio->io_data, zio->io_size) != 0)
+		zio->io_error = EIO;
+	zio_buf_free(cbuf, cbufsize);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Gang block support
+ * ==========================================================================
+ */
+/*
+ * Pipelines assume a gang block by default; drop gang stages otherwise.
+ */
+static void
+zio_gang_pipeline(zio_t *zio)
+{
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	if (DVA_GET_GANG(dva) == 0)
+		zio->io_pipeline &= ~ZIO_GANG_STAGES;
+	zio_next_stage(zio);
+}
+
+static void
+zio_gang_byteswap(zio_t *zio)
+{
+	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+	if (!BP_SHOULD_BYTESWAP(zio->io_bp))
+		return;		/* bp says no swap is needed */
+	byteswap_uint64_array(zio->io_data, zio->io_size);
+}
+
+/* Start a child read of the gang header into a pushed scratch buffer. */
+static void
+zio_get_gang_header(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	uint64_t hsize = SPA_GANGBLOCKSIZE;
+	void *hbuf = zio_buf_alloc(hsize);
+	zio_t *rio;
+
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	zio_push_transform(zio, hbuf, hsize, hsize);
+	rio = zio_create(zio, zio->io_spa, bp->blk_birth, bp, hbuf, hsize,
+	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
+	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+	zio_nowait(rio);
+	zio_wait_children_done(zio);
+}
+
+static void
+zio_read_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *gbh;
+	uint64_t gsize, gbufsize, loff, lsize;
+	int i;
+
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	/* Swap before the pop, while io_size is still the header size. */
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+	/* One child read per member, packed back-to-back into io_data. */
+	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+		blkptr_t *gbp = &gbh->zg_blkptr[i];
+		lsize = BP_GET_PSIZE(gbp);
+
+		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+		ASSERT3U(loff + lsize, <=, zio->io_size);
+		ASSERT(i < SPA_GBH_NBLKPTRS);
+		ASSERT(!BP_IS_HOLE(gbp));
+
+		zio_nowait(zio_read(zio, zio->io_spa, gbp,
+		    (char *)zio->io_data + loff, lsize, NULL, NULL,
+		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+	}
+	/* Free the header buffer, then wait for the child reads. */
+	zio_buf_free(gbh, gbufsize);
+	zio_wait_children_done(zio);
+}
+
+static void
+zio_rewrite_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *gbh;
+	uint64_t gsize, gbufsize, loff, lsize;
+	int i;
+
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	/* Swap before the pop, while io_size is still the header size. */
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
+
+	ASSERT(gsize == gbufsize);
+	/* One child rewrite per member, packed back-to-back in io_data. */
+	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
+		blkptr_t *gbp = &gbh->zg_blkptr[i];
+		lsize = BP_GET_PSIZE(gbp);
+
+		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
+		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
+		ASSERT3U(loff + lsize, <=, zio->io_size);
+		ASSERT(i < SPA_GBH_NBLKPTRS);
+		ASSERT(!BP_IS_HOLE(gbp));
+
+		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
+		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
+		    NULL, NULL, zio->io_priority, zio->io_flags));
+	}
+	/* Put the header back on the transform stack; wait for children. */
+	zio_push_transform(zio, gbh, gsize, gbufsize);
+	zio_wait_children_ready(zio);
+}
+
+/*
+ * Issue a free for every non-hole block pointer in the gang header,
+ * then release the header buffer and move to the next stage.
+ */
+static void
+zio_free_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *hdr;
+	uint64_t hsize, hbufsize;
+	blkptr_t *mbp;
+	int m;
+
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&hdr, &hsize, &hbufsize);
+	for (m = 0; m < SPA_GBH_NBLKPTRS; m++) {
+		mbp = &hdr->zg_blkptr[m];
+		if (!BP_IS_HOLE(mbp))
+			zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
+			    mbp, NULL, NULL));
+	}
+	zio_buf_free(hdr, hbufsize);
+	zio_next_stage(zio);
+}
+
+/*
+ * Issue a claim for every non-hole block pointer in the gang header.
+ */
+static void
+zio_claim_gang_members(zio_t *zio)
+{
+	zio_gbh_phys_t *hdr;
+	uint64_t hsize, hbufsize;
+	blkptr_t *mbp;
+	int m;
+
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	zio_gang_byteswap(zio);
+	zio_pop_transform(zio, (void **)&hdr, &hsize, &hbufsize);
+	for (m = 0; m < SPA_GBH_NBLKPTRS; m++) {
+		mbp = &hdr->zg_blkptr[m];
+		if (!BP_IS_HOLE(mbp))
+			zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
+			    mbp, NULL, NULL));
+	}
+	zio_buf_free(hdr, hbufsize);
+	zio_next_stage(zio);
+}
+
+/* Child done callback: add the member's asize to the parent's DVA. */
+static void
+zio_write_allocate_gang_member_done(zio_t *zio)
+{
+	zio_t *parent = zio->io_parent;
+	dva_t *child_dva = ZIO_GET_DVA(zio);
+	dva_t *gang_dva = ZIO_GET_DVA(parent);
+	uint64_t total;
+
+	ASSERT(DVA_GET_GANG(gang_dva));
+
+	/* XXBP - Need to be careful here with multiple DVAs */
+	mutex_enter(&parent->io_lock);
+	total = DVA_GET_ASIZE(gang_dva) + DVA_GET_ASIZE(child_dva);
+	DVA_SET_ASIZE(gang_dva, total);
+	mutex_exit(&parent->io_lock);
+}
+
+static void
+zio_write_allocate_gang_members(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	zio_gbh_phys_t *gbh;
+	uint64_t resid = zio->io_size;
+	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
+	uint64_t gsize, loff, lsize;
+	uint32_t gbps_left;
+	int error;
+	int i;
+
+	gsize = SPA_GANGBLOCKSIZE;
+	gbps_left = SPA_GBH_NBLKPTRS;
+	/* The header itself needs space; failing with ENOSPC is fatal. */
+	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+	if (error == ENOSPC)
+		panic("can't allocate gang block header");
+	ASSERT(error == 0);
+
+	DVA_SET_GANG(dva, 1);
+
+	bp->blk_birth = zio->io_txg;
+
+	gbh = zio_buf_alloc(gsize);
+	bzero(gbh, gsize);
+	/* Carve io_data into at most SPA_GBH_NBLKPTRS member blocks. */
+	for (loff = 0, i = 0; loff != zio->io_size;
+	    loff += lsize, resid -= lsize, gbps_left--, i++) {
+		blkptr_t *gbp = &gbh->zg_blkptr[i];
+		dva = &gbp->blk_dva[0];
+
+		ASSERT(gbps_left != 0);
+		maxalloc = MIN(maxalloc, resid);
+		/* Pre-allocate a member; halve maxalloc after each ENOSPC. */
+		while (resid <= maxalloc * gbps_left) {
+			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
+			    zio->io_txg);
+			if (error == 0)
+				break;
+			ASSERT3U(error, ==, ENOSPC);
+			if (maxalloc == SPA_MINBLOCKSIZE)
+				panic("really out of space");
+			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
+		}
+		/* Allocated above? Rewrite there; else the child allocates. */
+		if (resid <= maxalloc * gbps_left) {
+			lsize = maxalloc;
+			BP_SET_LSIZE(gbp, lsize);
+			BP_SET_PSIZE(gbp, lsize);
+			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
+			gbp->blk_birth = zio->io_txg;
+			zio_nowait(zio_rewrite(zio, zio->io_spa,
+			    zio->io_checksum, zio->io_txg, gbp,
+			    (char *)zio->io_data + loff, lsize,
+			    zio_write_allocate_gang_member_done, NULL,
+			    zio->io_priority, zio->io_flags));
+		} else {
+			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
+			ASSERT(lsize != SPA_MINBLOCKSIZE);
+			zio_nowait(zio_write_allocate(zio, zio->io_spa,
+			    zio->io_checksum, zio->io_txg, gbp,
+			    (char *)zio->io_data + loff, lsize,
+			    zio_write_allocate_gang_member_done, NULL,
+			    zio->io_priority, zio->io_flags));
+		}
+	}
+
+	ASSERT(resid == 0 && loff == zio->io_size);
+	/* Enable the gang-header checksum stage for this zio. */
+	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
+
+	zio_push_transform(zio, gbh, gsize, gsize);
+	zio_wait_children_done(zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+/* Allocate a DVA for this write; ENOSPC falls back to a gang block. */
+static void
+zio_dva_allocate(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	int err;
+
+	ASSERT(BP_IS_HOLE(bp));
+
+	/* For testing, make some blocks above a certain size be gang blocks */
+	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
+		zio_write_allocate_gang_members(zio);
+		return;
+	}
+
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+	err = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+	if (err == ENOSPC) {
+		if (zio->io_size == SPA_MINBLOCKSIZE)
+			panic("really, truly out of space");
+		zio_write_allocate_gang_members(zio);
+		return;
+	}
+	if (err == 0)
+		bp->blk_birth = zio->io_txg;
+	else
+		zio->io_error = err;
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Hand the block's DVA to metaslab_free() and then zero the bp.
+ */
+static void
+zio_dva_free(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	metaslab_free(zio->io_spa, ZIO_GET_DVA(zio), zio->io_txg);
+	BP_ZERO(bp);
+
+	zio_next_stage(zio);
+}
+
+/*
+ * Claim this block's DVA via metaslab_claim(); its result is io_error.
+ */
+static void
+zio_dva_claim(zio_t *zio)
+{
+	ASSERT(!BP_IS_HOLE(zio->io_bp));
+
+	zio->io_error = metaslab_claim(zio->io_spa, ZIO_GET_DVA(zio),
+	    zio->io_txg);
+	zio_next_stage(zio);
+}
+
+/* Map the DVA to a top-level vdev and offset, validating both. */
+static void
+zio_dva_translate(zio_t *zio)
+{
+	dva_t *dva = ZIO_GET_DVA(zio);
+	uint64_t vdev_id = DVA_GET_VDEV(dva);
+	uint64_t off = DVA_GET_OFFSET(dva);
+	vdev_t *tvd;
+
+	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
+	zio->io_offset = off;
+	tvd = vdev_lookup_top(zio->io_spa, vdev_id);
+	zio->io_vd = tvd;
+	if (tvd == NULL)
+		zio->io_error = ENXIO;
+	else if (off + zio->io_size > tvd->vdev_asize)
+		zio->io_error = EOVERFLOW;
+	zio_next_stage(zio);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+static void
+zio_vdev_io_enter(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+	/* Append the zio to its top-level vdev's pending list, under lock. */
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next == NULL);
+	list_insert_tail(&tvd->vdev_io_pending, zio);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+static void
+zio_vdev_io_exit(zio_t *zio)
+{
+	vdev_t *tvd = zio->io_vd->vdev_top;
+	/* Dequeue the zio; broadcast vdev_io_cv once the list is empty. */
+	mutex_enter(&tvd->vdev_io_lock);
+	ASSERT(zio->io_pending.list_next != NULL);
+	list_remove(&tvd->vdev_io_pending, zio);
+	if (list_head(&tvd->vdev_io_pending) == NULL)
+		cv_broadcast(&tvd->vdev_io_cv);
+	mutex_exit(&tvd->vdev_io_lock);
+}
+
+/* Taskq callback: pause, reopen the vdev, restart each zio left in zq. */
+static void
+zio_vdev_io_retry(void *vdarg)
+{
+	vdev_t *vd = vdarg;
+	zio_t *zio, *zq;
+
+	ASSERT(vd == vd->vdev_top);
+	/* XXPOLICY */
+	delay(hz);
+	vdev_reopen(vd, &zq);
+
+	while ((zio = zq) != NULL) {
+		zq = zio->io_retry_next;
+		zio->io_retry_next = NULL;
+		dprintf("async retry #%d for I/O to %s offset %llx\n",
+		    zio->io_retries, vdev_description(vd),
+		    (u_longlong_t)zio->io_offset);
+		zio_next_stage_async(zio);
+	}
+}
+
+/* Stage: set per-attempt flags and leaf offset, then mark I/O pending. */
+static void
+zio_vdev_io_setup(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	/* XXPOLICY */
+	if (zio->io_retries == 0 && vd->vdev_top == vd)
+		zio->io_flags |= ZIO_FLAG_FAILFAST;
+	if ((zio->io_flags & ZIO_FLAG_PHYSICAL) == 0) {
+		if (vd->vdev_children == 0) {
+			zio->io_flags |= ZIO_FLAG_PHYSICAL;
+			zio->io_offset += VDEV_LABEL_START_SIZE;
+		}
+	}
+	zio_vdev_io_enter(zio);
+	zio_next_stage(zio);
+}
+
+static void
+zio_vdev_io_start(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(P2PHASE(zio->io_offset, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(P2PHASE(zio->io_size, 1ULL << zio->io_vd->vdev_ashift) == 0);
+	ASSERT(bp == NULL || ZIO_GET_IOSIZE(zio) == zio->io_size);
+	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+	/* Sanity checks passed: hand the zio to vdev_io_start(). */
+	vdev_io_start(zio);
+
+	/* zio_next_stage_async() gets called from io completion interrupt */
+}
+
+static void
+zio_vdev_io_done(zio_t *zio)
+{
+	vdev_io_done(zio);	/* stage simply forwards to vdev_io_done() */
+}
+
+/* XXPOLICY */
+static boolean_t
+zio_should_retry(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	/* No error, a delegate list, a non-top vdev, or DONT_RETRY set. */
+	if (zio->io_error == 0 || zio->io_delegate_list != NULL ||
+	    vd != vd->vdev_top || (zio->io_flags & ZIO_FLAG_DONT_RETRY))
+		return (B_FALSE);
+
+	/* Speculative or can-fail I/O stops after more than 300 retries. */
+	if (zio->io_retries > 300 &&
+	    (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
+		return (B_FALSE);
+
+	/* ECKSUM and ENXIO get no more tries once io_retries exceeds 1. */
+	if (zio->io_retries > 1 &&
+	    (zio->io_error == ECKSUM || zio->io_error == ENXIO))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+static void
+zio_vdev_io_assess(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+
+	zio_vdev_io_exit(zio);
+	/* The zio is off the pending list; io_vsd must be clear by now. */
+	ASSERT(zio->io_vsd == NULL);
+
+	/*
+	 * If the I/O failed, determine whether we should attempt to retry it.
+	 */
+	/* XXPOLICY */
+	if (zio_should_retry(zio)) {
+		zio_t *zq;
+
+		ASSERT(tvd == vd);
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));
+		/* Reset the zio so the next advance re-enters VDEV_IO_SETUP. */
+		zio->io_retries++;
+		zio->io_error = 0;
+		zio->io_flags &= ZIO_FLAG_VDEV_INHERIT;
+		/* XXPOLICY */
+		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+		/* io_offset is 64-bit: cast it so it matches %llx. */
+		dprintf("retry #%d for %s to %s offset %llx\n",
+		    zio->io_retries, zio_type_name[zio->io_type],
+		    vdev_description(vd), (u_longlong_t)zio->io_offset);
+
+		/*
+		 * If this is the first retry, do it immediately.
+		 */
+		/* XXPOLICY */
+		if (zio->io_retries == 1) {
+			zio_next_stage_async(zio);
+			return;
+		}
+
+		/*
+		 * This was not the first retry, so go through the
+		 * longer enqueue/delay/vdev_reopen() process.
+		 */
+		mutex_enter(&tvd->vdev_io_lock);
+		ASSERT(zio->io_retry_next == NULL);
+		zio->io_retry_next = zq = tvd->vdev_io_retry;
+		tvd->vdev_io_retry = zio;
+		mutex_exit(&tvd->vdev_io_lock);
+		if (zq == NULL)
+			(void) taskq_dispatch(
+			    tvd->vdev_spa->spa_vdev_retry_taskq,
+			    zio_vdev_io_retry, tvd, TQ_SLEEP);
+		return;
+	}
+
+	zio_next_stage(zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+	/* Step back so the next advance re-runs VDEV_IO_START. */
+	zio->io_stage--;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+	/* Step back so the next advance re-runs VDEV_IO_DONE. */
+	zio->io_stage--;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+	/* Flag the bypass; the next advance resumes at VDEV_IO_ASSESS. */
+	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+/*
+ * Record checksum type and host byte order in the bp, then checksum.
+ */
+static void
+zio_checksum_generate(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	int cktype = zio->io_checksum;
+
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+	BP_SET_CHECKSUM(bp, cktype);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+	zio_checksum(cktype, &bp->blk_cksum, zio->io_data, zio->io_size);
+	zio_next_stage(zio);
+}
+
+/*
+ * Seed the gang header's tail with the verifier, then checksum it.
+ */
+static void
+zio_gang_checksum_generate(zio_t *zio)
+{
+	zio_gbh_phys_t *hdr = zio->io_data;
+	zio_cksum_t out;
+
+	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
+	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	zio_set_gang_verifier(zio, &hdr->zg_tail.zbt_cksum);
+	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &out, hdr, zio->io_size);
+	zio_next_stage(zio);
+}
+
+static void
+zio_checksum_verify(zio_t *zio)
+{
+	if (zio->io_bp != NULL) {	/* only zios with a bp are checked */
+		zio->io_error = zio_checksum_error(zio);
+		if (zio->io_error) {
+			dprintf("bad checksum on vdev %s\n",
+			    vdev_description(zio->io_vd));
+		}
+	}
+	/* Advance either way; zio_next_stage() reacts to io_error. */
+	zio_next_stage(zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);	/* skip it */
+}
+
+/* Build a gang block's external verifier from its DVA and birth txg. */
+void
+zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
+{
+	dva_t *dva = ZIO_GET_DVA(zio);
+
+	zcp->zc_word[0] = DVA_GET_VDEV(dva);
+	zcp->zc_word[1] = DVA_GET_OFFSET(dva);
+	zcp->zc_word[2] = zio->io_bp->blk_birth;
+	zcp->zc_word[3] = 0;
+}
+
+/*
+ * ==========================================================================
+ * Define the pipeline
+ * ==========================================================================
+ */
+typedef void zio_pipe_stage_t(zio_t *zio);	/* one pipeline stage handler */
+/* Handler for table slots that must never run: panics with the stage. */
+static void
+zio_badop(zio_t *zio)
+{
+	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
+}
+
+zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {	/* index: io_stage */
+	zio_badop,		/* [0]: not a runnable stage */
+	zio_wait_children_ready,
+	zio_write_compress,
+	zio_checksum_generate,
+	zio_gang_pipeline,
+	zio_get_gang_header,
+	zio_rewrite_gang_members,
+	zio_free_gang_members,
+	zio_claim_gang_members,
+	zio_dva_allocate,
+	zio_dva_free,
+	zio_dva_claim,
+	zio_gang_checksum_generate,
+	zio_ready,
+	zio_dva_translate,
+	zio_vdev_io_setup,
+	zio_vdev_io_start,
+	zio_vdev_io_done,
+	zio_vdev_io_assess,
+	zio_wait_children_done,
+	zio_checksum_verify,
+	zio_read_gang_members,
+	zio_read_decompress,
+	zio_done,
+	zio_badop		/* last slot: guards against running past DONE */
+};
+
+/*
+ * Move an I/O to the next stage of the pipeline and execute that stage.
+ * There's no locking on io_stage because there's no legitimate way for
+ * multiple threads to be attempting to process the same I/O.
+ */
+void
+zio_next_stage(zio_t *zio)
+{
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    (u_longlong_t)zio->io_offset,
+		    zio->io_stage, zio->io_error);
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+	/* Advance io_stage to the next stage enabled in the pipeline. */
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+	zio_pipeline[zio->io_stage](zio);
+}
+
+void
+zio_next_stage_async(zio_t *zio)
+{
+	taskq_t *tq;
+	uint32_t pipeline = zio->io_pipeline;
+
+	ASSERT(!MUTEX_HELD(&zio->io_lock));
+	/* Pick the next enabled stage, masking the pipeline on error. */
+	if (zio->io_error) {
+		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
+		    zio, vdev_description(zio->io_vd),
+		    (u_longlong_t)zio->io_offset,
+		    zio->io_stage, zio->io_error);
+		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+			pipeline &= ZIO_ERROR_PIPELINE_MASK;
+	}
+	while (((1U << ++zio->io_stage) & pipeline) == 0)
+		continue;
+
+	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+	ASSERT(zio->io_stalled == 0);
+
+	/*
+	 * For performance, we'll probably want two sets of task queues:
+	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
+	 * part is for read performance: since we have to make a pass over
+	 * the data to checksum it anyway, we want to do this on the same CPU
+	 * that issued the read, because (assuming CPU scheduling affinity)
+	 * that thread is probably still there.  Getting this optimization
+	 * right avoids performance-hostile cache-to-cache transfers.
+	 *
+	 * Note that having two sets of task queues is also necessary for
+	 * correctness: if all of the issue threads get bogged down waiting
+	 * for dependent reads (e.g. metaslab freelist) to complete, then
+	 * there won't be any threads available to service I/O completion
+	 * interrupts.
+	 */
+	if ((1U << zio->io_stage) & zio->io_async_stages) {
+		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
+			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
+		else
+			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
+		(void) taskq_dispatch(tq,
+		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
+	} else {
+		zio_pipeline[zio->io_stage](zio);
+	}
+}
+
+/*
+ * Try to allocate an intent log block and describe it in *bp.
+ * Returns 0 on success, or the metaslab_alloc() errno on failure.
+ */
+int
+zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
+    uint64_t txg)
+{
+	int rc;
+
+	spa_config_enter(spa, RW_READER);
+	BP_ZERO(bp);
+	rc = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+	if (rc != 0) {
+		spa_config_exit(spa);
+		return (rc);
+	}
+
+	BP_SET_CHECKSUM(bp, checksum);
+	BP_SET_LSIZE(bp, size);
+	BP_SET_PSIZE(bp, size);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+	bp->blk_birth = txg;
+	spa_config_exit(spa);
+
+	return (0);
+}
+
+/*
+ * Free an intent log block.  We know it can't be a gang block, so there's
+ * nothing to do except metaslab_free() it.
+ */
+void
+zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+{
+	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+	/* txg is a uint64_t: cast it so the argument matches %llu. */
+	dprintf_bp(bp, "txg %llu: ", (u_longlong_t)txg);
+
+	spa_config_enter(spa, RW_READER);
+
+	metaslab_free(spa, BP_IDENTITY(bp), txg);
+
+	spa_config_exit(spa);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
new file mode 100644
index 000000000000..dc31527ce800
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed.  We support checksum vectors
+ * for three distinct reasons:
+ *
+ *   1. Different kinds of data need different levels of protection.
+ *	For SPA metadata, we always want a very strong checksum.
+ *	For user data, we let users make the trade-off between speed
+ *	and checksum strength.
+ *
+ *   2. Cryptographic hash and MAC algorithms are an area of active research.
+ *	It is likely that future hash functions will be at least as strong
+ *	as current best-of-breed, and may be substantially faster as well.
+ *	We want the ability to take advantage of these new hashes as soon as
+ *	they become available.
+ *
+ *   3. If someone develops hardware that can compute a strong hash quickly,
+ *	we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in five bits of the DVA.
+ * This gives us room for up to 32 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength.  When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified in the DVA encoding.
+ */
+
+/*ARGSUSED*/
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);	/* data is never examined */
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+	{{NULL,			NULL},			0, 0,	"inherit"},
+	{{NULL,			NULL},			0, 0,	"on"},
+	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"label"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"gang_header"},
+	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1,	"zilog"},
+	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0,	"fletcher2"},
+	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0,	"fletcher4"},
+	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0,	"SHA256"},
+};
+
+/*
+ * Resolve a child's checksum setting: INHERIT yields the parent's value,
+ * ON yields ZIO_CHECKSUM_ON_VALUE, anything else is used as given.
+ */
+uint8_t
+zio_checksum_select(uint8_t child, uint8_t parent)
+{
+	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+	if (child == ZIO_CHECKSUM_INHERIT)
+		return (parent);
+	return (child == ZIO_CHECKSUM_ON ? ZIO_CHECKSUM_ON_VALUE : child);
+}
+
+/*
+ * Generate the checksum; ci_zbt types store it in the block tail.
+ */
+void
+zio_checksum(uint_t checksum, zio_cksum_t *zcp, void *data, uint64_t size)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+	zio_block_tail_t *tail;
+	zio_cksum_t computed;
+
+	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	ASSERT(ci->ci_func[0] != NULL);
+	if (!ci->ci_zbt) {
+		ci->ci_func[0](data, size, zcp);
+		return;
+	}
+	tail = (zio_block_tail_t *)((char *)data + size) - 1;
+	*zcp = tail->zbt_cksum;
+	tail->zbt_magic = ZBT_MAGIC;
+	ci->ci_func[0](data, size, &computed);
+	tail->zbt_cksum = computed;
+}
+
+/* Returns 0, ECKSUM, or EINVAL (bad or unsupported checksum index). */
+int
+zio_checksum_error(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dva_t *dva = ZIO_GET_DVA(zio);
+	zio_cksum_t zc = bp->blk_cksum;
+	uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+	    BP_GET_CHECKSUM(bp);
+	int byteswap = BP_SHOULD_BYTESWAP(bp);
+	void *data = zio->io_data;
+	uint64_t size = zio->io_size;
+	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
+	zio_checksum_info_t *ci = checksum < ZIO_CHECKSUM_FUNCTIONS ?
+	    &zio_checksum_table[checksum] : NULL;	/* never index OOB */
+	zio_cksum_t actual_cksum, expected_cksum;
+
+	if (ci == NULL || ci->ci_func[0] == NULL)
+		return (EINVAL);
+
+	if (ci->ci_zbt) {
+		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+			zio_set_gang_verifier(zio, &zc);
+
+		if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+			expected_cksum = zbt->zbt_cksum;
+			byteswap_uint64_array(&expected_cksum,
+			    sizeof (zio_cksum_t));
+			zbt->zbt_cksum = zc;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+			ci->ci_func[1](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+			byteswap_uint64_array(&zbt->zbt_cksum,
+			    sizeof (zio_cksum_t));
+		} else {
+			expected_cksum = zbt->zbt_cksum;
+			zbt->zbt_cksum = zc;
+			ci->ci_func[0](data, size, &actual_cksum);
+			zbt->zbt_cksum = expected_cksum;
+		}
+		zc = expected_cksum;
+	} else {
+		ASSERT(!DVA_GET_GANG(dva));
+		ci->ci_func[byteswap](data, size, &actual_cksum);
+	}
+	if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
+	    (actual_cksum.zc_word[1] - zc.zc_word[1]) |
+	    (actual_cksum.zc_word[2] - zc.zc_word[2]) |
+	    (actual_cksum.zc_word[3] - zc.zc_word[3]))
+		return (ECKSUM);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c
new file mode 100644
index 000000000000..51d85172bbff
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c
@@ -0,0 +1,134 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/zfs_context.h>
+#include <sys/compress.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Compression vectors.
+ */
+
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+	{NULL,			NULL,			"inherit"},
+	{NULL,			NULL,			"on"},
+	{NULL,			NULL,			"uncompressed"},
+	{lzjb_compress,		lzjb_decompress,	"lzjb"},
+};
+
+/*
+ * Resolve a child's compression setting: INHERIT yields the parent's
+ * value, ON yields ZIO_COMPRESS_ON_VALUE, anything else is used as given.
+ */
+uint8_t
+zio_compress_select(uint8_t child, uint8_t parent)
+{
+	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+
+	if (child == ZIO_COMPRESS_INHERIT)
+		return (parent);
+	return (child == ZIO_COMPRESS_ON ? ZIO_COMPRESS_ON_VALUE : child);
+}
+
+int
+zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
+    uint64_t *destsizep, uint64_t *destbufsizep)
+{
+	uint64_t *word, *word_end;
+	uint64_t ciosize, gapsize, destbufsize;
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	char *dest;
+	uint_t allzero;
+
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	ASSERT(ci->ci_compress != NULL);
+	/* Returns 1 with the three out-params filled in, or 0 on failure. */
+	/*
+	 * If the data is all zeroes, we don't even need to allocate
+	 * a block for it.  We indicate this by setting *destsizep = 0.
+	 */
+	allzero = 1;
+	word = src;
+	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
+	while (word < word_end) {
+		if (*word++ != 0) {
+			allzero = 0;
+			break;
+		}
+	}
+	if (allzero) {
+		*destp = NULL;
+		*destsizep = 0;
+		*destbufsizep = 0;
+		return (1);
+	}
+
+	/* Require at least 12.5% savings: cap output at 7/8 of the input */
+	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
+	if (destbufsize == 0)
+		return (0);
+	dest = zio_buf_alloc(destbufsize);
+	/* A result larger than destbufsize counts as failure. */
+	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
+	    (size_t)destbufsize);
+	if (ciosize > destbufsize) {
+		zio_buf_free(dest, destbufsize);
+		return (0);
+	}
+
+	/* Cool.  We compressed at least as much as we were hoping to. */
+
+	/* For security, zero the pad so no stale heap data reaches disk */
+	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
+	if (gapsize != 0) {
+		bzero(dest + ciosize, gapsize);
+		ciosize += gapsize;
+	}
+
+	ASSERT3U(ciosize, <=, destbufsize);
+	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
+	*destp = dest;
+	*destsizep = ciosize;
+	*destbufsizep = destbufsize;
+
+	return (1);
+}
+
+/* Decompress src into dest using the table entry selected by cpfunc. */
+int
+zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
+    void *dest, uint64_t destsize)
+{
+	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+	return (ci->ci_decompress(src, dest, srcsize, destsize));
+}
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
new file mode 100644
index 000000000000..ceb9e24d72d7
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -0,0 +1,793 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/dsk/<pool_name>/<dataset_name>
+ * /dev/zvol/rdsk/<pool_name>/<dataset_name>
+ *
+ * These links are created by the ZFS-specific devfsadm link generator.
+ * Volumes are persistent through reboot.  No user command needs to be
+ * run before opening and using a device.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/aio_req.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dsl_prop.h>
+#include <sys/dkio.h>
+#include <sys/efi_partition.h>
+#include <sys/byteorder.h>
+#include <sys/pathname.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/crc32.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mkdev.h>
+
+#include "zfs_namecheck.h"
+
+#define	ZVOL_OBJ		1ULL
+#define	ZVOL_ZAP_OBJ		2ULL
+#define	ZVOL_MAX_MINOR		MAXMIN32
+
+static void *zvol_state;
+
+/*
+ * This lock protects the zvol_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes.  It also protects temporary opens of the dataset so that,
+ * e.g., an open doesn't get a spurious EBUSY.
+ */
+static kmutex_t zvol_state_lock;
+static uint32_t zvol_minors;
+
+/*
+ * The in-core state of each volume; one per minor, kept in zvol_state.
+ */
+typedef struct zvol_state {
+	char		zv_name[MAXPATHLEN]; /* dataset name, from zc_name */
+	uint64_t	zv_volsize;	/* amount of space we advertise */
+	minor_t		zv_minor;	/* minor number */
+	uint8_t		zv_min_bs;	/* minimum addressable block shift */
+	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
+	objset_t	*zv_objset;	/* objset handle */
+	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
+	uint32_t	zv_open_count[OTYPCNT];	/* opens, by open type */
+	uint32_t	zv_total_opens;	/* sum of zv_open_count[] */
+} zvol_state_t;
+
+static void
+zvol_size_changed(zvol_state_t *zv, dev_t dev)
+{
+	dev = makedevice(getmajor(dev), zv->zv_minor);
+	/* dev now names this volume's minor; publish its size properties. */
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Size", zv->zv_volsize) == DDI_SUCCESS);
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
+}
+
+int
+zvol_check_volsize(zfs_cmd_t *zc)
+{
+	if (zc->zc_volsize != 0) {
+		zc->zc_volsize = P2ROUNDUP(zc->zc_volsize, SPA_MAXBLOCKSIZE);
+#ifdef _ILP32
+		if (zc->zc_volsize - 1 > SPEC_MAXOFFSET_T)
+			return (EOVERFLOW);	/* exceeds SPEC_MAXOFFSET_T */
+#endif
+		return (0);			/* size rounded up in place */
+	}
+	return (EINVAL);			/* zero-sized volume */
+}
+
+int
+zvol_check_volblocksize(zfs_cmd_t *zc)
+{
+	/* Accept only power-of-two sizes within the SPA block size limits. */
+	if (ISP2(zc->zc_volblocksize) &&
+	    zc->zc_volblocksize >= SPA_MINBLOCKSIZE &&
+	    zc->zc_volblocksize <= SPA_MAXBLOCKSIZE)
+		return (0);
+	return (EDOM);
+}
+
+static void
+zvol_readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zvol_state_t *vol = arg;	/* registered by zvol_create_minor() */
+
+	vol->zv_readonly = (uint8_t)newval;	/* cache "readonly" prop */
+}
+
+int
+zvol_get_stats(zfs_cmd_t *zc, objset_t *os)
+{
+	dmu_object_info_t info;
+	int err;
+
+	/* The volume size is the "size" entry of ZVOL_ZAP_OBJ. */
+	err = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize);
+	if (err != 0)
+		return (err);
+
+	/* The volume block size is ZVOL_OBJ's data block size. */
+	err = dmu_object_info(os, ZVOL_OBJ, &info);
+	if (err != 0)
+		return (err);
+	zc->zc_volblocksize = info.doi_data_block_size;
+	return (0);
+}
+
+/*
+ * Return the lowest unused minor number, or 0 if none is free.
+ * Minor 0 itself is never handed out.
+ */
+static minor_t
+zvol_minor_alloc(void)
+{
+	minor_t candidate = 1;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	while (candidate <= ZVOL_MAX_MINOR &&
+	    ddi_get_soft_state(zvol_state, candidate) != NULL)
+		candidate++;
+	return (candidate <= ZVOL_MAX_MINOR ? candidate : 0);
+}
+
+/*
+ * Return the state of the volume named 'name', or NULL if no minor has it.
+ */
+static zvol_state_t *
+zvol_minor_lookup(char *name)
+{
+	minor_t minor;
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
+		zv = ddi_get_soft_state(zvol_state, minor);
+		if (zv != NULL && strcmp(zv->zv_name, name) == 0)
+			return (zv);
+	}
+	/* No match: never hand back the last, non-matching entry. */
+	return (NULL);
+}
+
+void
+zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
+{
+	zfs_cmd_t *zc = arg;
+	int error;
+
+	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, zc->zc_volblocksize,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+	/* Claim the companion ZAP object used for volume properties. */
+	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+	/* Record the requested volume size in its "size" entry. */
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx);
+	ASSERT(error == 0);
+}
+
+/*
+ * Create the soft state and minor nodes (raw and block) for a volume.
+ */
+int
+zvol_create_minor(zfs_cmd_t *zc)
+{
+	char *name = zc->zc_name;
+	dev_t dev = zc->zc_dev;
+	zvol_state_t *zv;
+	objset_t *os;
+	uint64_t volsize;
+	minor_t minor = 0;
+	struct pathname linkpath;
+	int ds_mode = DS_MODE_PRIMARY;
+	vnode_t *vp = NULL;
+	char *devpath;
+	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
+	char chrbuf[30], blkbuf[30];
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(name)) != NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (EEXIST);
+	}
+	/* A dataset name containing '@' is opened read-only. */
+	if (strchr(name, '@') != 0)
+		ds_mode |= DS_MODE_READONLY;
+
+	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
+
+	if (error) {
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+
+	if (error) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	/*
+	 * If there's an existing /dev/zvol symlink, try to use the
+	 * same minor number we used last time.
+	 */
+	devpath = kmem_alloc(devpathlen, KM_SLEEP);
+
+	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);
+
+	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
+
+	kmem_free(devpath, devpathlen);
+
+	if (error == 0 && vp->v_type != VLNK)
+		error = EINVAL;
+
+	if (error == 0) {
+		pn_alloc(&linkpath);
+		error = pn_getsymlink(vp, &linkpath, kcred);
+		if (error == 0) {
+			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
+			if (ms != NULL) {
+				ms += strlen(ZVOL_PSEUDO_DEV);
+				minor = stoi(&ms);
+			}
+		}
+		pn_free(&linkpath);
+	}
+
+	if (vp != NULL)
+		VN_RELE(vp);
+
+	/*
+	 * If we found a minor but it's already in use, we must pick a new one.
+	 */
+	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
+		minor = 0;
+	/* No usable hint: fall back to the first free minor. */
+	if (minor == 0)
+		minor = zvol_minor_alloc();
+
+	if (minor == 0) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, name);
+
+	(void) sprintf(chrbuf, "%uc,raw", minor);
+
+	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+
+	(void) sprintf(blkbuf, "%uc", minor);
+
+	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_remove_minor_node(zfs_dip, chrbuf);
+		ddi_soft_state_free(zvol_state, minor);
+		dmu_objset_close(os);
+		mutex_exit(&zvol_state_lock);
+		return (EAGAIN);
+	}
+	/* Both nodes exist; fill in the soft state allocated above. */
+	zv = ddi_get_soft_state(zvol_state, minor);
+
+	(void) strcpy(zv->zv_name, name);
+	zv->zv_min_bs = DEV_BSHIFT;
+	zv->zv_minor = minor;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+	zv->zv_mode = ds_mode;
+
+	zvol_size_changed(zv, dev);
+	/* Keep zv_readonly in step with the "readonly" property. */
+	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+
+	zvol_minors++;
+
+	mutex_exit(&zvol_state_lock);
+
+	return (0);
+}
+
+/*
+ * Tear down the minor nodes and in-core state of the volume named by
+ * zc->zc_name.  Fails with ENXIO if the volume has no minor and with
+ * EBUSY while it is still open.
+ */
+int
+zvol_remove_minor(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	char nodename[30];
+	int error = 0;
+
+	mutex_enter(&zvol_state_lock);
+	zv = zvol_minor_lookup(zc->zc_name);
+	if (zv == NULL)
+		error = ENXIO;
+	else if (zv->zv_total_opens != 0)
+		error = EBUSY;
+	if (error != 0) {
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	/* Remove the raw (character) node, then the block node. */
+	(void) sprintf(nodename, "%uc,raw", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, nodename);
+	(void) sprintf(nodename, "%uc", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, nodename);
+
+	/* Stop "readonly" callbacks before the objset is closed. */
+	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+	dmu_objset_close(zv->zv_objset);
+	zv->zv_objset = NULL;
+
+	/* Release the soft state; zv must not be used after this. */
+	ddi_soft_state_free(zvol_state, zv->zv_minor);
+	zvol_minors--;
+
+	mutex_exit(&zvol_state_lock);
+	return (0);
+}
+
+int
+zvol_set_volsize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dev_t dev = zc->zc_dev;
+	dmu_tx_t *tx;
+	int error;
+
+	if ((error = zvol_check_volsize(zc)) != 0)
+		return (error);
+
+	mutex_enter(&zvol_state_lock);
+
+	if ((zv = zvol_minor_lookup(zc->zc_name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		mutex_exit(&zvol_state_lock);
+		return (EROFS);
+	}
+	/* One tx updates "size" and frees any data beyond the new end. */
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1);
+	dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
+
+	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &zc->zc_volsize, tx);
+	if (error == 0)
+		dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
+		    DMU_OBJECT_END, tx);
+
+	dmu_tx_commit(tx);
+	/* Adopt and publish the new size only if the update succeeded. */
+	if (error == 0) {
+		zv->zv_volsize = zc->zc_volsize;
+		zvol_size_changed(zv, dev);
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+int
+zvol_set_volblocksize(zfs_cmd_t *zc)
+{
+	zvol_state_t *zv;
+	dmu_tx_t *tx;
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = zvol_minor_lookup(zc->zc_name);
+	if (zv == NULL) {
+		error = ENXIO;
+		goto out;
+	}
+	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
+		error = EROFS;
+		goto out;
+	}
+
+	/* Change the block size under a tx that holds ZVOL_OBJ's bonus. */
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error != 0) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+	error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+	    zc->zc_volblocksize, 0, tx);
+	if (error == ENOTSUP)
+		error = EBUSY;		/* ENOTSUP is reported as EBUSY */
+	dmu_tx_commit(tx);
+out:
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(*devp);
+	zvol_state_t *zv;
+	int error = 0;
+
+	if (minor == 0)			/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL) {
+		error = ENXIO;
+		goto out;
+	}
+	ASSERT(zv->zv_objset != NULL);
+
+	/* Writers are refused when the volume or its open mode is read-only. */
+	if ((flag & FWRITE) &&
+	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
+		error = EROFS;
+		goto out;
+	}
+
+	/* Layered opens are each counted; other types count only once. */
+	if (otyp == OTYP_LYR || zv->zv_open_count[otyp] == 0) {
+		zv->zv_open_count[otyp]++;
+		zv->zv_total_opens++;
+	}
+out:
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+/*
+ * Undo one zvol_open(): drop the per-type and total open counts.
+ */
+/*ARGSUSED*/
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+	minor_t minor = getminor(dev);
+	zvol_state_t *zv;
+	int error = 0;
+
+	if (minor == 0)		/* This is the control device */
+		return (0);
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, minor);
+	if (zv == NULL)
+		error = ENXIO;
+
+	/*
+	 * Ignoring a close when nothing is open is a workaround for the
+	 * following DDI bug:
+	 * 6343604 specfs race: multiple "last-close" of the same device
+	 */
+	if (error == 0 && zv->zv_total_opens != 0) {
+		/*
+		 * If the open count is zero, this is a spurious close.
+		 * That indicates a bug in the kernel / DDI framework.
+		 */
+		ASSERT(zv->zv_open_count[otyp] != 0);
+		ASSERT(zv->zv_total_opens != 0);
+
+		/*
+		 * You may get multiple opens, but only one close.
+		 */
+		zv->zv_open_count[otyp]--;
+		zv->zv_total_opens--;
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	return (error);
+}
+
+int
+zvol_strategy(buf_t *bp)
+{
+	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
+	uint64_t off, volsize;
+	size_t size, resid;
+	char *addr;
+	int error = 0;
+
+	if (zv == NULL) {
+		bioerror(bp, ENXIO);
+		biodone(bp);
+		return (0);
+	}
+
+	if (getminor(bp->b_edev) == 0) {
+		bioerror(bp, EINVAL);
+		biodone(bp);
+		return (0);
+	}
+
+	if (zv->zv_readonly && !(bp->b_flags & B_READ)) {
+		bioerror(bp, EROFS);
+		biodone(bp);
+		return (0);
+	}
+
+	off = ldbtob(bp->b_blkno);
+	volsize = zv->zv_volsize;
+
+	ASSERT(zv->zv_objset != NULL);
+	/* Walk the mapped-in buffer, moving at most 1MB per iteration. */
+	bp_mapin(bp);
+	addr = bp->b_un.b_addr;
+	resid = bp->b_bcount;
+
+	while (resid != 0 && off < volsize) {
+
+		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */
+
+		if (size > volsize - off)	/* don't write past the end */
+			size = volsize - off;
+
+		if (bp->b_flags & B_READ) {
+			error = dmu_read_canfail(zv->zv_objset, ZVOL_OBJ,
+			    off, size, addr);
+		} else {
+			dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error) {
+				dmu_tx_abort(tx);
+			} else {
+				dmu_write(zv->zv_objset, ZVOL_OBJ,
+				    off, size, addr, tx);
+				dmu_tx_commit(tx);
+			}
+		}
+		if (error)
+			break;
+		off += size;
+		addr += size;
+		resid -= size;
+	}
+	/* Report an error only if nothing at all was transferred. */
+	if ((bp->b_resid = resid) == bp->b_bcount)
+		bioerror(bp, off > volsize ? EINVAL : error);
+
+	biodone(bp);
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
+}
+
+/*ARGSUSED*/
+int
+zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
+{
+	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
+}
+
+/*ARGSUSED*/
+int
+zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
+}
+
+/*ARGSUSED*/
+int
+zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
+{
+	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
+}
+
+/*
+ * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
+ */
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+	zvol_state_t *zv;
+	struct dk_cinfo dkc;
+	struct dk_minfo dkm;
+	dk_efi_t efi;
+	efi_gpt_t gpt;
+	efi_gpe_t gpe;
+	struct uuid uuid = EFI_RESERVED;
+	uint32_t crc;
+	int error = 0;
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = ddi_get_soft_state(zvol_state, getminor(dev));
+
+	if (zv == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);
+	}
+
+	switch (cmd) {
+
+	case DKIOCINFO:
+		bzero(&dkc, sizeof (dkc));
+		(void) strcpy(dkc.dki_cname, "zvol");
+		(void) strcpy(dkc.dki_dname, "zvol");
+		dkc.dki_ctype = DKC_UNKNOWN;
+		dkc.dki_maxtransfer = 1 << 15;
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGMEDIAINFO:
+		bzero(&dkm, sizeof (dkm));
+		dkm.dki_lbsize = 1U << zv->zv_min_bs;
+		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+		dkm.dki_media_type = DK_UNKNOWN;
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
+			error = EFAULT;
+		return (error);
+
+	case DKIOCGETEFI:
+		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
+			mutex_exit(&zvol_state_lock);
+			return (EFAULT);
+		}
+
+		bzero(&gpt, sizeof (gpt));
+		bzero(&gpe, sizeof (gpe));
+
+		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
+
+		if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
+			mutex_exit(&zvol_state_lock);
+			return (EINVAL);
+		}
+
+		efi.dki_length = sizeof (gpt) + sizeof (gpe);
+		/* Describe the volume as one partition covering every block. */
+		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
+		gpt.efi_gpt_Revision = LE_32(EFI_VERSION102);
+		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
+		gpt.efi_gpt_FirstUsableLBA = LE_64(0ULL);
+		gpt.efi_gpt_LastUsableLBA =
+		    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
+		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
+		gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
+
+		UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
+		gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
+		gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
+		/* CRC the entry first: the header CRC includes that checksum. */
+		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
+		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
+
+		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
+		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
+		/* The header goes out first, with the entry right behind it. */
+		mutex_exit(&zvol_state_lock);
+		if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag) ||
+		    ddi_copyout(&gpe, efi.dki_data + 1, sizeof (gpe), flag))
+			error = EFAULT;
+		return (error);
+
+	default:
+		error = ENOTSUP;
+		break;
+
+	}
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+int
+zvol_busy(void)
+{
+	return (zvol_minors != 0);	/* nonzero while any minor exists */
+}
+
+void
+zvol_init(void)
+{
+	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zvol_fini(void)
+{
+	mutex_destroy(&zvol_state_lock);
+	ddi_soft_state_fini(&zvol_state);
+}
diff --git a/usr/src/uts/common/io/1394/s1394_hotplug.c b/usr/src/uts/common/io/1394/s1394_hotplug.c
index 9aa9850d484c..40c2e1902237 100644
--- a/usr/src/uts/common/io/1394/s1394_hotplug.c
+++ b/usr/src/uts/common/io/1394/s1394_hotplug.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -164,7 +164,7 @@ s1394_create_devinfo(s1394_hal_t *hal, s1394_node_t *node, uint32_t *unit_dir,
 	hal_dip = hal->halinfo.dip;
 
 	/* Allocate and init a new device node instance. */
-	result = ndi_devi_alloc(hal_dip, "unit", (dnode_t)DEVI_SID_NODEID,
+	result = ndi_devi_alloc(hal_dip, "unit", (pnode_t)DEVI_SID_NODEID,
 	    &target_dip);
 
 	if (result != NDI_SUCCESS) {
diff --git a/usr/src/uts/common/io/1394/targets/scsa1394/hba.c b/usr/src/uts/common/io/1394/targets/scsa1394/hba.c
index d0c31e825013..84829dcc2e3b 100644
--- a/usr/src/uts/common/io/1394/targets/scsa1394/hba.c
+++ b/usr/src/uts/common/io/1394/targets/scsa1394/hba.c
@@ -717,7 +717,7 @@ scsa1394_create_children(scsa1394_state_t *sp)
 		scsa1394_dtype2name(dtype, &node_name, &driver_name);
 
 		ndi_devi_alloc_sleep(sp->s_dip, node_name,
-		    (dnode_t)DEVI_SID_NODEID, &cdip);
+		    (pnode_t)DEVI_SID_NODEID, &cdip);
 
 		ret = ndi_prop_update_int(DDI_DEV_T_NONE, cdip, "target", 0);
 		if (ret != DDI_PROP_SUCCESS) {
diff --git a/usr/src/uts/common/io/consconfig_dacf.c b/usr/src/uts/common/io/consconfig_dacf.c
index cae6a1754bda..7ff3c0bbe7b9 100644
--- a/usr/src/uts/common/io/consconfig_dacf.c
+++ b/usr/src/uts/common/io/consconfig_dacf.c
@@ -314,7 +314,7 @@ consconfig_dprintf(int l, const char *fmt, ...)
 char *
 get_alias(char *alias, char *buf)
 {
-	dnode_t node;
+	pnode_t node;
 
 	/* OBP >= 2.4 has /aliases */
 	if ((node = prom_alias_node()) == OBP_BADNODE)
diff --git a/usr/src/uts/common/io/devinfo.c b/usr/src/uts/common/io/devinfo.c
index ec6ba40b056e..4fa54bc3e55a 100644
--- a/usr/src/uts/common/io/devinfo.c
+++ b/usr/src/uts/common/io/devinfo.c
@@ -157,7 +157,7 @@ struct di_dkey {
 	dev_info_t	*dk_dip;
 	major_t		dk_major;
 	int		dk_inst;
-	dnode_t		dk_nodeid;
+	pnode_t		dk_nodeid;
 };
 
 struct di_pkey {
diff --git a/usr/src/uts/common/io/dktp/dcdev/dadk.c b/usr/src/uts/common/io/dktp/dcdev/dadk.c
index 0e43bf3a1d6c..3e7c6bbaa4fe 100644
--- a/usr/src/uts/common/io/dktp/dcdev/dadk.c
+++ b/usr/src/uts/common/io/dktp/dcdev/dadk.c
@@ -159,6 +159,7 @@ static char	*dadk_cmds[] = {
 	"\030cdrom read offset",	/* DCMD_READOFFSET  24	*/
 	"\031cdrom read mode 2",	/* DCMD_READMODE2  25	*/
 	"\032cdrom volume control",	/* DCMD_VOLCTRL  26	*/
+	"\033flush cache",		/* DCMD_FLUSH_CACHE  27	*/
 	NULL
 };
 
@@ -384,6 +385,8 @@ int
 dadk_open(opaque_t objp, int flag)
 {
 	struct dadk *dadkp = (struct dadk *)objp;
+	int error;
+	int wce;
 
 	if (!dadkp->dad_rmb) {
 		if (dadkp->dad_phyg.g_cap) {
@@ -409,6 +412,23 @@ dadk_open(opaque_t objp, int flag)
 	    mutex_exit(&dadkp->dad_mutex);
 	}
 
+	/*
+	 * get write cache enable state
+	 * If there is an error, must assume that write cache
+	 * is enabled.
+	 * NOTE: Since there is currently no Solaris mechanism to
+	 * change the state of the Write Cache Enable feature,
+	 * this code just checks the value of the WCE bit
+	 * obtained at device init time.  If a mechanism
+	 * is added to the driver to change WCE, dad_wce
+	 * must be updated appropriately.
+	 */
+	error = CTL_IOCTL(dadkp->dad_ctlobjp, DIOCTL_GETWCE,
+	    (uintptr_t)&wce, 0);
+	mutex_enter(&dadkp->dad_mutex);
+	dadkp->dad_wce = (error != 0) || (wce != 0);
+	mutex_exit(&dadkp->dad_mutex);
+
 	/* logical disk geometry */
 	CTL_IOCTL(dadkp->dad_ctlobjp, DIOCTL_GETGEOM,
 	    (uintptr_t)&dadkp->dad_logg, 0);
@@ -625,6 +645,91 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag,
 				return (EINVAL);
 		}
 	    }
+	case DKIOCFLUSHWRITECACHE:
+		{
+			struct buf *bp;
+			int err = 0;
+			struct dk_callback *dkc = (struct dk_callback *)arg;
+			struct cmpkt *pktp;
+			int is_sync = 1;
+
+			mutex_enter(&dadkp->dad_mutex);
+			if (dadkp->dad_noflush || !  dadkp->dad_wce) {
+				err = dadkp->dad_noflush ? ENOTSUP : 0;
+				mutex_exit(&dadkp->dad_mutex);
+				/*
+				 * If a callback was requested: a
+				 * callback will always be done if the
+				 * caller saw the DKIOCFLUSHWRITECACHE
+				 * ioctl return 0, and never done if the
+				 * caller saw the ioctl return an error.
+				 */
+				if ((flag & FKIOCTL) && dkc != NULL &&
+				    dkc->dkc_callback != NULL) {
+					(*dkc->dkc_callback)(dkc->dkc_cookie,
+					    err);
+					/*
+					 * Did callback and reported error.
+					 * Since we did a callback, ioctl
+					 * should return 0.
+					 */
+					err = 0;
+				}
+				return (err);
+			}
+			mutex_exit(&dadkp->dad_mutex);
+
+			bp = getrbuf(KM_SLEEP);
+
+			bp->b_edev = dev;
+			bp->b_dev  = cmpdev(dev);
+			bp->b_flags = B_BUSY;
+			bp->b_resid = 0;
+			bp->b_bcount = 0;
+			SET_BP_SEC(bp, 0);
+
+			if ((flag & FKIOCTL) && dkc != NULL &&
+			    dkc->dkc_callback != NULL) {
+				struct dk_callback *dkc2 =
+				    (struct dk_callback *)kmem_zalloc(
+				    sizeof (struct dk_callback), KM_SLEEP);
+
+				bcopy(dkc, dkc2, sizeof (*dkc2));
+				/*
+				 * Borrow b_list to carry private data
+				 * to the b_iodone func.
+				 */
+				bp->b_list = (struct buf *)dkc2;
+				bp->b_iodone = dadk_flushdone;
+				is_sync = 0;
+			}
+
+			/*
+			 * Setup command pkt
+			 * dadk_pktprep() can't fail since DDI_DMA_SLEEP set
+			 */
+			pktp = dadk_pktprep(dadkp, NULL, bp,
+			    dadk_iodone, DDI_DMA_SLEEP, NULL);
+
+			pktp->cp_time = DADK_FLUSH_CACHE_TIME;
+
+			*((char *)(pktp->cp_cdbp)) = DCMD_FLUSH_CACHE;
+			pktp->cp_byteleft = 0;
+			pktp->cp_private = NULL;
+			pktp->cp_secleft = 0;
+			pktp->cp_srtsec = -1;
+			pktp->cp_bytexfer = 0;
+
+			CTL_IOSETUP(dadkp->dad_ctlobjp, pktp);
+
+			FLC_ENQUE(dadkp->dad_flcobjp, bp);
+
+			if (is_sync) {
+				err = biowait(bp);
+				freerbuf(bp);
+			}
+			return (err);
+		}
 	default:
 		if (!dadkp->dad_rmb)
 			return (CTL_IOCTL(dadkp->dad_ctlobjp, cmd, arg, flag));
@@ -704,6 +809,20 @@ dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg, int flag,
 	return (dadk_rmb_ioctl(dadkp, cmd, arg, flag, 0));
 }
 
+int
+dadk_flushdone(struct buf *bp)
+{
+	/* dadk_ioctl() stored its private dk_callback copy in b_list. */
+	struct dk_callback *dkc = (struct dk_callback *)bp->b_list;
+	int error = geterror(bp);
+
+	ASSERT(dkc != NULL && dkc->dkc_callback != NULL);
+	dkc->dkc_callback(dkc->dkc_cookie, error);
+	kmem_free(dkc, sizeof (*dkc));		/* done with the copy ... */
+	freerbuf(bp);				/* ... and with the buf */
+	return (0);
+}
+
 int
 dadk_getphygeom(opaque_t objp, struct tgdk_geom *dkgeom_p)
 {
@@ -957,11 +1076,10 @@ static int
 dadk_ioretry(struct cmpkt *pktp, int action)
 {
 	struct buf *bp;
-	struct dadk *dadkp;
+	struct dadk *dadkp = PKT2DADK(pktp);
 
 	switch (action) {
 	case QUE_COMMAND:
-		dadkp = PKT2DADK(pktp);
 		if (pktp->cp_retry++ < DADK_RETRY_COUNT) {
 			CTL_IOSETUP(dadkp->dad_ctlobjp, pktp);
 			if (CTL_TRANSPORT(dadkp->dad_ctlobjp, pktp) ==
@@ -981,8 +1099,22 @@ dadk_ioretry(struct cmpkt *pktp, int action)
 		bp = pktp->cp_bp;
 		bp->b_resid += pktp->cp_byteleft - pktp->cp_bytexfer +
 		    pktp->cp_resid;
-		if (geterror(bp) == 0)
-			bioerror(bp, EIO);
+		if (geterror(bp) == 0) {
+			if ((*((char *)(pktp->cp_cdbp)) == DCMD_FLUSH_CACHE) &&
+			    (pktp->cp_dev_private == (opaque_t)dadkp) &&
+			    ((int)(*(char *)pktp->cp_scbp) == DERR_ABORT)) {
+				/*
+				 * Flag "unimplemented" responses for
+				 * DCMD_FLUSH_CACHE as ENOTSUP
+				 */
+				bioerror(bp, ENOTSUP);
+				mutex_enter(&dadkp->dad_mutex);
+				dadkp->dad_noflush = 1;
+				mutex_exit(&dadkp->dad_mutex);
+			} else {
+				bioerror(bp, EIO);
+			}
+		}
 		/*FALLTHROUGH*/
 	case COMMAND_DONE:
 	default:
diff --git a/usr/src/uts/common/io/hotplug/pcicfg/pcicfg.c b/usr/src/uts/common/io/hotplug/pcicfg/pcicfg.c
index 6a740dcf9778..90979eca9e4b 100644
--- a/usr/src/uts/common/io/hotplug/pcicfg/pcicfg.c
+++ b/usr/src/uts/common/io/hotplug/pcicfg/pcicfg.c
@@ -558,7 +558,7 @@ pcicfg_configure(dev_info_t *devi, uint_t device)
 	bus = pci_bus_range.lo; /* primary bus number of this bus node */
 
 	ndi_devi_alloc_sleep(devi, "hp_attachment",
-		(dnode_t)DEVI_SID_NODEID, &attach_point);
+		(pnode_t)DEVI_SID_NODEID, &attach_point);
 
 	ndi_devi_enter(devi, &circ);
 	for (func = 0; func < PCICFG_MAX_FUNCTION; func++) {
@@ -776,7 +776,7 @@ pcicfg_configure_ntbridge(dev_info_t *new_device, uint_t bus, uint_t device)
 	for (devno = pcicfg_start_devno; devno < max_devs; devno++) {
 
 		ndi_devi_alloc_sleep(new_device, DEVI_PSEUDO_NEXNAME,
-		    (dnode_t)DEVI_SID_NODEID, &new_ntbridgechild);
+		    (pnode_t)DEVI_SID_NODEID, &new_ntbridgechild);
 
 		if (pcicfg_add_config_reg(new_ntbridgechild, next_bus, devno, 0)
 					!= DDI_PROP_SUCCESS) {
@@ -1104,7 +1104,7 @@ pcicfg_ntbridge_unconfigure_child(dev_info_t *new_device, uint_t devno)
 	bus = pci_bus_range.lo; /* primary bus number of this bus node */
 
 	ndi_devi_alloc_sleep(new_device, DEVI_PSEUDO_NEXNAME,
-	    (dnode_t)DEVI_SID_NODEID, &new_ntbridgechild);
+	    (pnode_t)DEVI_SID_NODEID, &new_ntbridgechild);
 
 	if (pcicfg_add_config_reg(new_ntbridgechild, bus, devno, 0)
 				!= DDI_PROP_SUCCESS) {
@@ -3255,7 +3255,7 @@ pcicfg_probe_children(dev_info_t *parent, uint_t bus,
 
 	ndi_devi_enter(parent, &circ);
 	ndi_devi_alloc_sleep(parent, DEVI_PSEUDO_NEXNAME,
-		(dnode_t)DEVI_SID_NODEID, &new_child);
+		(pnode_t)DEVI_SID_NODEID, &new_child);
 
 	if (pcicfg_add_config_reg(new_child, bus,
 		device, func) != DDI_SUCCESS) {
@@ -3908,7 +3908,7 @@ pcicfg_create_ac_child(dev_info_t *dip)
 	dev_info_t	*cdip;
 	char		*compat[1];
 
-	ndi_devi_alloc_sleep(dip, "se", (dnode_t)DEVI_SID_NODEID, &cdip);
+	ndi_devi_alloc_sleep(dip, "se", (pnode_t)DEVI_SID_NODEID, &cdip);
 	compat[0] = kmem_alloc(strlen("acse") + 1, KM_SLEEP);
 	(void) strcpy(compat[0], "acse");
 	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
diff --git a/usr/src/uts/common/io/hotplug/pcihp/pcihp.c b/usr/src/uts/common/io/hotplug/pcihp/pcihp.c
index 196abddadc47..46dbcd10a2d4 100644
--- a/usr/src/uts/common/io/hotplug/pcihp/pcihp.c
+++ b/usr/src/uts/common/io/hotplug/pcihp/pcihp.c
@@ -3563,7 +3563,7 @@ pcihp_config_setup(dev_info_t **dip, ddi_acc_handle_t *handle,
 	bus = pci_bus_range.lo;
 
 	if (ndi_devi_alloc(pdip, DEVI_PSEUDO_NEXNAME,
-	    (dnode_t)DEVI_SID_NODEID, dip) != NDI_SUCCESS) {
+	    (pnode_t)DEVI_SID_NODEID, dip) != NDI_SUCCESS) {
 
 		PCIHP_DEBUG((CE_NOTE, "Failed to alloc probe node\n"));
 		return (PCIHP_FAILURE);
diff --git a/usr/src/uts/common/io/i2o/pci_to_i2o.c b/usr/src/uts/common/io/i2o/pci_to_i2o.c
index 7da3a13a48e5..0dc8105d0e08 100644
--- a/usr/src/uts/common/io/i2o/pci_to_i2o.c
+++ b/usr/src/uts/common/io/i2o/pci_to_i2o.c
@@ -866,7 +866,7 @@ i2o_create_devinfo(iop_nexus_instance_t *iop)
 
 		/* create the devinfo node */
 		if (ndi_devi_alloc(iop->dip, nodename,
-			(dnode_t)DEVI_SID_NODEID, &cdip) != NDI_SUCCESS) {
+			(pnode_t)DEVI_SID_NODEID, &cdip) != NDI_SUCCESS) {
 			cmn_err(CE_WARN,
 				"i2o_create_devinfo: ndi_devi_alloc failed");
 			goto fail;
diff --git a/usr/src/uts/common/io/i8042.c b/usr/src/uts/common/io/i8042.c
index e5884400abc0..ea8be871a89a 100644
--- a/usr/src/uts/common/io/i8042.c
+++ b/usr/src/uts/common/io/i8042.c
@@ -1041,7 +1041,7 @@ alloc_kb_mouse(dev_info_t *i8042_dip)
 
 	/* mouse */
 	ndi_devi_alloc_sleep(i8042_dip, "mouse",
-	    (dnode_t)DEVI_SID_NODEID, &xdip);
+	    (pnode_t)DEVI_SID_NODEID, &xdip);
 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, xdip,
 	    "reg", 1);
 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
@@ -1052,7 +1052,7 @@ alloc_kb_mouse(dev_info_t *i8042_dip)
 
 	/* keyboard */
 	ndi_devi_alloc_sleep(i8042_dip, "keyboard",
-	    (dnode_t)DEVI_SID_NODEID, &xdip);
+	    (pnode_t)DEVI_SID_NODEID, &xdip);
 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, xdip,
 	    "reg", 0);
 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
diff --git a/usr/src/uts/common/io/ib/ibnex/ibnex.c b/usr/src/uts/common/io/ib/ibnex/ibnex.c
index a62c669f8316..8610834fad29 100644
--- a/usr/src/uts/common/io/ib/ibnex/ibnex.c
+++ b/usr/src/uts/common/io/ib/ibnex/ibnex.c
@@ -2910,7 +2910,7 @@ ibnex_commsvc_initnode(dev_info_t *parent, ibdm_port_attr_t *port_attr,
 	node_data->node_state = IBNEX_CFGADM_CONFIGURING;
 
 	ndi_devi_alloc_sleep(parent,
-	    IBNEX_IBPORT_CNAME, (dnode_t)DEVI_SID_NODEID, &cdip);
+	    IBNEX_IBPORT_CNAME, (pnode_t)DEVI_SID_NODEID, &cdip);
 
 	node_data->node_dip	= cdip;
 	ddi_set_parent_data(cdip, node_data);
diff --git a/usr/src/uts/common/io/openprom.c b/usr/src/uts/common/io/openprom.c
index ee0bbe588305..f01dd0b03386 100644
--- a/usr/src/uts/common/io/openprom.c
+++ b/usr/src/uts/common/io/openprom.c
@@ -73,7 +73,7 @@ extern int plat_stdin_is_keyboard(void);
  * XXX	Make this dynamic.. or (better still) make the interface stateless
  */
 static struct oprom_state {
-	dnode_t	current_id;	/* node we're fetching props from */
+	pnode_t	current_id;	/* node we're fetching props from */
 	int16_t	already_open;	/* if true, this instance is 'active' */
 	int16_t	ioc_state;	/* snapshot ioctl state */
 	char	*snapshot;	/* snapshot of all prom nodes */
@@ -93,9 +93,9 @@ static int opattach(dev_info_t *, ddi_attach_cmd_t cmd);
 static int opdetach(dev_info_t *, ddi_detach_cmd_t cmd);
 
 /* help functions */
-static int oprom_checknodeid(dnode_t, dnode_t);
+static int oprom_checknodeid(pnode_t, pnode_t);
 static int oprom_copyinstr(intptr_t, char *, size_t, size_t);
-static int oprom_copynode(dnode_t, uint_t, char **, size_t *);
+static int oprom_copynode(pnode_t, uint_t, char **, size_t *);
 static int oprom_snapshot(struct oprom_state *, intptr_t);
 static int oprom_copyout(struct oprom_state *, intptr_t);
 static int oprom_setstate(struct oprom_state *, int16_t);
@@ -182,7 +182,7 @@ _fini(void)
 }
 
 static dev_info_t *opdip;
-static dnode_t options_nodeid;
+static pnode_t options_nodeid;
 
 /*ARGSUSED*/
 static int
@@ -269,7 +269,7 @@ opromopen(dev_t *devp, int flag, int otyp, cred_t *credp)
 			/*
 			 * It's ours.
 			 */
-			st->current_id = (dnode_t)0;
+			st->current_id = (pnode_t)0;
 			ASSERT(st->snapshot == NULL && st->size == 0);
 			ASSERT(st->ioc_state == IOC_IDLE);
 			break;
@@ -339,7 +339,7 @@ opromioctl_cb(void *avp, int has_changed)
 	char *valbuf;
 	int error = 0;
 	uint_t userbufsize;
-	dnode_t node_id;
+	pnode_t node_id;
 	char propname[OBP_MAXPROPNAME];
 
 	st = argp->st;
@@ -621,25 +621,25 @@ opromioctl_cb(void *avp, int has_changed)
 	case OPROMSETNODEID:
 
 		if (prom_is_openprom() == 0 ||
-		    userbufsize < sizeof (dnode_t)) {
+		    userbufsize < sizeof (pnode_t)) {
 			error = EINVAL;
 			break;
 		}
 
 		/*
-		 * The argument is a phandle. (aka dnode_t)
+		 * The argument is a phandle. (aka pnode_t)
 		 */
 		if (copyin(((caddr_t)arg + sizeof (uint_t)),
-		    opp->oprom_array, sizeof (dnode_t)) != 0) {
+		    opp->oprom_array, sizeof (pnode_t)) != 0) {
 			error = EFAULT;
 			break;
 		}
 
 		/*
-		 * If dnode_t from userland is garbage, we
+		 * If pnode_t from userland is garbage, we
 		 * could confuse the PROM.
 		 */
-		node_id = *(dnode_t *)opp->oprom_array;
+		node_id = *(pnode_t *)opp->oprom_array;
 		if (oprom_checknodeid(node_id, st->current_id) == 0) {
 			cmn_err(CE_NOTE, "!nodeid 0x%x not found",
 			    (int)node_id);
@@ -657,11 +657,11 @@ opromioctl_cb(void *avp, int has_changed)
 			break;
 		}
 
-		opp->oprom_size = sizeof (dnode_t);
-		*(dnode_t *)opp->oprom_array = st->current_id;
+		opp->oprom_size = sizeof (pnode_t);
+		*(pnode_t *)opp->oprom_array = st->current_id;
 
 		if (copyout(opp, (void *)arg,
-		    sizeof (dnode_t) + sizeof (uint_t)) != 0)
+		    sizeof (pnode_t) + sizeof (uint_t)) != 0)
 			error = EFAULT;
 		break;
 
@@ -929,7 +929,7 @@ opromioctl_cb(void *avp, int has_changed)
 		struct openprom_opr64 *opr =
 		    (struct openprom_opr64 *)opp->oprom_array;
 		int i;
-		dnode_t id;
+		pnode_t id;
 
 		if (userbufsize < sizeof (*opr)) {
 			error = EINVAL;
@@ -1071,13 +1071,13 @@ oprom_copyinstr(intptr_t arg, char *buf, size_t bufsize, size_t maxsize)
 }
 
 /*
- * Check dnode_t passed in from userland
+ * Check pnode_t passed in from userland
  */
 static int
-oprom_checknodeid(dnode_t node_id, dnode_t current_id)
+oprom_checknodeid(pnode_t node_id, pnode_t current_id)
 {
 	int depth;
-	dnode_t id[OBP_STACKDEPTH];
+	pnode_t id[OBP_STACKDEPTH];
 
 	/*
 	 * optimized path
@@ -1098,7 +1098,7 @@ oprom_checknodeid(dnode_t node_id, dnode_t current_id)
 	 * long path: walk from root till we find node_id
 	 */
 	depth = 1;
-	id[0] = prom_nextnode((dnode_t)0);
+	id[0] = prom_nextnode((pnode_t)0);
 
 	while (depth) {
 		if (id[depth - 1] == node_id)
@@ -1191,7 +1191,7 @@ oprom_copyout(struct oprom_state *st, intptr_t arg)
  * Copy all properties of nodeid into a single packed nvlist
  */
 static int
-oprom_copyprop(dnode_t nodeid, uint_t flag, nvlist_t *nvl)
+oprom_copyprop(pnode_t nodeid, uint_t flag, nvlist_t *nvl)
 {
 	int proplen;
 	char *propname, *propval, *buf1, *buf2;
@@ -1253,10 +1253,10 @@ oprom_copyprop(dnode_t nodeid, uint_t flag, nvlist_t *nvl)
  * Copy all children and descendents into a a packed nvlist
  */
 static int
-oprom_copychild(dnode_t nodeid, uint_t flag, char **buf, size_t *size)
+oprom_copychild(pnode_t nodeid, uint_t flag, char **buf, size_t *size)
 {
 	nvlist_t *nvl;
-	dnode_t child = prom_childnode(nodeid);
+	pnode_t child = prom_childnode(nodeid);
 
 	if (child == 0)
 		return (0);
@@ -1285,7 +1285,7 @@ oprom_copychild(dnode_t nodeid, uint_t flag, char **buf, size_t *size)
  * Copy a node into a packed nvlist
  */
 static int
-oprom_copynode(dnode_t nodeid, uint_t flag, char **buf, size_t *size)
+oprom_copynode(pnode_t nodeid, uint_t flag, char **buf, size_t *size)
 {
 	int error = 0;
 	nvlist_t *nvl;
diff --git a/usr/src/uts/common/io/pcmcia/pcmem.c b/usr/src/uts/common/io/pcmcia/pcmem.c
index d056d4138e4d..092f6da2b92c 100644
--- a/usr/src/uts/common/io/pcmcia/pcmem.c
+++ b/usr/src/uts/common/io/pcmcia/pcmem.c
@@ -20,13 +20,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1999-2000 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
-
 /*
  *  PCMCIA Memory Nexus Driver
  *
@@ -304,7 +303,7 @@ pcmem_create_pcram_node(dev_info_t *dip)
 	PCMEM_DEBUG((CE_CONT,
 	    "pcmem_create_pcram_node dip=%p\n", (void *)dip));
 
-	if (ndi_devi_alloc(dip, "pcram", (dnode_t)DEVI_SID_NODEID, &child) !=
+	if (ndi_devi_alloc(dip, "pcram", (pnode_t)DEVI_SID_NODEID, &child) !=
 	    NDI_SUCCESS) {
 		cmn_err(CE_WARN,
 		    "pcmem: unable to create node [%s]\n", "pcram");
diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c
index 288b0768a4b2..ee1d098f731e 100644
--- a/usr/src/uts/common/io/scsi/targets/sd.c
+++ b/usr/src/uts/common/io/scsi/targets/sd.c
@@ -67,10 +67,10 @@
  * Loadable module info.
  */
 #if (defined(__fibre))
-#define	SD_MODULE_NAME	"SCSI SSA/FCAL Disk Driver %I%"
+#define	SD_MODULE_NAME	"SCSI SSA/FCAL Disk Driver 1.471"
 char _depends_on[]	= "misc/scsi drv/fcp";
 #else
-#define	SD_MODULE_NAME	"SCSI Disk Driver %I%"
+#define	SD_MODULE_NAME	"SCSI Disk Driver 1.471"
 char _depends_on[]	= "misc/scsi";
 #endif
 
@@ -810,6 +810,7 @@ static int sd_pm_idletime = 1;
 #define	sd_init_event_callbacks		ssd_init_event_callbacks
 #define	sd_event_callback		ssd_event_callback
 #define	sd_disable_caching		ssd_disable_caching
+#define	sd_get_write_cache_enabled	ssd_get_write_cache_enabled
 #define	sd_make_device			ssd_make_device
 #define	sdopen				ssdopen
 #define	sdclose				ssdclose
@@ -932,6 +933,8 @@ static int sd_pm_idletime = 1;
 #define	sd_send_scsi_PERSISTENT_RESERVE_OUT	\
 					ssd_send_scsi_PERSISTENT_RESERVE_OUT
 #define	sd_send_scsi_SYNCHRONIZE_CACHE	ssd_send_scsi_SYNCHRONIZE_CACHE
+#define	sd_send_scsi_SYNCHRONIZE_CACHE_biodone	\
+					ssd_send_scsi_SYNCHRONIZE_CACHE_biodone
 #define	sd_send_scsi_MODE_SENSE		ssd_send_scsi_MODE_SENSE
 #define	sd_send_scsi_MODE_SELECT	ssd_send_scsi_MODE_SELECT
 #define	sd_send_scsi_RDWR		ssd_send_scsi_RDWR
@@ -1147,6 +1150,7 @@ static void  sd_event_callback(dev_info_t *, ddi_eventcookie_t, void *, void *);
 
 
 static int   sd_disable_caching(struct sd_lun *un);
+static int   sd_get_write_cache_enabled(struct sd_lun *un, int *is_enabled);
 static dev_t sd_make_device(dev_info_t *devi);
 
 static void  sd_update_block_info(struct sd_lun *un, uint32_t lbasize,
@@ -1375,7 +1379,9 @@ static int sd_send_scsi_PERSISTENT_RESERVE_IN(struct sd_lun *un,
 	uchar_t usr_cmd, uint16_t data_len, uchar_t *data_bufp);
 static int sd_send_scsi_PERSISTENT_RESERVE_OUT(struct sd_lun *un,
 	uchar_t usr_cmd, uchar_t *usr_bufp);
-static int sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un);
+static int sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un,
+	struct dk_callback *dkc);
+static int sd_send_scsi_SYNCHRONIZE_CACHE_biodone(struct buf *bp);
 static int sd_send_scsi_GET_CONFIGURATION(struct sd_lun *un,
 	struct uscsi_cmd *ucmdbuf, uchar_t *rqbuf, uint_t rqbuflen,
 	uchar_t *bufaddr, uint_t buflen);
@@ -7626,6 +7632,7 @@ sd_unit_attach(dev_info_t *devi)
 	int	reservation_flag = SD_TARGET_IS_UNRESERVED;
 	int	instance;
 	int	rval;
+	int	wc_enabled;
 	uint64_t	capacity;
 	uint_t		lbasize;
 
@@ -8487,6 +8494,19 @@ sd_unit_attach(dev_info_t *devi)
 		}
 	}
 
+	/*
+	 * NOTE: Since there is currently no mechanism to
+	 * change the state of the Write Cache Enable mode select,
+	 * this code just checks the value of the WCE bit
+	 * at device attach time.  If a mechanism
+	 * is added to the driver to change WCE, un_f_write_cache_enabled
+	 * must be updated appropriately.
+	 */
+	(void) sd_get_write_cache_enabled(un, &wc_enabled);
+	mutex_enter(SD_MUTEX(un));
+	un->un_f_write_cache_enabled = (wc_enabled != 0);
+	mutex_exit(SD_MUTEX(un));
+
 	/*
 	 * Set the pstat and error stat values here, so data obtained during the
 	 * previous attach-time routines is available.
@@ -9715,6 +9735,115 @@ sd_disable_caching(struct sd_lun *un)
 }
 
 
+/*
+ *    Function: sd_get_write_cache_enabled()
+ *
+ * Description: This routine is the driver entry point for determining if
+ *		write caching is enabled.  It examines the WCE (write cache
+ *		enable) bits of mode page 8 (MODEPAGE_CACHING).
+ *
+ *   Arguments: un - driver soft state (unit) structure
+ *   		is_enabled - pointer to int where write cache enabled state
+ *   			is returned (non-zero -> write cache enabled)
+ *
+ * Return Code: 0 - WCE bit was read and stored in *is_enabled
+ *		EIO - reported block descriptor length is too large
+ *		code returned by sd_send_scsi_MODE_SENSE
+ *
+ *     Context: Kernel Thread
+ *
+ * NOTE: If ioctl is added to disable write cache, this sequence should
+ * be followed so that no locking is required for accesses to
+ * un->un_f_write_cache_enabled:
+ * 	do mode select to clear wce
+ * 	do synchronize cache to flush cache
+ * 	set un->un_f_write_cache_enabled = FALSE
+ *
+ * Conversely, an ioctl to enable the write cache should be done
+ * in this order:
+ * 	set un->un_f_write_cache_enabled = TRUE
+ * 	do mode select to set wce
+ */
+
+static int
+sd_get_write_cache_enabled(struct sd_lun *un, int *is_enabled)
+{
+	struct mode_caching	*mode_caching_page;
+	uchar_t			*header;
+	size_t			buflen;
+	int			hdrlen;
+	int			bd_len;
+	int			rval = 0;
+
+	ASSERT(un != NULL);
+	ASSERT(is_enabled != NULL);
+
+	/* Default for all error returns below: report the cache as enabled. */
+	*is_enabled = TRUE;
+
+	/*
+	 * Do a test unit ready, otherwise a mode sense may not work if this
+	 * is the first command sent to the device after boot.
+	 */
+	(void) sd_send_scsi_TEST_UNIT_READY(un, 0);
+
+	if (un->un_f_cfg_is_atapi == TRUE) {
+		hdrlen = MODE_HEADER_LENGTH_GRP2;
+	} else {
+		hdrlen = MODE_HEADER_LENGTH;
+	}
+
+	/*
+	 * Allocate memory for the retrieved mode page and its headers.  Set
+	 * a pointer to the page itself.
+	 */
+	buflen = hdrlen + MODE_BLK_DESC_LENGTH + sizeof (struct mode_caching);
+	header = kmem_zalloc(buflen, KM_SLEEP);
+
+	/* Get the information from the device. */
+	if (un->un_f_cfg_is_atapi == TRUE) {
+		rval = sd_send_scsi_MODE_SENSE(un, CDB_GROUP1, header, buflen,
+		    MODEPAGE_CACHING, SD_PATH_DIRECT);
+	} else {
+		rval = sd_send_scsi_MODE_SENSE(un, CDB_GROUP0, header, buflen,
+		    MODEPAGE_CACHING, SD_PATH_DIRECT);
+	}
+	if (rval != 0) {
+		SD_ERROR(SD_LOG_IOCTL_RMMEDIA, un,
+		    "sd_get_write_cache_enabled: Mode Sense Failed\n");
+		kmem_free(header, buflen);
+		return (rval);
+	}
+
+	/*
+	 * Determine size of Block Descriptors in order to locate
+	 * the mode page data. ATAPI devices return 0, SCSI devices
+	 * should return MODE_BLK_DESC_LENGTH.
+	 */
+	if (un->un_f_cfg_is_atapi == TRUE) {
+		struct mode_header_grp2	*mhp;
+		mhp	= (struct mode_header_grp2 *)header;
+		bd_len  = (mhp->bdesc_length_hi << 8) | mhp->bdesc_length_lo;
+	} else {
+		bd_len  = ((struct mode_header *)header)->bdesc_length;
+	}
+	/* A larger bd_len would put the page pointer past the buffer end. */
+	if (bd_len > MODE_BLK_DESC_LENGTH) {
+		scsi_log(SD_DEVINFO(un), sd_label, CE_WARN,
+		    "sd_get_write_cache_enabled: Mode Sense returned invalid "
+		    "block descriptor length\n");
+		kmem_free(header, buflen);
+		return (EIO);
+	}
+
+	mode_caching_page = (struct mode_caching *)(header + hdrlen + bd_len);
+	*is_enabled = mode_caching_page->wce;
+
+	kmem_free(header, buflen);
+	return (0);
+}
+
+
 /*
  *    Function: sd_make_device
  *
@@ -10348,8 +10477,13 @@ sdclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
 #endif
 				mutex_exit(SD_MUTEX(un));
 				if (sd_pm_entry(un) == DDI_SUCCESS) {
-					if (sd_send_scsi_SYNCHRONIZE_CACHE(un)
-					    != 0) {
+					rval =
+					    sd_send_scsi_SYNCHRONIZE_CACHE(un,
+					    NULL);
+					/* ignore error if not supported */
+					if (rval == ENOTSUP) {
+						rval = 0;
+					} else if (rval != 0) {
 						rval = EIO;
 					}
 					sd_pm_exit(un);
@@ -11887,6 +12021,8 @@ sd_uscsi_iodone(int index, struct sd_lun *un, struct buf *bp)
 
 	SD_INFO(SD_LOG_IO, un, "sd_uscsi_iodone: entry.\n");
 
+	bp->b_private = xp->xb_private;
+
 	mutex_enter(SD_MUTEX(un));
 
 	/*
@@ -19827,52 +19963,127 @@ sd_send_scsi_PERSISTENT_RESERVE_OUT(struct sd_lun *un, uchar_t usr_cmd,
  */
 
 static int
-sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un)
+sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un, struct dk_callback *dkc)
 {
-	struct	scsi_extended_sense	sense_buf;
-	union scsi_cdb		cdb;
-	struct uscsi_cmd	ucmd_buf;
-	int			status;
+	struct sd_uscsi_info	*uip;
+	struct uscsi_cmd	*uscmd;
+	union scsi_cdb		*cdb;
+	struct buf		*bp;
+	int			rval = 0;
+
+	SD_TRACE(SD_LOG_IO, un,
+	    "sd_send_scsi_SYNCHRONIZE_CACHE: entry: un:0x%p\n", un);
 
 	ASSERT(un != NULL);
 	ASSERT(!mutex_owned(SD_MUTEX(un)));
 
-	SD_TRACE(SD_LOG_IO, un,
-	    "sd_send_scsi_SYNCHRONIZE_CACHE: entry: un:0x%p\n", un);
+	cdb = kmem_zalloc(CDB_GROUP1, KM_SLEEP);
+	cdb->scc_cmd = SCMD_SYNCHRONIZE_CACHE;
 
-	bzero(&cdb, sizeof (cdb));
-	bzero(&ucmd_buf, sizeof (ucmd_buf));
-	bzero(&sense_buf, sizeof (struct scsi_extended_sense));
+	/*
+	 * First get some memory for the uscsi_cmd struct and cdb
+	 * and initialize for SYNCHRONIZE_CACHE cmd.
+	 */
+	uscmd = kmem_zalloc(sizeof (struct uscsi_cmd), KM_SLEEP);
+	uscmd->uscsi_cdblen = CDB_GROUP1;
+	uscmd->uscsi_cdb = (caddr_t)cdb;
+	uscmd->uscsi_bufaddr = NULL;
+	uscmd->uscsi_buflen = 0;
+	uscmd->uscsi_rqbuf = kmem_zalloc(SENSE_LENGTH, KM_SLEEP);
+	uscmd->uscsi_rqlen = SENSE_LENGTH;
+	uscmd->uscsi_rqresid = SENSE_LENGTH;
+	uscmd->uscsi_flags = USCSI_RQENABLE | USCSI_SILENT;
+	uscmd->uscsi_timeout = sd_io_time;
+
+	/*
+	 * Allocate an sd_uscsi_info struct and fill it with the info
+	 * needed by sd_initpkt_for_uscsi().  Then put the pointer into
+	 * b_private in the buf for sd_initpkt_for_uscsi().  Note that
+	 * since we allocate the buf here in this function, we do not
+	 * need to preserve the prior contents of b_private.
+	 * The sd_uscsi_info struct is also used by sd_uscsi_strategy()
+	 */
+	uip = kmem_zalloc(sizeof (struct sd_uscsi_info), KM_SLEEP);
+	uip->ui_flags = SD_PATH_DIRECT;
+	uip->ui_cmdp  = uscmd;
 
-	cdb.scc_cmd = SCMD_SYNCHRONIZE_CACHE;
+	bp = getrbuf(KM_SLEEP);
+	bp->b_private = uip;
 
-	ucmd_buf.uscsi_cdb	= (char *)&cdb;
-	ucmd_buf.uscsi_cdblen	= CDB_GROUP1;
-	ucmd_buf.uscsi_bufaddr	= NULL;
-	ucmd_buf.uscsi_buflen	= 0;
-	ucmd_buf.uscsi_rqbuf	= (caddr_t)&sense_buf;
-	ucmd_buf.uscsi_rqlen	= sizeof (struct scsi_extended_sense);
-	ucmd_buf.uscsi_flags	= USCSI_RQENABLE | USCSI_SILENT;
-	ucmd_buf.uscsi_timeout	= 240;
+	/*
+	 * Setup buffer to carry uscsi request.
+	 */
+	bp->b_flags  = B_BUSY;
+	bp->b_bcount = 0;
+	bp->b_blkno  = 0;
 
-	status = sd_send_scsi_cmd(SD_GET_DEV(un), &ucmd_buf, UIO_SYSSPACE,
-	    UIO_SYSSPACE, UIO_SYSSPACE, SD_PATH_DIRECT);
+	if (dkc != NULL) {
+		bp->b_iodone = sd_send_scsi_SYNCHRONIZE_CACHE_biodone;
+		uip->ui_dkc = *dkc;
+	}
+
+	bp->b_edev = SD_GET_DEV(un);
+	bp->b_dev = cmpdev(bp->b_edev);	/* maybe unnecessary? */
+
+	(void) sd_uscsi_strategy(bp);
+
+	/*
+	 * If synchronous request, wait for completion
+	 * If async just return and let b_iodone callback
+	 * cleanup.
+	 * NOTE: On return, un_ncmds_in_driver will be decremented,
+	 * but it was also incremented in sd_uscsi_strategy(), so
+	 * we should be ok.
+	 */
+	if (dkc == NULL) {
+		(void) biowait(bp);
+		rval = sd_send_scsi_SYNCHRONIZE_CACHE_biodone(bp);
+	}
+
+	return (rval);
+}
+
+
+static int
+sd_send_scsi_SYNCHRONIZE_CACHE_biodone(struct buf *bp)
+{
+	struct sd_uscsi_info *uip;
+	struct uscsi_cmd *uscmd;
+	struct scsi_extended_sense *sense_buf;
+	struct sd_lun *un;
+	int status;
 
+	uip = (struct sd_uscsi_info *)(bp->b_private);
+	ASSERT(uip != NULL);
+
+	uscmd = uip->ui_cmdp;
+	ASSERT(uscmd != NULL);
+
+	sense_buf = (struct scsi_extended_sense *)uscmd->uscsi_rqbuf;
+	ASSERT(sense_buf != NULL);
+
+	un = ddi_get_soft_state(sd_state, SD_GET_INSTANCE_FROM_BUF(bp));
+	ASSERT(un != NULL);
+
+	status = geterror(bp);
 	switch (status) {
 	case 0:
 		break;	/* Success! */
 	case EIO:
-		switch (ucmd_buf.uscsi_status) {
+		switch (uscmd->uscsi_status) {
 		case STATUS_RESERVATION_CONFLICT:
 			/* Ignore reservation conflict */
 			status = 0;
 			goto done;
 
 		case STATUS_CHECK:
-			if ((ucmd_buf.uscsi_rqstatus == STATUS_GOOD) &&
-			    (sense_buf.es_key == KEY_ILLEGAL_REQUEST)) {
+			if ((uscmd->uscsi_rqstatus == STATUS_GOOD) &&
+			    (sense_buf->es_key == KEY_ILLEGAL_REQUEST)) {
 				/* Ignore Illegal Request error */
-				status = 0;
+				mutex_enter(SD_MUTEX(un));
+				un->un_f_sync_cache_unsupported = TRUE;
+				mutex_exit(SD_MUTEX(un));
+				status = ENOTSUP;
 				goto done;
 			}
 			break;
@@ -19881,7 +20092,7 @@ sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un)
 		}
 		/* FALLTHRU */
 	default:
-		/* Ignore error if the media is not present. */
+		/* Ignore error if the media is not present */
 		if (sd_send_scsi_TEST_UNIT_READY(un, 0) != 0) {
 			status = 0;
 			goto done;
@@ -19893,7 +20104,16 @@ sd_send_scsi_SYNCHRONIZE_CACHE(struct sd_lun *un)
 	}
 
 done:
-	SD_TRACE(SD_LOG_IO, un, "sd_send_scsi_SYNCHRONIZE_CACHE: exit\n");
+	if (uip->ui_dkc.dkc_callback != NULL) {
+		(*uip->ui_dkc.dkc_callback)(uip->ui_dkc.dkc_cookie, status);
+	}
+
+	ASSERT((bp->b_flags & B_REMAPPED) == 0);
+	freerbuf(bp);
+	kmem_free(uip, sizeof (struct sd_uscsi_info));
+	kmem_free(uscmd->uscsi_rqbuf, SENSE_LENGTH);
+	kmem_free(uscmd->uscsi_cdb, (size_t)uscmd->uscsi_cdblen);
+	kmem_free(uscmd, sizeof (struct uscsi_cmd));
 
 	return (status);
 }
@@ -20641,6 +20861,7 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p)
 		case DKIOCSVTOC:
 		case DKIOCSETEFI:
 		case DKIOCSMBOOT:
+		case DKIOCFLUSHWRITECACHE:
 			mutex_exit(SD_MUTEX(un));
 			err = sd_send_scsi_TEST_UNIT_READY(un, 0);
 			if (err != 0) {
@@ -21517,6 +21738,42 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p)
 
 #endif /* SD_FAULT_INJECTION */
 
+	case DKIOCFLUSHWRITECACHE:
+		{
+			struct dk_callback *dkc = (struct dk_callback *)arg;
+
+			mutex_enter(SD_MUTEX(un));
+			if (un->un_f_sync_cache_unsupported ||
+			    ! un->un_f_write_cache_enabled) {
+				err = un->un_f_sync_cache_unsupported ?
+					ENOTSUP : 0;
+				mutex_exit(SD_MUTEX(un));
+				if ((flag & FKIOCTL) && dkc != NULL &&
+				    dkc->dkc_callback != NULL) {
+					(*dkc->dkc_callback)(dkc->dkc_cookie,
+					    err);
+					/*
+					 * Did callback and reported error.
+					 * Since we did a callback, ioctl
+					 * should return 0.
+					 */
+					err = 0;
+				}
+				break;
+			}
+			mutex_exit(SD_MUTEX(un));
+
+			if ((flag & FKIOCTL) && dkc != NULL &&
+			    dkc->dkc_callback != NULL) {
+				/* async SYNC CACHE request */
+				err = sd_send_scsi_SYNCHRONIZE_CACHE(un, dkc);
+			} else {
+				/* synchronous SYNC CACHE request */
+				err = sd_send_scsi_SYNCHRONIZE_CACHE(un, NULL);
+			}
+		}
+		break;
+
 	default:
 		err = ENOTTY;
 		break;
diff --git a/usr/src/uts/common/io/usb/scsa2usb/scsa2usb.c b/usr/src/uts/common/io/usb/scsa2usb/scsa2usb.c
index 0569d64f1cf5..84e8d15ce527 100644
--- a/usr/src/uts/common/io/usb/scsa2usb/scsa2usb.c
+++ b/usr/src/uts/common/io/usb/scsa2usb/scsa2usb.c
@@ -1750,7 +1750,7 @@ scsa2usb_create_luns(scsa2usb_state_t *scsa2usbp)
 		}
 
 		ndi_devi_alloc_sleep(scsa2usbp->scsa2usb_dip, node_name,
-		    (dnode_t)DEVI_SID_NODEID, &cdip);
+		    (pnode_t)DEVI_SID_NODEID, &cdip);
 
 		/* attach target & lun properties */
 		rval = ndi_prop_update_int(DDI_DEV_T_NONE, cdip, "target", 0);
diff --git a/usr/src/uts/common/io/usb/usba/usba.c b/usr/src/uts/common/io/usb/usba/usba.c
index d25ed5bf22fa..f4abdbd69b8c 100644
--- a/usr/src/uts/common/io/usb/usba/usba.c
+++ b/usr/src/uts/common/io/usb/usba/usba.c
@@ -822,7 +822,7 @@ usba_create_child_devi(dev_info_t	*dip,
 	    "port status=0x%x", node_name,
 	    (void *)usba_device, port_status);
 
-	ndi_devi_alloc_sleep(dip, node_name, (dnode_t)DEVI_SID_NODEID,
+	ndi_devi_alloc_sleep(dip, node_name, (pnode_t)DEVI_SID_NODEID,
 				child_dip);
 
 	USB_DPRINTF_L3(DPRINT_MASK_USBA, usba_log_handle,
diff --git a/usr/src/uts/common/nfs/nfs4.h b/usr/src/uts/common/nfs/nfs4.h
index 3da429675efe..e45fc311c442 100644
--- a/usr/src/uts/common/nfs/nfs4.h
+++ b/usr/src/uts/common/nfs/nfs4.h
@@ -1265,8 +1265,8 @@ extern int	vs_aent_to_ace4(vsecattr_t *, vsecattr_t *, int, int);
 extern int	vs_ace4_to_aent(vsecattr_t *, vsecattr_t *, uid_t, gid_t,
     int, int, int);
 extern int	vs_ace4_to_acet(vsecattr_t *, vsecattr_t *, uid_t, gid_t,
-    int, int, int);
-extern int	vs_acet_to_ace4(vsecattr_t *, vsecattr_t *, int, int);
+    int, int);
+extern int	vs_acet_to_ace4(vsecattr_t *, vsecattr_t *, int);
 extern void	vs_acet_destroy(vsecattr_t *);
 extern void	vs_ace4_destroy(vsecattr_t *);
 extern void	vs_aent_destroy(vsecattr_t *);
diff --git a/usr/src/uts/common/os/autoconf.c b/usr/src/uts/common/os/autoconf.c
index 9be275c03bed..6127073b2848 100644
--- a/usr/src/uts/common/os/autoconf.c
+++ b/usr/src/uts/common/os/autoconf.c
@@ -208,11 +208,11 @@ getlongprop_buf(int id, char *name, char *buf, int maxlen)
 {
 	int size;
 
-	size = prom_getproplen((dnode_t)id, name);
+	size = prom_getproplen((pnode_t)id, name);
 	if (size <= 0 || (size > maxlen - 1))
 		return (-1);
 
-	if (-1 == prom_getprop((dnode_t)id, name, buf))
+	if (-1 == prom_getprop((pnode_t)id, name, buf))
 		return (-1);
 
 	/*
@@ -246,14 +246,14 @@ get_neighbors(dev_info_t *di, int flag)
 	snid = cnid = 0;
 	switch (flag) {
 		case DDI_WALK_PRUNESIB:
-			cnid = (int)prom_childnode((dnode_t)nid);
+			cnid = (int)prom_childnode((pnode_t)nid);
 			break;
 		case DDI_WALK_PRUNECHILD:
-			snid = (int)prom_nextnode((dnode_t)nid);
+			snid = (int)prom_nextnode((pnode_t)nid);
 			break;
 		case 0:
-			snid = (int)prom_nextnode((dnode_t)nid);
-			cnid = (int)prom_childnode((dnode_t)nid);
+			snid = (int)prom_nextnode((pnode_t)nid);
+			cnid = (int)prom_childnode((pnode_t)nid);
 			break;
 		default:
 			return (DDI_WALK_TERMINATE);
@@ -265,7 +265,7 @@ get_neighbors(dev_info_t *di, int flag)
 		 * add the first sibling that passes check_status()
 		 */
 		for (; snid && (snid != -1);
-		    snid = (int)prom_nextnode((dnode_t)snid)) {
+		    snid = (int)prom_nextnode((pnode_t)snid)) {
 			if (getlongprop_buf(snid, OBP_NAME, buf,
 			    sizeof (buf)) > 0) {
 				if (check_status(snid, buf, parent) ==
@@ -286,9 +286,9 @@ get_neighbors(dev_info_t *di, int flag)
 			if (check_status(cnid, buf, di) == DDI_SUCCESS) {
 				(void) ddi_add_child(di, buf, cnid, -1);
 			} else {
-				for (cnid = (int)prom_nextnode((dnode_t)cnid);
+				for (cnid = (int)prom_nextnode((pnode_t)cnid);
 				    cnid && (cnid != -1);
-				    cnid = (int)prom_nextnode((dnode_t)cnid)) {
+				    cnid = (int)prom_nextnode((pnode_t)cnid)) {
 					if (getlongprop_buf(cnid, OBP_NAME,
 					    buf, sizeof (buf)) > 0) {
 						if (check_status(cnid, buf, di)
@@ -352,7 +352,7 @@ static void
 create_devinfo_tree(void)
 {
 	major_t major;
-	dnode_t nodeid;
+	pnode_t nodeid;
 
 	i_ddi_node_cache_init();
 #if defined(__sparc)
diff --git a/usr/src/uts/common/os/devcfg.c b/usr/src/uts/common/os/devcfg.c
index c11cbcdf86b6..deabd79f3770 100644
--- a/usr/src/uts/common/os/devcfg.c
+++ b/usr/src/uts/common/os/devcfg.c
@@ -80,7 +80,7 @@ struct mt_config_handle {
 };
 
 struct devi_nodeid {
-	dnode_t nodeid;
+	pnode_t nodeid;
 	dev_info_t *dip;
 	struct devi_nodeid *next;
 };
@@ -196,7 +196,7 @@ i_ddi_node_cache_init()
  * The allocated node has a reference count of 0.
  */
 dev_info_t *
-i_ddi_alloc_node(dev_info_t *pdip, char *node_name, dnode_t nodeid,
+i_ddi_alloc_node(dev_info_t *pdip, char *node_name, pnode_t nodeid,
     int instance, ddi_prop_t *sys_prop, int flag)
 {
 	struct dev_info *devi;
@@ -1666,7 +1666,7 @@ ndi_devi_tryenter(dev_info_t *dip, int *circular)
  * not allowed to sleep.
  */
 int
-ndi_devi_alloc(dev_info_t *parent, char *node_name, dnode_t nodeid,
+ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid,
     dev_info_t **ret_dip)
 {
 	ASSERT(node_name != NULL);
@@ -1686,7 +1686,7 @@ ndi_devi_alloc(dev_info_t *parent, char *node_name, dnode_t nodeid,
  * This routine may sleep and should not be called at interrupt time
  */
 void
-ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, dnode_t nodeid,
+ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, pnode_t nodeid,
     dev_info_t **ret_dip)
 {
 	ASSERT(node_name != NULL);
@@ -3652,7 +3652,7 @@ init_spec_child(dev_info_t *pdip, struct hwc_spec *specp, uint_t flags)
 		return;
 	}
 
-	dip = i_ddi_alloc_node(pdip, node_name, (dnode_t)DEVI_PSEUDO_NODEID,
+	dip = i_ddi_alloc_node(pdip, node_name, (pnode_t)DEVI_PSEUDO_NODEID,
 	    -1, specp->hwc_devi_sys_prop_ptr, KM_SLEEP);
 
 	if (dip == NULL)
@@ -5367,7 +5367,7 @@ path_to_major(char *path)
 {
 	dev_info_t *dip;
 	char *p, *q;
-	dnode_t nodeid;
+	pnode_t nodeid;
 	major_t maj;
 
 	/*
@@ -6390,7 +6390,7 @@ mt_config_driver(struct mt_config_handle *hdl)
  * NOTE: This function will return NULL for .conf nodeids.
  */
 dev_info_t *
-e_ddi_nodeid_to_dip(dnode_t nodeid)
+e_ddi_nodeid_to_dip(pnode_t nodeid)
 {
 	dev_info_t		*dip = NULL;
 	struct devi_nodeid	*prev, *elem;
diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c
index 7ae9d5f16b11..be7ba8b0cc5f 100644
--- a/usr/src/uts/common/os/kmem.c
+++ b/usr/src/uts/common/os/kmem.c
@@ -1681,6 +1681,21 @@ kmem_cache_magazine_enable(kmem_cache_t *cp)
 
 }
 
+/*
+ * Reap (almost) everything right now.  See kmem_cache_magazine_purge()
+ * for explanation of the back-to-back kmem_depot_ws_update() calls.
+ */
+void
+kmem_cache_reap_now(kmem_cache_t *cp)
+{
+	kmem_depot_ws_update(cp);
+	kmem_depot_ws_update(cp);
+	/* Queue the depot reap on kmem_taskq, then wait on that taskq. */
+	(void) taskq_dispatch(kmem_taskq,
+	    (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP);
+	taskq_wait(kmem_taskq);
+}
+
 /*
  * Recompute a cache's magazine size.  The trade-off is that larger magazines
  * provide a higher transfer rate with the depot, while smaller magazines
@@ -1978,6 +1993,15 @@ kmem_maxavail(void)
 	return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
 }
 
+/*
+ * Nonzero (raw flag bits, not 0/1) if memory-intensive kmem debugging is on.
+ */
+int
+kmem_debugging(void)
+{
+	return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
+}
+
 kmem_cache_t *
 kmem_cache_create(
 	char *name,		/* descriptive name for this cache */
diff --git a/usr/src/uts/common/os/list.c b/usr/src/uts/common/os/list.c
index b511d60bdcfe..8194b791471a 100644
--- a/usr/src/uts/common/os/list.c
+++ b/usr/src/uts/common/os/list.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -180,3 +180,16 @@ list_move_tail(list_t *dst, list_t *src)
 	/* empty src list */
 	srcnode->list_next = srcnode->list_prev = srcnode;
 }
+
+int
+list_link_active(list_node_t *link)
+{
+	ASSERT((link->list_next == NULL) == (link->list_prev == NULL));
+	return (link->list_next != NULL);	/* 1 if linked, else 0 */
+}
+
+int
+list_is_empty(list_t *list)
+{
+	return (list_empty(list));	/* same result as list_empty() */
+}
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index 8c0b61a6c521..a5b011a3f82c 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -1761,3 +1761,15 @@ secpolicy_gart_map(const cred_t *cr)
 	}
 	return (0);
 }
+
+/*
+ * secpolicy_zfs
+ *
+ * Determine if the user has permission to manipulate ZFS datasets (not pools).
+ * Equivalent to the SYS_MOUNT privilege; EPERM is passed as the error code.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+	return (PRIV_POLICY(cr, PRIV_SYS_MOUNT, B_FALSE, EPERM, NULL));
+}
diff --git a/usr/src/uts/common/os/printf.c b/usr/src/uts/common/os/printf.c
index d21e8cb02961..603da31b6211 100644
--- a/usr/src/uts/common/os/printf.c
+++ b/usr/src/uts/common/os/printf.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -298,6 +298,23 @@ assfail(const char *a, const char *f, int l)
 	return (0);
 }
 
+void
+assfail3(const char *a, uintmax_t lv, const char *op, uintmax_t rv,
+    const char *f, int l)
+{
+	if (aask)  {	/* report both operand values, then call debug_enter() */
+		printf("ASSERTION CAUGHT: %s (0x%llx %s 0x%llx), file: %s, "
+		    "line: %d", a, (u_longlong_t)lv, op, (u_longlong_t)rv,
+		    f, l);
+		debug_enter(NULL);
+	}
+	/* Panic unless aok is set or a panic is already in progress. */
+	if (!aok && !panicstr)
+		panic("assertion failed: %s (0x%llx %s 0x%llx), file: %s, "
+		    "line: %d", a, (u_longlong_t)lv, op, (u_longlong_t)rv,
+		    f, l);
+}
+
 int
 strlog(short mid, short sid, char level, ushort_t sl, char *fmt, ...)
 {
diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c
index 92f331c157af..081e9d9005c4 100644
--- a/usr/src/uts/common/os/sunddi.c
+++ b/usr/src/uts/common/os/sunddi.c
@@ -4500,7 +4500,7 @@ impl_ddi_bus_prop_op(dev_t dev, dev_info_t *dip, dev_info_t *ch_dip,
 	if (((dev == DDI_DEV_T_NONE) || (dev == DDI_DEV_T_ANY)) &&
 	    ndi_dev_is_prom_node(ch_dip) &&
 	    ((mod_flags & DDI_PROP_NOTPROM) == 0)) {
-		len = prom_getproplen((dnode_t)DEVI(ch_dip)->devi_nodeid, name);
+		len = prom_getproplen((pnode_t)DEVI(ch_dip)->devi_nodeid, name);
 		if (len == -1) {
 			return (DDI_PROP_NOT_FOUND);
 		}
@@ -4555,7 +4555,7 @@ impl_ddi_bus_prop_op(dev_t dev, dev_info_t *dip, dev_info_t *ch_dip,
 		/*
 		 * Call the PROM function to do the copy.
 		 */
-		(void) prom_getprop((dnode_t)DEVI(ch_dip)->devi_nodeid,
+		(void) prom_getprop((pnode_t)DEVI(ch_dip)->devi_nodeid,
 			name, buffer);
 
 		*lengthp = len; /* return the actual length to the caller */
diff --git a/usr/src/uts/common/os/sunndi.c b/usr/src/uts/common/os/sunndi.c
index 47586687694a..4def65409450 100644
--- a/usr/src/uts/common/os/sunndi.c
+++ b/usr/src/uts/common/os/sunndi.c
@@ -957,7 +957,7 @@ i_dc_devi_create(struct devctl_iocdata *dcp, dev_info_t *pdip,
 	/*
 	 * construct a new dev_info node with a user-provided nodename
 	 */
-	ndi_devi_alloc_sleep(pdip, cname, (dnode_t)DEVI_SID_NODEID, &cdip);
+	ndi_devi_alloc_sleep(pdip, cname, (pnode_t)DEVI_SID_NODEID, &cdip);
 
 	/*
 	 * create hardware properties for each member in the property
diff --git a/usr/src/uts/common/os/swapgeneric.c b/usr/src/uts/common/os/swapgeneric.c
index a180ec293c7b..9da38ab18be3 100644
--- a/usr/src/uts/common/os/swapgeneric.c
+++ b/usr/src/uts/common/os/swapgeneric.c
@@ -877,7 +877,7 @@ load_boot_platform_modules(char *drv)
  * to locate a given nodeid in the device tree.
  */
 struct i_path_findnode {
-	dnode_t nodeid;
+	pnode_t nodeid;
 	dev_info_t *dip;
 };
 
@@ -952,7 +952,7 @@ netboot_over_ib(char *bootpath)
 
 	char		*temp;
 	boolean_t	ret = B_FALSE;
-	dnode_t		node = prom_finddevice(bootpath);
+	pnode_t		node = prom_finddevice(bootpath);
 	int		len;
 	char		devicetype[OBP_MAXDRVNAME];
 
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index b4abecaadfd5..71fe7305a2ae 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -173,7 +173,7 @@
  *   the following system calls (all subcodes of the primary "zone"
  *   system call):
  *   - zone_create: creates a zone with selected attributes (name,
- *     root path, privileges, resource controls)
+ *     root path, privileges, resource controls, ZFS datasets)
  *   - zone_enter: allows the current process to enter a zone
  *   - zone_getattr: reports attributes of a zone
  *   - zone_list: lists all zones active in the system
@@ -769,6 +769,23 @@ zone_free_zsd(zone_t *zone)
 	list_destroy(&zone->zone_zsd);
 }
 
+/*
+ * Frees every entry on the zone dataset list, along with its name string.
+ */
+static void
+zone_free_datasets(zone_t *zone)
+{
+	list_t *datasets = &zone->zone_datasets;
+	zone_dataset_t *zd;
+
+	while ((zd = list_head(datasets)) != NULL) {
+		list_remove(datasets, zd);
+		kmem_free(zd->zd_dataset, strlen(zd->zd_dataset) + 1);
+		kmem_free(zd, sizeof (*zd));
+	}
+	list_destroy(datasets);
+}
+
 /*
  * zone.cpu-shares resource control support.
  */
@@ -1055,6 +1072,7 @@ zone_free(zone_t *zone)
 	}
 
 	zone_free_zsd(zone);
+	zone_free_datasets(zone);
 
 	if (zone->zone_rootvp != NULL)
 		VN_RELE(zone->zone_rootvp);
@@ -2499,6 +2517,55 @@ zone_create_error(int er_error, int er_ext, int *er_out) {
 	return (set_errno(er_error));
 }
 
+/*
+ * Parses a comma-separated list of ZFS datasets into a per-zone dictionary.
+ * Returns 0 on success (or when no list is supplied), else an errno value.
+ */
+static int
+parse_zfs(zone_t *zone, caddr_t ubuf, size_t buflen)
+{
+	char *kbuf, *dataset, *next;
+	zone_dataset_t *zd;
+	size_t len;
+
+	if (ubuf == NULL || buflen == 0)
+		return (0);
+
+	/*
+	 * The user buffer need not be NUL-terminated.  Allocate one extra
+	 * byte and terminate the copy so strchr()/strlen() stay in bounds.
+	 */
+	if (buflen + 1 == 0 ||
+	    (kbuf = kmem_alloc(buflen + 1, KM_NOSLEEP)) == NULL)
+		return (ENOMEM);
+
+	if (copyin(ubuf, kbuf, buflen) != 0) {
+		kmem_free(kbuf, buflen + 1);
+		return (EFAULT);
+	}
+	kbuf[buflen] = '\0';
+
+	dataset = kbuf;
+	for (;;) {
+		zd = kmem_alloc(sizeof (zone_dataset_t), KM_SLEEP);
+		next = strchr(dataset, ',');
+		if (next == NULL)
+			len = strlen(dataset);
+		else
+			len = next - dataset;
+		zd->zd_dataset = kmem_alloc(len + 1, KM_SLEEP);
+		bcopy(dataset, zd->zd_dataset, len);
+		zd->zd_dataset[len] = '\0';
+		list_insert_head(&zone->zone_datasets, zd);
+		if (next == NULL)
+			break;
+		dataset = next + 1;
+	}
+
+	kmem_free(kbuf, buflen + 1);
+	return (0);
+}
+
 /*
  * System call to create/initialize a new zone named 'zone_name', rooted
  * at 'zone_root', with a zone-wide privilege limit set of 'zone_privs',
@@ -2510,7 +2577,7 @@ zone_create_error(int er_error, int er_ext, int *er_out) {
 static zoneid_t
 zone_create(const char *zone_name, const char *zone_root,
     const priv_set_t *zone_privs, caddr_t rctlbuf, size_t rctlbufsz,
-    int *extended_error)
+    caddr_t zfsbuf, size_t zfsbufsz, int *extended_error)
 {
 	struct zsched_arg zarg;
 	nvlist_t *rctls = NULL;
@@ -2543,6 +2610,8 @@ zone_create(const char *zone_name, const char *zone_root,
 	cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
 	list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
 	    offsetof(struct zsd_entry, zsd_linkage));
+	list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
+	    offsetof(zone_dataset_t, zd_linkage));
 
 	if ((error = zone_set_name(zone, zone_name)) != 0) {
 		zone_free(zone);
@@ -2578,6 +2647,11 @@ zone_create(const char *zone_name, const char *zone_root,
 		return (zone_create_error(error, 0, extended_error));
 	}
 
+	if ((error = parse_zfs(zone, zfsbuf, zfsbufsz)) != 0) {
+		zone_free(zone);
+		return (set_errno(error));
+	}
+
 	/*
 	 * Stop all lwps since that's what normally happens as part of fork().
 	 * This needs to happen before we grab any locks to avoid deadlock
@@ -3722,7 +3796,7 @@ zone_lookup(const char *zone_name)
 
 /* ARGSUSED */
 long
-zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5)
+zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
 {
 	zone_def zs;
 
@@ -3748,6 +3822,8 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5)
 			    (unsigned long)zs32.zone_privs;
 			zs.rctlbuf = (caddr_t)(unsigned long)zs32.rctlbuf;
 			zs.rctlbufsz = zs32.rctlbufsz;
+			zs.zfsbuf = (caddr_t)(unsigned long)zs32.zfsbuf;
+			zs.zfsbufsz = zs32.zfsbufsz;
 			zs.extended_error =
 			    (int *)(unsigned long)zs32.extended_error;
 #else
@@ -3757,6 +3833,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4, void *arg5)
 
 		return (zone_create(zs.zone_name, zs.zone_root,
 			zs.zone_privs, (caddr_t)zs.rctlbuf, zs.rctlbufsz,
+			(caddr_t)zs.zfsbuf, zs.zfsbufsz,
 			zs.extended_error));
 	case ZONE_BOOT:
 		return (zone_boot((zoneid_t)(uintptr_t)arg1,
@@ -4037,3 +4114,61 @@ zone_shutdown_global(void)
 	zone_status_set(global_zone, ZONE_IS_SHUTTING_DOWN);
 	mutex_exit(&zone_status_lock);
 }
+
+/*
+ * Returns true if the named dataset is visible in the current zone.  On a
+ * match, a non-NULL 'write' is set to 1 if writable or 0 if read-only.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+	zone_dataset_t *zd;
+	size_t len;
+	zone_t *zone = curproc->p_zone;
+
+	if (dataset[0] == '\0')
+		return (0);
+
+	/*
+	 * Walk the list once, looking for datasets which match exactly, or
+	 * specify a dataset underneath an exported dataset.  If found, return
+	 * true and note that it is writable.
+	 */
+	for (zd = list_head(&zone->zone_datasets); zd != NULL;
+	    zd = list_next(&zone->zone_datasets, zd)) {
+		len = strlen(zd->zd_dataset);
+		/* Skip empty entries; zd_dataset[len - 1] needs len > 0. */
+		if (len > 0 && strlen(dataset) >= len &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    (zd->zd_dataset[len - 1] == '/' ||
+		    dataset[len] == '\0' || dataset[len] == '/')) {
+			if (write)
+				*write = 1;
+			return (1);
+		}
+	}
+
+	/*
+	 * Walk the list a second time, searching for datasets which are parents
+	 * of exported datasets.  These should be visible, but read-only.
+	 *
+	 * Note that we also have to support forms such as 'pool/dataset/', with
+	 * a trailing slash.
+	 */
+	for (zd = list_head(&zone->zone_datasets); zd != NULL;
+	    zd = list_next(&zone->zone_datasets, zd)) {
+
+		len = strlen(dataset);
+		if (dataset[len - 1] == '/')
+			len--;	/* Ignore trailing slash */
+		if (len < strlen(zd->zd_dataset) &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    zd->zd_dataset[len] == '/') {
+			if (write)
+				*write = 0;
+			return (1);
+		}
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/pcmcia/nexus/pcmcia.c b/usr/src/uts/common/pcmcia/nexus/pcmcia.c
index 20a16203b28b..f6807b9d463e 100644
--- a/usr/src/uts/common/pcmcia/nexus/pcmcia.c
+++ b/usr/src/uts/common/pcmcia/nexus/pcmcia.c
@@ -1383,7 +1383,7 @@ pcmcia_number_socket(dev_info_t *dip, int localsocket)
 	dev_info_t *child = NULL;
 	struct pcmcia_parent_private *ppd;
 
-	if (ndi_devi_alloc(dip, "pcs", (dnode_t)DEVI_SID_NODEID,
+	if (ndi_devi_alloc(dip, "pcs", (pnode_t)DEVI_SID_NODEID,
 	    &child) == NDI_SUCCESS) {
 		ppd = kmem_zalloc(sizeof (struct pcmcia_parent_private),
 		    KM_SLEEP);
@@ -3224,7 +3224,7 @@ pcmcia_init_devinfo(dev_info_t *pdip, struct pcm_device_info *info)
 		else
 			name = info->pd_bind_name;
 
-		if (ndi_devi_alloc(pdip, name, (dnode_t)DEVI_SID_NODEID,
+		if (ndi_devi_alloc(pdip, name, (pnode_t)DEVI_SID_NODEID,
 		    &dip) !=
 		    NDI_SUCCESS) {
 			cmn_err(CE_WARN,
diff --git a/usr/src/uts/common/rpc/xdr_mem.c b/usr/src/uts/common/rpc/xdr_mem.c
index bb2590b3f3be..32ff32daaa07 100644
--- a/usr/src/uts/common/rpc/xdr_mem.c
+++ b/usr/src/uts/common/rpc/xdr_mem.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1989 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,7 +56,7 @@ static struct xdr_ops *xdrmem_ops(void);
  * memory buffer.
  */
 void
-xdrmem_create(XDR *xdrs, caddr_t addr, u_int size, enum xdr_op op)
+xdrmem_create(XDR *xdrs, caddr_t addr, uint_t size, enum xdr_op op)
 {
 	xdrs->x_op = op;
 	xdrs->x_ops = xdrmem_ops();
@@ -113,14 +113,14 @@ xdrmem_putbytes(XDR *xdrs, caddr_t addr, int len)
 	return (TRUE);
 }
 
-static u_int
+static uint_t
 xdrmem_getpos(XDR *xdrs)
 {
-	return ((u_int)((uintptr_t)xdrs->x_private - (uintptr_t)xdrs->x_base));
+	return ((uint_t)((uintptr_t)xdrs->x_private - (uintptr_t)xdrs->x_base));
 }
 
 static bool_t
-xdrmem_setpos(XDR *xdrs, u_int pos)
+xdrmem_setpos(XDR *xdrs, uint_t pos)
 {
 	caddr_t newaddr = xdrs->x_base + pos;
 	caddr_t lastaddr = xdrs->x_private + xdrs->x_handy;
@@ -142,7 +142,7 @@ xdrmem_inline(XDR *xdrs, int len)
 	if (xdrs->x_handy >= len) {
 		xdrs->x_handy -= len;
 		/* LINTED pointer alignment */
-		buf = (rpc_inline_t *) xdrs->x_private;
+		buf = (rpc_inline_t *)xdrs->x_private;
 		xdrs->x_private += len;
 	}
 	return (buf);
@@ -151,10 +151,18 @@ xdrmem_inline(XDR *xdrs, int len)
 static bool_t
 xdrmem_control(XDR *xdrs, int request, void *info)
 {
+	xdr_bytesrec *xptr;
 	int32_t *int32p;
 	int len;
 
 	switch (request) {
+
+	case XDR_GET_BYTES_AVAIL:
+		xptr = (xdr_bytesrec *)info;
+		xptr->xc_is_last_record = TRUE;
+		xptr->xc_num_avail = xdrs->x_handy;
+		return (TRUE);
+
 	case XDR_PEEK:
 		/*
 		 * Return the next 4 byte unit in the XDR stream.
@@ -177,9 +185,8 @@ xdrmem_control(XDR *xdrs, int request, void *info)
 		xdrs->x_private += len;
 		return (TRUE);
 
-	default:
-		return (FALSE);
 	}
+	return (FALSE);
 }
 
 static struct xdr_ops *
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 3bebf87a416e..4af9e19f1219 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -720,7 +720,8 @@ FSHDRS=				\
 	ufs_prot.h		\
 	ufs_quota.h		\
 	ufs_snap.h              \
-	ufs_trans.h
+	ufs_trans.h		\
+	zfs.h
 
 PCMCIAHDRS=		\
 	pcata.h		\
diff --git a/usr/src/uts/common/sys/acl.h b/usr/src/uts/common/sys/acl.h
index e6a7e43c7757..7c34d887531c 100644
--- a/usr/src/uts/common/sys/acl.h
+++ b/usr/src/uts/common/sys/acl.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,11 +44,13 @@ typedef struct acl {
 
 typedef struct ace {
 	uid_t		a_who;		/* uid or gid */
-	uint32_t	a_access_mask;	/* "rwx" */
+	uint32_t	a_access_mask;	/* read,write,... */
 	uint16_t	a_flags;	/* see below */
 	uint16_t	a_type;		/* allow or deny */
 } ace_t;
 
+typedef struct acl_info acl_t;
+
 /*
  * The following are Defined types for an aclent_t.
  */
@@ -75,37 +77,61 @@ typedef struct ace {
 /*
  * The following are defined for ace_t.
  */
+#define	ACE_READ_DATA		0x00000001
+#define	ACE_LIST_DIRECTORY	0x00000001
+#define	ACE_WRITE_DATA		0x00000002
+#define	ACE_ADD_FILE		0x00000002
+#define	ACE_APPEND_DATA		0x00000004
+#define	ACE_ADD_SUBDIRECTORY	0x00000004
+#define	ACE_READ_NAMED_ATTRS	0x00000008
+#define	ACE_WRITE_NAMED_ATTRS	0x00000010
+#define	ACE_EXECUTE		0x00000020
+#define	ACE_DELETE_CHILD	0x00000040
+#define	ACE_READ_ATTRIBUTES	0x00000080
+#define	ACE_WRITE_ATTRIBUTES	0x00000100
+#define	ACE_DELETE		0x00010000
+#define	ACE_READ_ACL		0x00020000
+#define	ACE_WRITE_ACL		0x00040000
+#define	ACE_WRITE_OWNER		0x00080000
+#define	ACE_SYNCHRONIZE		0x00100000
+
 #define	ACE_FILE_INHERIT_ACE		0x0001
 #define	ACE_DIRECTORY_INHERIT_ACE	0x0002
-#define	ACE_NO_PROPOGATE_INHERIT_ACE	0x0004
+#define	ACE_NO_PROPAGATE_INHERIT_ACE	0x0004
 #define	ACE_INHERIT_ONLY_ACE		0x0008
-#define	ACE_LOCALLY_DEFINED		0x0010
-#define	ACE_OWNER			0x0100 /* file owner */
-#define	ACE_GROUP			0x0200 /* file group */
-#define	ACE_OTHER			0x0400 /* other field */
-#define	ACE_USER			0x0800 /* additional users */
-#define	ACE_GROUPS			0x1000 /* additional groups */
+#define	ACE_SUCCESSFUL_ACCESS_ACE_FLAG	0x0010
+#define	ACE_FAILED_ACCESS_ACE_FLAG	0x0020
+#define	ACE_IDENTIFIER_GROUP		0x0040
+#define	ACE_OWNER			0x1000
+#define	ACE_GROUP			0x2000
+#define	ACE_EVERYONE			0x4000
+
+#define	ACE_ACCESS_ALLOWED_ACE_TYPE	0x0000
+#define	ACE_ACCESS_DENIED_ACE_TYPE	0x0001
+#define	ACE_SYSTEM_AUDIT_ACE_TYPE	0x0002
+#define	ACE_SYSTEM_ALARM_ACE_TYPE	0x0003
+
+#define	ACE_ALL_PERMS	(ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+    ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+    ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+    ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
+    ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
+
 /*
  * The following flags are supported by both NFSv4 ACLs and ace_t.
  */
 #define	ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \
     ACE_DIRECTORY_INHERIT_ACE | \
-    ACE_NO_PROPOGATE_INHERIT_ACE | \
-    ACE_INHERIT_ONLY_ACE)
-
-#define	ALLOW	0
-#define	DENY	1
-
-#define	ACE_READ_DATA	04	/* 'r'	 */
-#define	ACE_WRITE_DATA	02	/* 'w'	 */
-#define	ACE_EXECUTE	01	/* 'x'	 */
+    ACE_NO_PROPAGATE_INHERIT_ACE | \
+    ACE_INHERIT_ONLY_ACE | \
+    ACE_IDENTIFIER_GROUP)
 
 /* cmd args to acl(2) for aclent_t  */
 #define	GETACL			1
 #define	SETACL			2
 #define	GETACLCNT		3
 
-/* cmd's to manipulate ace acl's. */
+/* cmd's to manipulate ace acls. */
 #define	ACE_GETACL		4
 #define	ACE_SETACL		5
 #define	ACE_GETACLCNT		6
@@ -125,6 +151,7 @@ typedef struct ace {
 #define	MEM_ERROR		7
 #define	ENTRY_ERROR		8
 
+
 /*
  * similar to ufs_acl.h: changed to char type for user commands (tar, cpio)
  * Attribute types
@@ -132,6 +159,43 @@ typedef struct ace {
 #define	UFSD_FREE	('0')	/* Free entry */
 #define	UFSD_ACL	('1')	/* Access Control Lists */
 #define	UFSD_DFACL	('2')	/* reserved for future use */
+#define	ACE_ACL		('3')	/* ace_t style acls */
+
+/*
+ * flag to [f]acl_get()
+ * controls whether a trivial acl should be returned.
+ */
+#define	ACL_NO_TRIVIAL	0x2
+
+/*
+ * Legacy aclcheck errors for aclent_t ACLs
+ */
+#define	EACL_GRP_ERROR		GRP_ERROR
+#define	EACL_USER_ERROR		USER_ERROR
+#define	EACL_OTHER_ERROR	OTHER_ERROR
+#define	EACL_CLASS_ERROR	CLASS_ERROR
+#define	EACL_DUPLICATE_ERROR	DUPLICATE_ERROR
+#define	EACL_MISS_ERROR		MISS_ERROR
+#define	EACL_MEM_ERROR		MEM_ERROR
+#define	EACL_ENTRY_ERROR	ENTRY_ERROR
+
+#define	EACL_INHERIT_ERROR	9		/* invalid inherit flags */
+#define	EACL_FLAGS_ERROR	10		/* unknown flag value */
+#define	EACL_PERM_MASK_ERROR	11		/* unknown permission */
+#define	EACL_COUNT_ERROR	12		/* invalid acl count */
+
+#define	EACL_INVALID_SLOT	13		/* invalid acl slot */
+#define	EACL_NO_ACL_ENTRY	14		/* Entry doesn't exist */
+#define	EACL_DIFF_TYPE		15		/* acls aren't same type */
+
+#define	EACL_INVALID_USER_GROUP	16		/* need user/group name */
+#define	EACL_INVALID_STR	17		/* invalid acl string */
+#define	EACL_FIELD_NOT_BLANK	18		/* can't have blank field */
+#define	EACL_INVALID_ACCESS_TYPE 19		/* invalid access type */
+#define	EACL_UNKNOWN_DATA	20		/* Unrecognized data in ACL */
+#define	EACL_MISSING_FIELDS	21		/* missing fields in acl */
+
+#define	EACL_INHERIT_NOTDIR	22		/* Need dir for inheritance */
 
 extern int aclcheck(aclent_t *, int, int *);
 extern int acltomode(aclent_t *, int, mode_t *);
@@ -139,6 +203,16 @@ extern int aclfrommode(aclent_t *, int, mode_t *);
 extern int aclsort(int, int, aclent_t *);
 extern char *acltotext(aclent_t *, int);
 extern aclent_t *aclfromtext(char *, int *);
+extern void acl_free(acl_t *);
+extern int acl_get(const char *, int, acl_t **);
+extern int facl_get(int, int, acl_t **);
+extern int acl_set(const char *, acl_t *acl);
+extern int facl_set(int, acl_t *acl);
+extern int acl_strip(const char *, uid_t, gid_t, mode_t);
+extern int acl_trivial(const char *);
+extern char *acl_totext(acl_t *);
+extern int acl_fromtext(const char *, acl_t **);
+extern int acl_check(acl_t *, int);
 
 #else	/* !defined(_KERNEL) */
 
diff --git a/usr/src/uts/common/sys/autoconf.h b/usr/src/uts/common/sys/autoconf.h
index a3ff515d1d0e..73082ab4129f 100644
--- a/usr/src/uts/common/sys/autoconf.h
+++ b/usr/src/uts/common/sys/autoconf.h
@@ -236,7 +236,7 @@ extern int exclude_level(int);
 
 extern major_t path_to_major(char *);
 extern void i_ddi_node_cache_init(void);
-extern dev_info_t *i_ddi_alloc_node(dev_info_t *, char *, dnode_t, int,
+extern dev_info_t *i_ddi_alloc_node(dev_info_t *, char *, pnode_t, int,
     ddi_prop_t *, int);
 extern void i_ddi_forceattach_drivers(void);
 extern int i_ddi_io_initialized(void);
diff --git a/usr/src/uts/common/sys/avl.h b/usr/src/uts/common/sys/avl.h
index f3dbce7d9d6d..bf9af8948a8f 100644
--- a/usr/src/uts/common/sys/avl.h
+++ b/usr/src/uts/common/sys/avl.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -78,7 +78,7 @@ extern "C" {
  *
  * followed by any mixture of:
  *
- * 2a. Insert nodes with: avl_find() and avl_insert()
+ * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert()
  *
  * 2b. Visited elements with:
  *	 avl_first() - returns the lowest valued node
@@ -89,7 +89,7 @@ extern "C" {
  * 2c.  Find the node with the closest value either less than or greater
  *	than a given value with avl_nearest().
  *
- * 2d. Remove individual nodes from the list/tree with avl_remove.
+ * 2d. Remove individual nodes from the list/tree with avl_remove().
  *
  * and finally when the list is being destroyed
  *
@@ -235,7 +235,17 @@ extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction);
 
 
 /*
- * Remove a single node from the tree.
+ * Add a single node to the tree.
+ * The node must not be in the tree, and it must not
+ * compare equal to any other node already in the tree.
+ *
+ * node   - the node to add
+ */
+extern void avl_add(avl_tree_t *tree, void *node);
+
+
+/*
+ * Remove a single node from the tree.  The node must be in the tree.
  *
  * node   - the node to remove
  */
diff --git a/usr/src/uts/common/sys/debug.h b/usr/src/uts/common/sys/debug.h
index a114dc0e9912..cc419e51324e 100644
--- a/usr/src/uts/common/sys/debug.h
+++ b/usr/src/uts/common/sys/debug.h
@@ -20,12 +20,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
-/*	  All Rights Reserved  	*/
+/*	  All Rights Reserved	*/
 
 
 #ifndef _SYS_DEBUG_H
@@ -34,6 +34,7 @@
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/isa_defs.h>
+#include <sys/types.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -43,29 +44,22 @@ extern "C" {
  * ASSERT(ex) causes a panic or debugger entry if expression ex is not
  * true.  ASSERT() is included only for debugging, and is a no-op in
  * production kernels.  VERIFY(ex), on the other hand, behaves like
- * ASSERT on debug kernels but evaluates the expression on non-debug
- * kernels.
+ * ASSERT and is evaluated on both debug and non-debug kernels.
  */
 
-#ifdef _KERNEL
-#if DEBUG
-#define	VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
-#else
-#define	VERIFY(EX) ((void)(EX))
-#endif
-#endif
-
 #if defined(__STDC__)
 extern int assfail(const char *, const char *, int);
+#define	VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
 #if DEBUG
-#define	ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
+#define	ASSERT(EX) VERIFY(EX)
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
 #else	/* defined(__STDC__) */
 extern int assfail();
+#define	VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
 #if DEBUG
-#define	ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
+#define	ASSERT(EX) VERIFY(EX)
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
@@ -82,6 +76,39 @@ extern int assfail();
 #define	ASSERT32(x)	ASSERT(x)
 #endif
 
+/*
+ * ASSERT3() behaves like ASSERT() except that it is an explicit conditional,
+ * and prints out the values of the left and right hand expressions as part of
+ * the panic message to ease debugging.  The three variants imply the type
+ * of their arguments.  ASSERT3S() is for signed data types, ASSERT3U() is
+ * for unsigned, and ASSERT3P() is for pointers.  The VERIFY3*() macros
+ * have the same relationship as above.
+ */
+extern void assfail3(const char *, uintmax_t, const char *, uintmax_t,
+    const char *, int);
+#define	VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
+	const TYPE __left = (TYPE)(LEFT); \
+	const TYPE __right = (TYPE)(RIGHT); \
+	if (!(__left OP __right)) \
+		assfail3(#LEFT " " #OP " " #RIGHT, \
+			(uintmax_t)__left, #OP, (uintmax_t)__right, \
+			__FILE__, __LINE__); \
+_NOTE(CONSTCOND) } while (0)
+
+
+#define	VERIFY3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
+#define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
+#define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
+#if DEBUG
+#define	ASSERT3S(x, y, z)	VERIFY3S(x, y, z)
+#define	ASSERT3U(x, y, z)	VERIFY3U(x, y, z)
+#define	ASSERT3P(x, y, z)	VERIFY3P(x, y, z)
+#else
+#define	ASSERT3S(x, y, z)	((void)0)
+#define	ASSERT3U(x, y, z)	((void)0)
+#define	ASSERT3P(x, y, z)	((void)0)
+#endif
+
 #ifdef	_KERNEL
 
 extern void abort_sequence_enter(char *);
diff --git a/usr/src/uts/common/sys/dkio.h b/usr/src/uts/common/sys/dkio.h
index 546976eea45a..b3de6726c831 100644
--- a/usr/src/uts/common/sys/dkio.h
+++ b/usr/src/uts/common/sys/dkio.h
@@ -166,6 +166,33 @@ struct dk_geom {
 #define	DKIOCGVTOC	(DKIOC|11)		/* Get VTOC */
 #define	DKIOCSVTOC	(DKIOC|12)		/* Set VTOC & Write to Disk */
 
+/*
+ * Disk Cache Controls.  These ioctls should be supported by
+ * all disk drivers.
+ *
+ * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl
+ * argument, but it should be passed as NULL to allow for future
+ * reinterpretation.  From user-mode, this ioctl request is synchronous.
+ *
+ * When invoked from within the kernel, the arg can be NULL to indicate
+ * a synchronous request or can be the address of a struct dk_callback
+ * to request an asynchronous callback when the flush request is complete.
+ * In this case, the flag to the ioctl must include FKIOCTL and the
+ * dkc_callback field of the pointed to struct must be non-null or the
+ * request is made synchronously.
+ *
+ * In the callback case: if the ioctl returns 0, a callback WILL be performed.
+ * If the ioctl returns non-zero, a callback will NOT be performed.
+ * NOTE: In some cases, the callback may be done BEFORE the ioctl call
+ * returns.  The caller's locking strategy should be prepared for this case.
+ */
+#define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
+
+struct dk_callback {
+	void (*dkc_callback)(void *dkc_cookie, int error);
+	void *dkc_cookie;
+};
+
 /*
  * The following ioctls are used by Sun drivers to communicate
  * with their associated format routines. Support of these ioctls
diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h
index 1b1853e17ba6..0db566bd5066 100644
--- a/usr/src/uts/common/sys/dktp/dadk.h
+++ b/usr/src/uts/common/sys/dktp/dadk.h
@@ -45,7 +45,9 @@ struct	dadk {
 	unsigned dad_rmb : 1;		/* removable device		*/
 	unsigned dad_rdonly : 1;	/* read only device		*/
 	unsigned dad_cdrom : 1;		/* cdrom device			*/
-	unsigned dad_resv : 5;
+	unsigned dad_noflush : 1;	/* flush cmd unsupported	*/
+	unsigned dad_wce : 1;		/* disk write cache enabled	*/
+	unsigned dad_resv : 3;
 	unsigned char dad_type;		/* device type			*/
 	unsigned char dad_ctype;	/* controller type 		*/
 
@@ -74,6 +76,7 @@ struct	dadk {
  */
 #define	DADK_BSY_TIMEOUT	(drv_usectohz(5 * 1000000))
 #define	DADK_IO_TIME		35
+#define	DADK_FLUSH_CACHE_TIME	60
 #define	DADK_RETRY_COUNT	5
 #define	DADK_SILENT		1
 
@@ -97,6 +100,7 @@ int dadk_open(opaque_t objp, int flag);
 int dadk_close(opaque_t objp);
 int dadk_ioctl(opaque_t objp, dev_t dev, int cmd, intptr_t arg,
     int flag, cred_t *cred_p, int *rval_p);
+int dadk_flushdone(struct buf *bp);
 int dadk_strategy(opaque_t objp, struct buf *bp);
 int dadk_setgeom(opaque_t objp, struct tgdk_geom *dkgeom_p);
 int dadk_getgeom(opaque_t objp, struct tgdk_geom *dkgeom_p);
diff --git a/usr/src/uts/common/sys/dktp/dadkio.h b/usr/src/uts/common/sys/dktp/dadkio.h
index d5ff7c8cc5da..a6c2e792d1bf 100644
--- a/usr/src/uts/common/sys/dktp/dadkio.h
+++ b/usr/src/uts/common/sys/dktp/dadkio.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,7 +39,7 @@ extern "C" {
 #define	DIOCTL_GETMODEL		3	/* get model number		*/
 #define	DIOCTL_GETSERIAL	4	/* get serial number		*/
 #define	DIOCTL_RWCMD		5	/* read/write a disk		*/
-
+#define	DIOCTL_GETWCE		6	/* get write cache enabled state */
 /*
  * arg structure for DIOCTL_GETMODEL and DIOCTL_GETSERIAL
  * On input to the ioctl, is_size contains the size of the buffer
@@ -92,6 +92,8 @@ typedef struct dadk_ioc_string
 #define	DCMD_READOFFSET	24	/* cdrom read offset			*/
 #define	DCMD_READMODE2	25	/* cdrom mode 2				*/
 #define	DCMD_VOLCTRL	26	/* cdrom volume control			*/
+/* additional disk commands */
+#define	DCMD_FLUSH_CACHE 27	/* flush write cache to physical medium	*/
 
 /*	driver error code						*/
 #define	DERR_SUCCESS	0	/* success				*/
diff --git a/usr/src/uts/common/sys/esunddi.h b/usr/src/uts/common/sys/esunddi.h
index e057d7e36e01..3cde0b2b14fa 100644
--- a/usr/src/uts/common/sys/esunddi.h
+++ b/usr/src/uts/common/sys/esunddi.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -211,7 +211,7 @@ e_ddi_walk_driver(char *, int (*f)(dev_info_t *, void *), void *);
  * NOTE: .conf nodeids are not valid arguments to this function.
  */
 dev_info_t *
-e_ddi_nodeid_to_dip(dnode_t nodeid);
+e_ddi_nodeid_to_dip(pnode_t nodeid);
 
 /*
  * Obsolete interfaces, no longer used, to be removed.
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
new file mode 100644
index 000000000000..b40f4789b68b
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -0,0 +1,308 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_FS_ZFS_H
+#define	_SYS_FS_ZFS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Types and constants shared between userland and the kernel.
+ */
+
+/*
+ * Each dataset can be one of the following types.  These constants can be
+ * combined into masks that can be passed to various functions.
+ */
+typedef enum {
+	ZFS_TYPE_FILESYSTEM	= 0x1,
+	ZFS_TYPE_SNAPSHOT	= 0x2,
+	ZFS_TYPE_VOLUME		= 0x4
+} zfs_type_t;
+
+#define	ZFS_TYPE_ANY	\
+	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+
+/*
+ * Properties are identified by these constants.  They are arranged in order of
+ * how they should be displayed by 'zfs get'.  If you make any changes to this
+ * list, be sure to update the property table in usr/src/common/zfs/zfs_prop.c.
+ */
+typedef enum {
+	ZFS_PROP_INVAL = -1,
+	ZFS_PROP_TYPE,
+	ZFS_PROP_CREATION,
+	ZFS_PROP_USED,
+	ZFS_PROP_AVAILABLE,
+	ZFS_PROP_REFERENCED,
+	ZFS_PROP_COMPRESSRATIO,
+	ZFS_PROP_MOUNTED,
+	ZFS_PROP_ORIGIN,
+	ZFS_PROP_QUOTA,
+	ZFS_PROP_RESERVATION,
+	ZFS_PROP_VOLSIZE,
+	ZFS_PROP_VOLBLOCKSIZE,
+	ZFS_PROP_RECORDSIZE,
+	ZFS_PROP_MOUNTPOINT,
+	ZFS_PROP_SHARENFS,
+	ZFS_PROP_CHECKSUM,
+	ZFS_PROP_COMPRESSION,
+	ZFS_PROP_ATIME,
+	ZFS_PROP_DEVICES,
+	ZFS_PROP_EXEC,
+	ZFS_PROP_SETUID,
+	ZFS_PROP_READONLY,
+	ZFS_PROP_ZONED,
+	ZFS_PROP_SNAPDIR,
+	ZFS_PROP_ACLMODE,
+	ZFS_PROP_ACLINHERIT,
+	/*
+	 * The following properties are not exposed to the user, but are
+	 * accessible by libzfs clients.
+	 */
+	ZFS_PROP_CREATETXG,
+	ZFS_PROP_NAME,
+	ZFS_NPROP_ALL
+} zfs_prop_t;
+
+#define	ZFS_NPROP_VISIBLE	ZFS_PROP_CREATETXG
+
+/*
+ * The following functions are shared between libzfs and the kernel.
+ */
+zfs_prop_t zfs_name_to_prop(const char *);
+int zfs_prop_readonly(zfs_prop_t);
+void zfs_prop_default_string(zfs_prop_t, char *, size_t);
+uint64_t zfs_prop_default_numeric(zfs_prop_t);
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration.
+ */
+#define	ZPOOL_CONFIG_VERSION		"version"
+#define	ZPOOL_CONFIG_POOL_NAME		"name"
+#define	ZPOOL_CONFIG_POOL_STATE		"state"
+#define	ZPOOL_CONFIG_POOL_TXG		"txg"
+#define	ZPOOL_CONFIG_POOL_GUID		"pool_guid"
+#define	ZPOOL_CONFIG_CREATE_TXG		"create_txg"
+#define	ZPOOL_CONFIG_TOP_GUID		"top_guid"
+#define	ZPOOL_CONFIG_POOL_HEALTH	"pool_health"
+#define	ZPOOL_CONFIG_VDEV_TREE		"vdev_tree"
+#define	ZPOOL_CONFIG_TYPE		"type"
+#define	ZPOOL_CONFIG_CHILDREN		"children"
+#define	ZPOOL_CONFIG_ID			"id"
+#define	ZPOOL_CONFIG_GUID		"guid"
+#define	ZPOOL_CONFIG_PATH		"path"
+#define	ZPOOL_CONFIG_DEVID		"devid"
+#define	ZPOOL_CONFIG_METASLAB_ARRAY	"metaslab_array"
+#define	ZPOOL_CONFIG_METASLAB_SHIFT	"metaslab_shift"
+#define	ZPOOL_CONFIG_ASHIFT		"ashift"
+#define	ZPOOL_CONFIG_ASIZE		"asize"
+#define	ZPOOL_CONFIG_DTL		"DTL"
+#define	ZPOOL_CONFIG_STATS		"stats"
+
+#define	VDEV_TYPE_ROOT			"root"
+#define	VDEV_TYPE_MIRROR		"mirror"
+#define	VDEV_TYPE_REPLACING		"replacing"
+#define	VDEV_TYPE_RAIDZ			"raidz"
+#define	VDEV_TYPE_DISK			"disk"
+#define	VDEV_TYPE_FILE			"file"
+#define	VDEV_TYPE_MISSING		"missing"
+
+/*
+ * This is needed in userland to report the minimum necessary device size.
+ */
+#define	SPA_MINDEVSIZE		(64ULL << 20)
+
+/*
+ * The location of the pool configuration repository, shared between kernel and
+ * userland.
+ */
+#define	ZPOOL_CACHE_DIR		"/etc/zfs"
+#define	ZPOOL_CACHE_FILE	"zpool.cache"
+#define	ZPOOL_CACHE_TMP		".zpool.cache"
+
+#define	ZPOOL_CACHE		ZPOOL_CACHE_DIR "/" ZPOOL_CACHE_FILE
+
+/*
+ * vdev states are ordered from least to most healthy.
+ * A vdev that's CANT_OPEN or below is considered unusable.
+ */
+typedef enum vdev_state {
+	VDEV_STATE_UNKNOWN = 0,	/* Uninitialized vdev			*/
+	VDEV_STATE_CLOSED,	/* Not currently open			*/
+	VDEV_STATE_OFFLINE,	/* Not allowed to open			*/
+	VDEV_STATE_CANT_OPEN,	/* Tried to open, but failed		*/
+	VDEV_STATE_DEGRADED,	/* Replicated vdev with unhealthy kids	*/
+	VDEV_STATE_HEALTHY	/* Presumed good			*/
+} vdev_state_t;
+
+/*
+ * vdev aux states.  When a vdev is in the CANT_OPEN state, the aux field
+ * of the vdev stats structure uses these constants to distinguish why.
+ */
+typedef enum vdev_aux {
+	VDEV_AUX_NONE,		/* no error				*/
+	VDEV_AUX_OPEN_FAILED,	/* ldi_open_*() or vn_open() failed	*/
+	VDEV_AUX_CORRUPT_DATA,	/* bad label or disk contents		*/
+	VDEV_AUX_NO_REPLICAS,	/* insufficient number of replicas	*/
+	VDEV_AUX_BAD_GUID_SUM,	/* vdev guid sum doesn't match		*/
+	VDEV_AUX_TOO_SMALL,	/* vdev size is too small		*/
+	VDEV_AUX_BAD_LABEL	/* the label is OK but invalid		*/
+} vdev_aux_t;
+
+/*
+ * pool state.  The following states are actually written to disk as part of the
+ * normal SPA lifecycle: ACTIVE, EXPORTED, DESTROYED.  The remaining states
+ * (UNINITIALIZED, UNAVAIL) are software abstractions used at various levels to
+ * communicate pool state.
+ */
+typedef enum pool_state {
+	POOL_STATE_ACTIVE = 0,		/* In active use		*/
+	POOL_STATE_EXPORTED,		/* Explicitly exported		*/
+	POOL_STATE_DESTROYED,		/* Explicitly destroyed		*/
+	POOL_STATE_UNINITIALIZED,	/* Internal spa_t state		*/
+	POOL_STATE_UNAVAIL		/* Internal libzfs state	*/
+} pool_state_t;
+
+/*
+ * Scrub types.
+ */
+typedef enum pool_scrub_type {
+	POOL_SCRUB_NONE,
+	POOL_SCRUB_RESILVER,
+	POOL_SCRUB_EVERYTHING,
+	POOL_SCRUB_TYPES
+} pool_scrub_type_t;
+
+/*
+ * ZIO types.  Needed to interpret vdev statistics below.
+ */
+typedef enum zio_type {
+	ZIO_TYPE_NULL = 0,
+	ZIO_TYPE_READ,
+	ZIO_TYPE_WRITE,
+	ZIO_TYPE_FREE,
+	ZIO_TYPE_CLAIM,
+	ZIO_TYPE_IOCTL,
+	ZIO_TYPES
+} zio_type_t;
+
+/*
+ * Vdev statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct vdev_stat {
+	hrtime_t	vs_timestamp;		/* time since vdev load	*/
+	uint64_t	vs_state;		/* vdev state		*/
+	uint64_t	vs_aux;			/* see vdev_aux_t	*/
+	uint64_t	vs_alloc;		/* space allocated	*/
+	uint64_t	vs_space;		/* total capacity	*/
+	uint64_t	vs_ops[ZIO_TYPES];	/* operation count	*/
+	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written	*/
+	uint64_t	vs_read_errors;		/* read errors		*/
+	uint64_t	vs_write_errors;	/* write errors		*/
+	uint64_t	vs_checksum_errors;	/* checksum errors	*/
+	uint64_t	vs_self_healed;		/* self-healed bytes	*/
+	uint64_t	vs_scrub_type;		/* pool_scrub_type_t	*/
+	uint64_t	vs_scrub_complete;	/* completed?		*/
+	uint64_t	vs_scrub_examined;	/* bytes examined; top	*/
+	uint64_t	vs_scrub_repaired;	/* bytes repaired; leaf	*/
+	uint64_t	vs_scrub_errors;	/* errors during scrub	*/
+	uint64_t	vs_scrub_start;		/* UTC scrub start time	*/
+	uint64_t	vs_scrub_end;		/* UTC scrub end time	*/
+} vdev_stat_t;
+
+#define	ZFS_DRIVER	"zfs"
+#define	ZFS_DEV		"/dev/zfs"
+
+/*
+ * zvol paths.  Irritatingly, the devfsadm interfaces want all these
+ * paths without the /dev prefix, but for some things, we want the
+ * /dev prefix.  Below are the names without /dev.
+ */
+#define	ZVOL_DEV_DIR	"zvol/dsk"
+#define	ZVOL_RDEV_DIR	"zvol/rdsk"
+
+/*
+ * And here are the things we need with /dev, etc. in front of them.
+ */
+#define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zvol@0:"
+#define	ZVOL_FULL_DEV_DIR	"/dev/" ZVOL_DEV_DIR
+
+#define	ZVOL_PROP_NAME		"name"
+
+/*
+ * /dev/zfs ioctl numbers.
+ */
+#define	ZFS_IOC		('Z' << 8)
+
+typedef enum zfs_ioc {
+	ZFS_IOC_POOL_CREATE = ZFS_IOC,
+	ZFS_IOC_POOL_DESTROY,
+	ZFS_IOC_POOL_IMPORT,
+	ZFS_IOC_POOL_EXPORT,
+	ZFS_IOC_POOL_CONFIGS,
+	ZFS_IOC_POOL_GUID,
+	ZFS_IOC_POOL_STATS,
+	ZFS_IOC_POOL_TRYIMPORT,
+	ZFS_IOC_POOL_SCRUB,
+	ZFS_IOC_POOL_FREEZE,
+	ZFS_IOC_VDEV_ADD,
+	ZFS_IOC_VDEV_REMOVE,
+	ZFS_IOC_VDEV_ONLINE,
+	ZFS_IOC_VDEV_OFFLINE,
+	ZFS_IOC_VDEV_ATTACH,
+	ZFS_IOC_VDEV_DETACH,
+	ZFS_IOC_OBJSET_STATS,
+	ZFS_IOC_DATASET_LIST_NEXT,
+	ZFS_IOC_SNAPSHOT_LIST_NEXT,
+	ZFS_IOC_SET_PROP,
+	ZFS_IOC_SET_QUOTA,
+	ZFS_IOC_SET_RESERVATION,
+	ZFS_IOC_SET_VOLSIZE,
+	ZFS_IOC_SET_VOLBLOCKSIZE,
+	ZFS_IOC_CREATE_MINOR,
+	ZFS_IOC_REMOVE_MINOR,
+	ZFS_IOC_CREATE,
+	ZFS_IOC_DESTROY,
+	ZFS_IOC_ROLLBACK,
+	ZFS_IOC_RENAME,
+	ZFS_IOC_RECVBACKUP,
+	ZFS_IOC_SENDBACKUP
+} zfs_ioc_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_H */
diff --git a/usr/src/uts/common/sys/kmem.h b/usr/src/uts/common/sys/kmem.h
index b0e0e030e75d..097e92f2e51a 100644
--- a/usr/src/uts/common/sys/kmem.h
+++ b/usr/src/uts/common/sys/kmem.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -92,6 +92,7 @@ extern void kmem_thread_init(void);
 extern void kmem_mp_init(void);
 extern void kmem_reap(void);
 extern void kmem_reap_idspace(void);
+extern int kmem_debugging(void);
 extern size_t kmem_avail(void);
 extern size_t kmem_maxavail(void);
 
@@ -102,6 +103,7 @@ extern void kmem_cache_destroy(kmem_cache_t *);
 extern void *kmem_cache_alloc(kmem_cache_t *, int);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern uint64_t kmem_cache_stat(kmem_cache_t *, char *);
+extern void kmem_cache_reap_now(kmem_cache_t *);
 
 #endif	/* _KERNEL */
 
diff --git a/usr/src/uts/common/sys/list.h b/usr/src/uts/common/sys/list.h
index dc22eb1abd63..7e9d9aaaf750 100644
--- a/usr/src/uts/common/sys/list.h
+++ b/usr/src/uts/common/sys/list.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,9 +36,6 @@ extern "C" {
 #endif
 
 typedef struct list_node list_node_t;
-struct list;
-
-#ifdef _KERNEL
 typedef struct list list_t;
 
 void list_create(list_t *, size_t, size_t);
@@ -56,7 +53,8 @@ void *list_tail(list_t *);
 void *list_next(list_t *, void *);
 void *list_prev(list_t *, void *);
 
-#endif /* _KERNEL */
+int list_link_active(list_node_t *);
+int list_is_empty(list_t *);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h
index 75217cabda1c..ef1e29743019 100644
--- a/usr/src/uts/common/sys/mntent.h
+++ b/usr/src/uts/common/sys/mntent.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
  *	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
@@ -40,6 +40,7 @@ extern "C" {
 #define	VFSTAB		"/etc/vfstab"
 #define	MNTMAXSTR	128
 
+#define	MNTTYPE_ZFS	"zfs"		/* ZFS file system */
 #define	MNTTYPE_UFS	"ufs"		/* Unix file system */
 #define	MNTTYPE_NFS	"nfs"		/* NFS file system */
 #define	MNTTYPE_NFS3	"nfs3"		/* NFS Version 3 file system */
@@ -116,7 +117,8 @@ extern "C" {
 #define	MNTOPT_PUBLIC	"public"	/* Use NFS public file handlee */
 #define	MNTOPT_LOGGING "logging" 	/* enable logging */
 #define	MNTOPT_NOLOGGING "nologging" 	/* disable logging */
-#define	MNTOPT_NOATIME  "noatime"	/* Do not update i_atime for inodes */
+#define	MNTOPT_ATIME	"atime"		/* update atime for files */
+#define	MNTOPT_NOATIME  "noatime"	/* do not update atime for files */
 #define	MNTOPT_GLOBAL	"global"	/* Cluster-wide global mount */
 #define	MNTOPT_NOGLOBAL	"noglobal"	/* Mount local to single node */
 #define	MNTOPT_DFRATIME	"dfratime"	/* Deferred access time updates */
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 3ac69f9bda40..c26ef92d3144 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -140,6 +140,7 @@ int secpolicy_vnode_setdac(const cred_t *, uid_t);
 int secpolicy_vnode_setid_retain(const cred_t *, boolean_t);
 int secpolicy_vnode_setids_setgids(const cred_t *, gid_t);
 int secpolicy_vnode_stky_modify(const cred_t *);
+int secpolicy_zfs(const cred_t *);
 
 int secpolicy_basic_exec(const cred_t *);
 int secpolicy_basic_fork(const cred_t *);
diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h
index ee2dfeefea1f..3b2a45b09939 100644
--- a/usr/src/uts/common/sys/scsi/targets/sddef.h
+++ b/usr/src/uts/common/sys/scsi/targets/sddef.h
@@ -422,8 +422,8 @@ struct sd_lun {
 						/* a part of error recovery. */
 	    un_f_is_fibre		:1,	/* The device supports fibre */
 						/* channel */
-	    un_f_available		:1,	/* Not use */
-						/* available */
+	    un_f_sync_cache_unsupported	:1,	/* sync cache cmd not */
+						/* supported */
 	    un_f_format_in_progress	:1,	/* The device is currently */
 						/* executing a FORMAT cmd. */
 	    un_f_opt_queueing		:1,	/* Enable Command Queuing to */
@@ -433,7 +433,9 @@ struct sd_lun {
 	    un_f_opt_disable_cache	:1,	/* Read/Write disk cache is */
 						/* disabled.  */
 	    un_f_cfg_is_atapi		:1,	/* This is an ATAPI device.  */
-	    un_f_cfg_obsolete		:1,	/* available for reuse  */
+	    un_f_write_cache_enabled	:1,	/* device returns success on */
+						/* writes before transfer to */
+						/* physical media completes */
 	    un_f_cfg_playmsf_bcd	:1,	/* Play Audio, BCD params. */
 	    un_f_cfg_readsub_bcd	:1,	/* READ SUBCHANNEL BCD resp. */
 	    un_f_cfg_read_toc_trk_bcd	:1,	/* track # is BCD */
@@ -1414,6 +1416,11 @@ _NOTE(SCHEME_PROTECTS_DATA("unique per pkt", sd_xbuf))
 struct sd_uscsi_info {
 	int			ui_flags;
 	struct uscsi_cmd	*ui_cmdp;
+	/*
+	 * ui_dkc is used by sd_send_scsi_SYNCHRONIZE_CACHE() to allow
+	 * for async completion notification.
+	 */
+	struct dk_callback	ui_dkc;
 };
 
 _NOTE(SCHEME_PROTECTS_DATA("Unshared data", sd_uscsi_info))
diff --git a/usr/src/uts/common/sys/sunndi.h b/usr/src/uts/common/sys/sunndi.h
index 60f2970ce56c..09b8c83bb606 100644
--- a/usr/src/uts/common/sys/sunndi.h
+++ b/usr/src/uts/common/sys/sunndi.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -119,11 +119,11 @@ ndi_prop_remove_all(dev_info_t *dip);
  * not allowed to sleep.
  */
 int
-ndi_devi_alloc(dev_info_t *parent, char *node_name, dnode_t nodeid,
+ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid,
     dev_info_t **ret_dip);
 
 void
-ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, dnode_t nodeid,
+ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, pnode_t nodeid,
     dev_info_t **ret_dip);
 
 /*
diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h
index 4fd3444bc07b..904d7a80ecf7 100644
--- a/usr/src/uts/common/sys/vnode.h
+++ b/usr/src/uts/common/sys/vnode.h
@@ -912,7 +912,9 @@ extern uint_t pvn_vmodsort_supported;
  * Generally useful macros.
  */
 #define	VBSIZE(vp)	((vp)->v_vfsp->vfs_bsize)
-#define	VTOZ(vp)	((vp)->v_vfsp->vfs_zone)
+
+#define	VTOZONE(vp)	((vp)->v_vfsp->vfs_zone)
+
 #define	NULLVP		((struct vnode *)0)
 #define	NULLVPP		((struct vnode **)0)
 
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 0bc997a57b33..fbcbea12fafd 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -84,6 +84,8 @@ typedef struct {
 	caddr32_t rctlbuf;
 	size32_t rctlbufsz;
 	caddr32_t extended_error;
+	caddr32_t zfsbuf;
+	size32_t  zfsbufsz;
 } zone_def32;
 #endif
 typedef struct {
@@ -93,6 +95,8 @@ typedef struct {
 	const char *rctlbuf;
 	size_t rctlbufsz;
 	int *extended_error;
+	const char *zfsbuf;
+	size_t zfsbufsz;
 } zone_def;
 
 /* extended error information */
@@ -185,6 +189,14 @@ typedef struct zone_cmd_rval {
 
 struct pool;
 
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ */
+typedef struct zone_dataset {
+	char		*zd_dataset;
+	list_node_t	zd_linkage;
+} zone_dataset_t;
+
 typedef struct zone {
 	/*
 	 * zone_name is never modified once set.
@@ -259,6 +271,10 @@ typedef struct zone {
 	 */
 	int		zone_ncpus;  /* zone's idea of ncpus */
 	int		zone_ncpus_online; /* zone's idea of ncpus_online */
+	/*
+	 * List of ZFS datasets exported to this zone.
+	 */
+	list_t		zone_datasets;	/* list of datasets */
 } zone_t;
 
 /*
@@ -273,7 +289,7 @@ extern rctl_hndl_t rc_zone_nlwps;
 
 extern const char * const zone_initname;
 
-extern long zone(int, void *, void *, void *, void *, void *);
+extern long zone(int, void *, void *, void *, void *);
 extern void zone_zsd_init(void);
 extern void zone_init(void);
 extern void zone_hold(zone_t *);
@@ -424,6 +440,11 @@ extern void zone_pset_set(zone_t *, psetid_t);
 extern int zone_ncpus_get(zone_t *);
 extern int zone_ncpus_online_get(zone_t *);
 
+/*
+ * Returns true if the named pool/dataset is visible in the current zone.
+ */
+extern int zone_dataset_visible(const char *, int *);
+
 /*
  * zone version of uadmin()
  */
diff --git a/usr/src/uts/common/syscall/acl.c b/usr/src/uts/common/syscall/acl.c
index a52184ec2e13..27ab1bbc2607 100644
--- a/usr/src/uts/common/syscall/acl.c
+++ b/usr/src/uts/common/syscall/acl.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -52,6 +52,7 @@
 #include <sys/filio.h>
 #include <sys/acl.h>
 #include <sys/cmn_err.h>
+#include <acl/acl_common.h>
 
 #include <sys/unistd.h>
 #include <sys/debug.h>
@@ -216,9 +217,6 @@ cacl(int cmd, int nentries, void *aclbufp, vnode_t *vp, int *rv)
 		break;
 
 	case ACE_GETACL:
-		if (nentries < 3)
-			return (EINVAL);
-
 		if (aclbufp == NULL)
 			return (EFAULT);
 
@@ -317,7 +315,7 @@ cacl(int cmd, int nentries, void *aclbufp, vnode_t *vp, int *rv)
 		break;
 
 	case ACE_SETACL:
-		if (nentries < 3 || nentries > (MAX_ACL_ENTRIES * 2))
+		if (nentries > (MAX_ACL_ENTRIES))
 			return (EINVAL);
 
 		if (aclbufp == NULL)
@@ -357,74 +355,3 @@ cacl(int cmd, int nentries, void *aclbufp, vnode_t *vp, int *rv)
 		kmem_free(vsecattr.vsa_dfaclentp, dfaclbsize);
 	return (error);
 }
-
-
-/*
- * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
- * v = Ptr to array/vector of objs
- * n = # objs in the array
- * s = size of each obj (must be multiples of a word size)
- * f = ptr to function to compare two objs
- *	returns (-1 = less than, 0 = equal, 1 = greater than
- */
-void
-ksort(caddr_t v, int n, int s, int (*f)())
-{
-	int g, i, j, ii;
-	unsigned int *p1, *p2;
-	unsigned int tmp;
-
-	/* No work to do */
-	if (v == NULL || n <= 1)
-		return;
-
-	/* Sanity check on arguments */
-	ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
-	ASSERT(s > 0);
-	for (g = n / 2; g > 0; g /= 2) {
-		for (i = g; i < n; i++) {
-			for (j = i - g; j >= 0 &&
-				(*f)(v + j * s, v + (j + g) * s) == 1;
-					j -= g) {
-				p1 = (unsigned *)(v + j * s);
-				p2 = (unsigned *)(v + (j + g) * s);
-				for (ii = 0; ii < s / 4; ii++) {
-					tmp = *p1;
-					*p1++ = *p2;
-					*p2++ = tmp;
-				}
-			}
-		}
-	}
-}
-
-/*
- * Compare two acls, all fields.  Returns:
- * -1 (less than)
- *  0 (equal)
- * +1 (greater than)
- */
-int
-cmp2acls(void *a, void *b)
-{
-	aclent_t *x = (aclent_t *)a;
-	aclent_t *y = (aclent_t *)b;
-
-	/* Compare types */
-	if (x->a_type < y->a_type)
-		return (-1);
-	if (x->a_type > y->a_type)
-		return (1);
-	/* Equal types; compare id's */
-	if (x->a_id < y->a_id)
-		return (-1);
-	if (x->a_id > y->a_id)
-		return (1);
-	/* Equal ids; compare perms */
-	if (x->a_perm < y->a_perm)
-		return (-1);
-	if (x->a_perm > y->a_perm)
-		return (1);
-	/* Totally equal */
-	return (0);
-}
diff --git a/usr/src/uts/common/syscall/systeminfo.c b/usr/src/uts/common/syscall/systeminfo.c
index 91c8e73ee4c8..5eb42cca61ab 100644
--- a/usr/src/uts/common/syscall/systeminfo.c
+++ b/usr/src/uts/common/syscall/systeminfo.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -244,7 +244,7 @@ systeminfo(int command, char *buf, long count)
  * to locate a given nodeid in the device tree.
  */
 struct i_path_findnode {
-	dnode_t nodeid;
+	pnode_t nodeid;
 	dev_info_t *dip;
 };
 
diff --git a/usr/src/uts/i86pc/io/acpica/acpi_enum.c b/usr/src/uts/i86pc/io/acpica/acpi_enum.c
index 2385303e7fce..0d18c988bf99 100644
--- a/usr/src/uts/i86pc/io/acpica/acpi_enum.c
+++ b/usr/src/uts/i86pc/io/acpica/acpi_enum.c
@@ -568,7 +568,7 @@ get_bus_dip(char *nodename, dev_info_t *isa_dip)
 	if (i8042_dip)
 		return (i8042_dip);
 
-	ndi_devi_alloc_sleep(isa_dip, "i8042", (dnode_t)DEVI_SID_NODEID,
+	ndi_devi_alloc_sleep(isa_dip, "i8042", (pnode_t)DEVI_SID_NODEID,
 	    &i8042_dip);
 	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, i8042_dip,
 	    "reg", (int *)i8042_regs, 6);
@@ -818,7 +818,7 @@ isa_acpi_callback(ACPI_HANDLE ObjHandle, uint32_t NestingLevel, void *a,
 			dip = get_bus_dip(devname, dip);
 		}
 		ndi_devi_alloc_sleep(dip, devname,
-		    (dnode_t)DEVI_SID_NODEID, &xdip);
+		    (pnode_t)DEVI_SID_NODEID, &xdip);
 		(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
 		    "compatible", hidstr);
 		(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
@@ -830,7 +830,7 @@ isa_acpi_callback(ACPI_HANDLE ObjHandle, uint32_t NestingLevel, void *a,
 			/* a keyboard device includes PNP03xx */
 			dip = get_bus_dip(keyboard_alias, dip);
 			ndi_devi_alloc_sleep(dip, keyboard_alias,
-			    (dnode_t)DEVI_SID_NODEID, &xdip);
+			    (pnode_t)DEVI_SID_NODEID, &xdip);
 			(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
 			    "compatible", "pnpPNP,303");
 			(void) ndi_prop_update_string(DDI_DEV_T_NONE, xdip,
@@ -840,7 +840,7 @@ isa_acpi_callback(ACPI_HANDLE ObjHandle, uint32_t NestingLevel, void *a,
 				/* a mouse device include PNP0Fxx */
 				dip = get_bus_dip(mouse_alias, dip);
 				ndi_devi_alloc_sleep(dip, mouse_alias,
-				    (dnode_t)DEVI_SID_NODEID, &xdip);
+				    (pnode_t)DEVI_SID_NODEID, &xdip);
 				(void) ndi_prop_update_string(DDI_DEV_T_NONE,
 				    xdip, "compatible", "pnpPNP,f03");
 				(void) ndi_prop_update_string(DDI_DEV_T_NONE,
@@ -1017,7 +1017,7 @@ acpi_isa_device_enum(dev_info_t *isa_dip)
 	usedrdip = ddi_find_devinfo(USED_RESOURCES, -1, 0);
 	if (usedrdip == NULL) {
 		ndi_devi_alloc_sleep(ddi_root_node(), USED_RESOURCES,
-		    (dnode_t)DEVI_SID_NODEID, &usedrdip);
+		    (pnode_t)DEVI_SID_NODEID, &usedrdip);
 
 	}
 
diff --git a/usr/src/uts/i86pc/io/isa.c b/usr/src/uts/i86pc/io/isa.c
index 666a7aabcd45..af533379034e 100644
--- a/usr/src/uts/i86pc/io/isa.c
+++ b/usr/src/uts/i86pc/io/isa.c
@@ -537,7 +537,7 @@ add_known_used_resources(void)
 
 	if (usedrdip == NULL) {
 		(void) ndi_devi_alloc_sleep(ddi_root_node(), USED_RESOURCES,
-		    (dnode_t)DEVI_SID_NODEID, &usedrdip);
+		    (pnode_t)DEVI_SID_NODEID, &usedrdip);
 	}
 
 	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, usedrdip,
@@ -607,7 +607,7 @@ isa_alloc_nodes(dev_info_t *isa_dip)
 	/* serial ports */
 	for (i = 0; i < 2; i++) {
 		ndi_devi_alloc_sleep(isa_dip, "asy",
-		    (dnode_t)DEVI_SID_NODEID, &xdip);
+		    (pnode_t)DEVI_SID_NODEID, &xdip);
 		(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, xdip,
 		    "reg", (int *)&asy_regs[i], 3);
 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, xdip,
@@ -617,7 +617,7 @@ isa_alloc_nodes(dev_info_t *isa_dip)
 
 	/* parallel port */
 	ndi_devi_alloc_sleep(isa_dip, "lp",
-	    (dnode_t)DEVI_SID_NODEID, &xdip);
+	    (pnode_t)DEVI_SID_NODEID, &xdip);
 	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, xdip,
 	    "reg", (int *)&lp_regs, 3);
 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, xdip,
@@ -626,7 +626,7 @@ isa_alloc_nodes(dev_info_t *isa_dip)
 
 	/* i8042 node */
 	ndi_devi_alloc_sleep(isa_dip, "i8042",
-	    (dnode_t)DEVI_SID_NODEID, &xdip);
+	    (pnode_t)DEVI_SID_NODEID, &xdip);
 	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, xdip,
 	    "reg", (int *)i8042_regs, 6);
 	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE, xdip,
diff --git a/usr/src/uts/i86pc/io/pci/pci_boot.c b/usr/src/uts/i86pc/io/pci/pci_boot.c
index d0179054f63b..19ee4071e29b 100644
--- a/usr/src/uts/i86pc/io/pci/pci_boot.c
+++ b/usr/src/uts/i86pc/io/pci/pci_boot.c
@@ -163,7 +163,7 @@ create_root_bus_dip(uchar_t bus)
 	ASSERT(pci_bus_res[bus].par_bus == (uchar_t)-1);
 
 	ndi_devi_alloc_sleep(ddi_root_node(), "pci",
-	    (dnode_t)DEVI_SID_NODEID, &dip);
+	    (pnode_t)DEVI_SID_NODEID, &dip);
 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, dip,
 	    "device_type", "pci");
 	(void) ndi_prop_update_int(DDI_DEV_T_NONE, dip,
@@ -645,12 +645,12 @@ new_func_pci(uchar_t bus, uchar_t dev, uchar_t func, uchar_t header,
 
 		/* allocate two child nodes */
 		ndi_devi_alloc_sleep(dip, "ide",
-		    (dnode_t)DEVI_SID_NODEID, &cdip);
+		    (pnode_t)DEVI_SID_NODEID, &cdip);
 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cdip,
 		    "reg", 0);
 		(void) ndi_devi_bind_driver(cdip, 0);
 		ndi_devi_alloc_sleep(dip, "ide",
-		    (dnode_t)DEVI_SID_NODEID, &cdip);
+		    (pnode_t)DEVI_SID_NODEID, &cdip);
 		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cdip,
 		    "reg", 1);
 		(void) ndi_devi_bind_driver(cdip, 0);
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index 562d5cd5a6bc..80802e61f8f4 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -2327,7 +2327,7 @@ add_cpunode2devtree(processorid_t cpu_id, struct cpuid_info *cpi)
 	 */
 	if (cpu_nex_devi == NULL) {
 		if (ndi_devi_alloc(ddi_root_node(), "cpus",
-		    (dnode_t)DEVI_SID_NODEID, &cpu_nex_devi) != NDI_SUCCESS) {
+		    (pnode_t)DEVI_SID_NODEID, &cpu_nex_devi) != NDI_SUCCESS) {
 			mutex_exit(&cpu_node_lock);
 			return;
 		}
diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c
index 748b2c968b94..5168f076f2b3 100644
--- a/usr/src/uts/i86pc/os/ddi_impl.c
+++ b/usr/src/uts/i86pc/os/ddi_impl.c
@@ -246,7 +246,7 @@ status_okay(int id, char *buf, int buflen)
 	 * NB: proplen, if it's a string, includes the NULL in the
 	 * the size of the property, and fail_len does not.
 	 */
-	proplen = prom_getproplen((dnode_t)id, (caddr_t)status);
+	proplen = prom_getproplen((pnode_t)id, (caddr_t)status);
 	if (proplen <= fail_len)	/* nonexistant or uninteresting len */
 		return (1);
 
@@ -266,7 +266,7 @@ status_okay(int id, char *buf, int buflen)
 	 * a buffer was passed in and the caller wants to print the
 	 * value, but the buffer was too small).
 	 */
-	(void) prom_bounded_getprop((dnode_t)id, (caddr_t)status,
+	(void) prom_bounded_getprop((pnode_t)id, (caddr_t)status,
 	    (caddr_t)bufp, len);
 	*(bufp + len - 1) = (char)0;
 
@@ -349,11 +349,11 @@ getlongprop_buf(int id, char *name, char *buf, int maxlen)
 {
 	int size;
 
-	size = prom_getproplen((dnode_t)id, name);
+	size = prom_getproplen((pnode_t)id, name);
 	if (size <= 0 || (size > maxlen - 1))
 		return (-1);
 
-	if (-1 == prom_getprop((dnode_t)id, name, buf))
+	if (-1 == prom_getprop((pnode_t)id, name, buf))
 		return (-1);
 
 	if (strcmp("name", name) == 0) {
@@ -2091,7 +2091,7 @@ impl_setup_ddi(void)
 	int err;
 
 	ndi_devi_alloc_sleep(ddi_root_node(), "ramdisk",
-	    (dnode_t)DEVI_SID_NODEID, &xdip);
+	    (pnode_t)DEVI_SID_NODEID, &xdip);
 
 	(void) BOP_GETPROP(bootops,
 	    "ramdisk_start", (void *)&ramdisk_start);
@@ -2109,7 +2109,7 @@ impl_setup_ddi(void)
 
 	/* isa node */
 	ndi_devi_alloc_sleep(ddi_root_node(), "isa",
-	    (dnode_t)DEVI_SID_NODEID, &isa_dip);
+	    (pnode_t)DEVI_SID_NODEID, &isa_dip);
 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, isa_dip,
 	    "device_type", "isa");
 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, isa_dip,
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index a99ee6c7528e..858fa4b52869 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -395,7 +395,7 @@ SCHED_KMODS	+= IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL
 #	File System Modules (/kernel/fs):
 #
 FS_KMODS	+= autofs cachefs ctfs devfs fdfs fifofs hsfs lofs
-FS_KMODS	+= mntfs namefs nfs objfs
+FS_KMODS	+= mntfs namefs nfs objfs zfs
 FS_KMODS	+= pcfs procfs sockfs specfs tmpfs udfs ufs xmemfs
 
 #
diff --git a/usr/src/uts/intel/os/minor_perm b/usr/src/uts/intel/os/minor_perm
index c4e62dd8e5ce..d60196d570c1 100644
--- a/usr/src/uts/intel/os/minor_perm
+++ b/usr/src/uts/intel/os/minor_perm
@@ -113,3 +113,5 @@ bmc:bmc 0666 root sys
 dld:* 0666 root sys
 aggr:* 0666 root sys
 smbios:smbios 0444 root sys
+zfs:* 0600 root sys
+zfs:zfs 0666 root sys
diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major
index 14e6682730e2..ce24d9f92cf3 100644
--- a/usr/src/uts/intel/os/name_to_major
+++ b/usr/src/uts/intel/os/name_to_major
@@ -115,4 +115,5 @@ dld 178
 aggr 179
 smbios 180
 power 181
+zfs 182
 did 239
diff --git a/usr/src/uts/intel/promif/prom_emul.c b/usr/src/uts/intel/promif/prom_emul.c
index 156e4872d163..5497d9eab846 100644
--- a/usr/src/uts/intel/promif/prom_emul.c
+++ b/usr/src/uts/intel/promif/prom_emul.c
@@ -34,7 +34,7 @@
 
 static prom_node_t *promif_top;
 
-static prom_node_t *promif_find_node(dnode_t nodeid);
+static prom_node_t *promif_find_node(pnode_t nodeid);
 static int getproplen(prom_node_t *pnp, char *name);
 static void *getprop(prom_node_t *pnp, char *name);
 
@@ -137,7 +137,7 @@ promif_create_device_tree(void)
 }
 
 static prom_node_t *
-find_node_work(prom_node_t *pnp, dnode_t n)
+find_node_work(prom_node_t *pnp, pnode_t n)
 {
 	prom_node_t *qnp;
 
@@ -156,7 +156,7 @@ find_node_work(prom_node_t *pnp, dnode_t n)
 }
 
 static prom_node_t *
-promif_find_node(dnode_t nodeid)
+promif_find_node(pnode_t nodeid)
 {
 	if (nodeid == OBP_NONODE)
 		return (promif_top);
@@ -167,8 +167,8 @@ promif_find_node(dnode_t nodeid)
 	return (find_node_work(promif_top, nodeid));
 }
 
-dnode_t
-promif_nextnode(dnode_t nodeid)
+pnode_t
+promif_nextnode(pnode_t nodeid)
 {
 	prom_node_t *pnp;
 
@@ -184,8 +184,8 @@ promif_nextnode(dnode_t nodeid)
 	return (OBP_NONODE);
 }
 
-dnode_t
-promif_childnode(dnode_t nodeid)
+pnode_t
+promif_childnode(pnode_t nodeid)
 {
 	prom_node_t *pnp;
 
@@ -213,7 +213,7 @@ getproplen(prom_node_t *pnp, char *name)
 }
 
 int
-promif_getproplen(dnode_t nodeid, char *name)
+promif_getproplen(pnode_t nodeid, char *name)
 {
 	prom_node_t *pnp;
 
@@ -237,7 +237,7 @@ getprop(prom_node_t *pnp, char *name)
 }
 
 int
-promif_getprop(dnode_t nodeid, char *name, void *value)
+promif_getprop(pnode_t nodeid, char *name, void *value)
 {
 	prom_node_t *pnp;
 	void *v;
@@ -276,7 +276,7 @@ nextprop(prom_node_t *pnp, char *name)
 }
 
 char *
-promif_nextprop(dnode_t nodeid, char *name, char *next)
+promif_nextprop(pnode_t nodeid, char *name, char *next)
 {
 	prom_node_t *pnp;
 	char *s;
diff --git a/usr/src/uts/intel/promif/prom_node.c b/usr/src/uts/intel/promif/prom_node.c
index 1011d8f1bb89..5001f59da344 100644
--- a/usr/src/uts/intel/promif/prom_node.c
+++ b/usr/src/uts/intel/promif/prom_node.c
@@ -41,22 +41,22 @@
  * Return the root nodeid.
  * Calling prom_nextnode(0) returns the root nodeid.
  */
-dnode_t
+pnode_t
 prom_rootnode(void)
 {
-	static dnode_t rootnode;
+	static pnode_t rootnode;
 
 	return (rootnode ? rootnode : (rootnode = prom_nextnode(OBP_NONODE)));
 }
 
-dnode_t
-prom_nextnode(dnode_t nodeid)
+pnode_t
+prom_nextnode(pnode_t nodeid)
 {
 	return (promif_nextnode(nodeid));
 }
 
-dnode_t
-prom_childnode(dnode_t nodeid)
+pnode_t
+prom_childnode(pnode_t nodeid)
 {
 
 	return (promif_childnode(nodeid));
@@ -66,32 +66,32 @@ prom_childnode(dnode_t nodeid)
  * disallow searching
  */
 /*ARGSUSED*/
-dnode_t
-prom_findnode_byname(dnode_t n, char *name)
+pnode_t
+prom_findnode_byname(pnode_t n, char *name)
 {
 	return (OBP_NONODE);
 }
 
-dnode_t
+pnode_t
 prom_chosennode(void)
 {
 	return (OBP_NONODE);
 }
 
-dnode_t
+pnode_t
 prom_optionsnode(void)
 {
 	return (OBP_NONODE);
 }
 
 /*ARGSUSED*/
-dnode_t
+pnode_t
 prom_finddevice(char *path)
 {
 	return (OBP_BADNODE);
 }
 
-dnode_t
+pnode_t
 prom_alias_node(void)
 {
 	return (OBP_BADNODE);
diff --git a/usr/src/uts/intel/promif/prom_prop.c b/usr/src/uts/intel/promif/prom_prop.c
index 8c0c6b437e6f..83accc699b06 100644
--- a/usr/src/uts/intel/promif/prom_prop.c
+++ b/usr/src/uts/intel/promif/prom_prop.c
@@ -35,19 +35,19 @@
 #include <sys/prom_emul.h>
 
 int
-prom_getproplen(dnode_t nodeid, caddr_t name)
+prom_getproplen(pnode_t nodeid, caddr_t name)
 {
 	return (promif_getproplen(nodeid, name));
 }
 
 int
-prom_getprop(dnode_t nodeid, caddr_t name, caddr_t value)
+prom_getprop(pnode_t nodeid, caddr_t name, caddr_t value)
 {
 	return (promif_getprop(nodeid, name, value));
 }
 
 caddr_t
-prom_nextprop(dnode_t nodeid, caddr_t previous, caddr_t next)
+prom_nextprop(pnode_t nodeid, caddr_t previous, caddr_t next)
 {
 	return (promif_nextprop(nodeid, previous, next));
 }
@@ -70,7 +70,7 @@ prom_decode_composite_string(void *buf, size_t buflen, char *prev)
 
 /*ARGSUSED*/
 int
-prom_bounded_getprop(dnode_t nodeid, caddr_t name, caddr_t value, int len)
+prom_bounded_getprop(pnode_t nodeid, caddr_t name, caddr_t value, int len)
 {
 	return (-1);
 }
diff --git a/usr/src/uts/intel/sys/obpdefs.h b/usr/src/uts/intel/sys/obpdefs.h
index 66b5336a4392..9de84210160c 100644
--- a/usr/src/uts/intel/sys/obpdefs.h
+++ b/usr/src/uts/intel/sys/obpdefs.h
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1991-1994,1999 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef	_SYS_OBPDEFS_H
@@ -40,14 +40,14 @@ extern "C" {
 
 typedef int	ihandle_t;		/* 1275 device instance number */
 typedef int	phandle_t;		/* 1275 device tree node ptr */
-typedef	phandle_t dnode_t;
+typedef	phandle_t pnode_t;
 
 /*
  * Device type matching
  */
 
-#define	OBP_NONODE	((dnode_t)0)
-#define	OBP_BADNODE	((dnode_t)-1)
+#define	OBP_NONODE	((pnode_t)0)
+#define	OBP_BADNODE	((pnode_t)-1)
 
 /*
  * Property Defines
diff --git a/usr/src/uts/intel/sys/prom_emul.h b/usr/src/uts/intel/sys/prom_emul.h
index a58ad2af2eb1..fd5930bf34af 100644
--- a/usr/src/uts/intel/sys/prom_emul.h
+++ b/usr/src/uts/intel/sys/prom_emul.h
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1995-1999 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef	_SYS_PROM_EMUL_H
@@ -49,7 +49,7 @@ struct prom_prop {
  * of the PROM device tree.
  */
 struct prom_node {
-	dnode_t	pn_nodeid;
+	pnode_t	pn_nodeid;
 	struct prom_prop *pn_propp;
 	struct prom_node *pn_child;
 	struct prom_node *pn_sibling;
@@ -62,22 +62,22 @@ typedef struct prom_node prom_node_t;
  */
 extern void promif_create_device_tree(void);
 
-extern dnode_t promif_findnode_byname(dnode_t n, char *name);
-extern dnode_t promif_nextnode(dnode_t n);
-extern dnode_t promif_childnode(dnode_t n);
+extern pnode_t promif_findnode_byname(pnode_t n, char *name);
+extern pnode_t promif_nextnode(pnode_t n);
+extern pnode_t promif_childnode(pnode_t n);
 
-extern int promif_getproplen(dnode_t n, char *name);
-extern int promif_getprop(dnode_t n,  char *name, void *value);
-extern int promif_bounded_getprop(dnode_t, char *name, void *value, int len);
-char *promif_nextprop(dnode_t n, char *previous, char *next);
+extern int promif_getproplen(pnode_t n, char *name);
+extern int promif_getprop(pnode_t n,  char *name, void *value);
+extern int promif_bounded_getprop(pnode_t, char *name, void *value, int len);
+char *promif_nextprop(pnode_t n, char *previous, char *next);
 
 /*
  * XXX: The following functions are unsafe and unecessary, and should be
  * XXX: removed. OS created nodes belong in the OS copy of the device tree.
  * XXX: The OS should not be creating nodes in the prom's device tree!
  */
-extern dnode_t promif_add_child(dnode_t parent, dnode_t child, char *name);
-extern void promif_create_prop_external(dnode_t, char *name, void *, int);
+extern pnode_t promif_add_child(pnode_t parent, pnode_t child, char *name);
+extern void promif_create_prop_external(pnode_t, char *name, void *, int);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/uts/intel/sys/promif.h b/usr/src/uts/intel/sys/promif.h
index 2650491c7d3a..20ce5c2db1ab 100644
--- a/usr/src/uts/intel/sys/promif.h
+++ b/usr/src/uts/intel/sys/promif.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -65,16 +65,16 @@ extern	void		prom_free(caddr_t virt, uint_t size);
 /*
  * Device tree and property group: OBP and IEEE 1275-1994.
  */
-extern	dnode_t		prom_childnode(dnode_t nodeid);
-extern	dnode_t		prom_nextnode(dnode_t nodeid);
-extern	dnode_t		prom_optionsnode(void);
-extern	dnode_t		prom_alias_node(void);
-extern	dnode_t		prom_rootnode(void);
+extern	pnode_t		prom_childnode(pnode_t nodeid);
+extern	pnode_t		prom_nextnode(pnode_t nodeid);
+extern	pnode_t		prom_optionsnode(void);
+extern	pnode_t		prom_alias_node(void);
+extern	pnode_t		prom_rootnode(void);
 
-extern	int		prom_getproplen(dnode_t nodeid, caddr_t name);
-extern	int		prom_getprop(dnode_t nodeid, caddr_t name,
+extern	int		prom_getproplen(pnode_t nodeid, caddr_t name);
+extern	int		prom_getprop(pnode_t nodeid, caddr_t name,
 			    caddr_t value);
-extern	caddr_t		prom_nextprop(dnode_t nodeid, caddr_t previous,
+extern	caddr_t		prom_nextprop(pnode_t nodeid, caddr_t previous,
 			    caddr_t next);
 
 extern	char		*prom_decode_composite_string(void *buf,
@@ -83,9 +83,9 @@ extern	char		*prom_decode_composite_string(void *buf,
 /*
  * Device tree and property group: IEEE 1275-1994 Only.
  */
-extern	dnode_t		prom_finddevice(char *path);
+extern	pnode_t		prom_finddevice(char *path);
 
-extern	int		prom_bounded_getprop(dnode_t nodeid,
+extern	int		prom_bounded_getprop(pnode_t nodeid,
 			    caddr_t name, caddr_t buffer, int buflen);
 
 /*
@@ -192,7 +192,7 @@ extern	char		*prom_vsprintf(char *s, const char *fmt, __va_list adx)
  * promif tree searching routines ... OBP and IEEE 1275-1994.
  */
 
-extern	dnode_t		prom_findnode_byname(dnode_t id, char *name);
+extern	pnode_t		prom_findnode_byname(pnode_t id, char *name);
 extern	char		*prom_get_extend_name(void);
 
 extern	int		prom_devreset(int);
diff --git a/usr/src/uts/intel/zfs/Makefile b/usr/src/uts/intel/zfs/Makefile
new file mode 100644
index 000000000000..84eeec044a11
--- /dev/null
+++ b/usr/src/uts/intel/zfs/Makefile
@@ -0,0 +1,100 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the zfs file system
+#	kernel module.
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+ARCHDIR:sh = cd ..; basename `pwd`
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= zfs
+OBJECTS		= $(ZFS_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(ZFS_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
+ROOTLINK	= $(ROOT_FS_DIR)/$(MODULE)
+CONF_SRCDIR	= $(UTSBASE)/common/fs/zfs
+
+#
+#	Include common rules.
+#
+include ../Makefile.$(ARCHDIR)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
+
+#
+#	Overrides and depends_on
+#
+MODSTUBS_DIR	 = $(OBJS_DIR)
+LDFLAGS		+= -dy -Nfs/specfs -Ndrv/random
+
+INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
+INC_PATH	+= -I$(SRC)/common
+INC_PATH	+= -I$(COMMONBASE)/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+$(ROOTLINK):	$(ROOT_FS_DIR) $(ROOTMODULE)
+	-$(RM) $@; ln $(ROOTMODULE) $@
+
+#
+#	Include common targets.
+#
+include ../Makefile.targ
diff --git a/usr/src/uts/req.flg b/usr/src/uts/req.flg
index 55aad336bfcc..b0d3f87d8619 100644
--- a/usr/src/uts/req.flg
+++ b/usr/src/uts/req.flg
@@ -48,3 +48,5 @@ find_files "s.*" usr/src/cmd/cmd-crypto/etc/keys
 find_files "s.*" usr/src/common/ipf
 find_files "s.*" usr/src/common/mdesc
 find_files "s.*" usr/src/common/fs
+find_files "s.*" usr/src/common/acl
+find_files "s.*" usr/src/common/zfs
diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc
index eef8510dfc90..ff057dd7c540 100644
--- a/usr/src/uts/sparc/Makefile.sparc
+++ b/usr/src/uts/sparc/Makefile.sparc
@@ -294,7 +294,7 @@ SCHED_KMODS	+= RT TS RT_DPTBL TS_DPTBL IA FSS FX FX_DPTBL
 #
 #	File System Modules (/kernel/fs):
 #
-FS_KMODS	+= devfs fdfs fifofs hsfs lofs namefs nfs pcfs tmpfs
+FS_KMODS	+= devfs fdfs fifofs hsfs lofs namefs nfs pcfs tmpfs zfs
 FS_KMODS	+= specfs udfs ufs autofs cachefs procfs sockfs mntfs
 FS_KMODS	+= ctfs objfs
 
diff --git a/usr/src/uts/sparc/os/cpr_sparc.c b/usr/src/uts/sparc/os/cpr_sparc.c
index e9b802b5bee3..ebc452b8fe50 100644
--- a/usr/src/uts/sparc/os/cpr_sparc.c
+++ b/usr/src/uts/sparc/os/cpr_sparc.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -67,13 +67,13 @@ static cdef_t orig_def_info = {
 
 static char *cpr_next_component(char **);
 static char *cpr_get_prefix(char *);
-static char *cpr_build_nodename(dnode_t);
+static char *cpr_build_nodename(pnode_t);
 static void cpr_abbreviate_devpath(char *, char *);
 static int cpr_show_props = 0;
 
 
 static int
-cpr_get_options_node(dnode_t *nodep)
+cpr_get_options_node(pnode_t *nodep)
 {
 	*nodep = prom_optionsnode();
 	if (*nodep == OBP_NONODE || *nodep == OBP_BADNODE) {
@@ -93,7 +93,7 @@ static int
 cpr_get_bool_prop(char *name, int *result)
 {
 	char value[PROP_BOOL_LEN];
-	dnode_t node;
+	pnode_t node;
 	int len, err;
 
 	if (err = cpr_get_options_node(&node))
@@ -116,7 +116,7 @@ int
 cpr_update_nvram(cprop_t *props)
 {
 	cprop_t *tail;
-	dnode_t node;
+	pnode_t node;
 	int len, rc;
 
 	if (rc = cpr_get_options_node(&node))
@@ -258,7 +258,7 @@ cpr_default_setup(int alloc)
 {
 	cprop_t *orig, *new, *tail;
 	int len, err = 0;
-	dnode_t node;
+	pnode_t node;
 	char *fmt;
 
 	if (alloc == 0) {
@@ -348,7 +348,7 @@ cpr_spinning_bar(void)
 static void
 cpr_abbreviate_devpath(char *in_path, char *out_path)
 {
-	static dnode_t cur_node;
+	static pnode_t cur_node;
 	char *position = in_path + 1;	/* Skip the leading slash. */
 	char *cmpt;
 
@@ -356,8 +356,8 @@ cpr_abbreviate_devpath(char *in_path, char *out_path)
 	*out_path = '\0';
 
 	while ((cmpt = cpr_next_component(&position)) != NULL) {
-		dnode_t long_match = NULL;
-		dnode_t short_match = NULL;
+		pnode_t long_match = NULL;
+		pnode_t short_match = NULL;
 		int short_hits = 0;
 		char *name;
 		char *prefix = cpr_get_prefix(cmpt);
@@ -461,7 +461,7 @@ cpr_get_prefix(char *cmpt)
  * from the first two (binary) words of the "reg" property.
  */
 static char *
-cpr_build_nodename(dnode_t node)
+cpr_build_nodename(pnode_t node)
 {
 	static char	name[OBP_MAXPATHLEN];
 	int		reg[512];
diff --git a/usr/src/uts/sparc/os/minor_perm b/usr/src/uts/sparc/os/minor_perm
index 587b0cfc4e69..e46f3718ff77 100644
--- a/usr/src/uts/sparc/os/minor_perm
+++ b/usr/src/uts/sparc/os/minor_perm
@@ -162,3 +162,5 @@ mdesc:* 0666 root sys
 dld:* 0666 root sys
 aggr:* 0666 root sys
 ntwdt:* 0644 root sys
+zfs:* 0600 root sys
+zfs:zfs 0666 root sys
diff --git a/usr/src/uts/sparc/os/name_to_major b/usr/src/uts/sparc/os/name_to_major
index 5ac6715971bf..277aeff8efce 100644
--- a/usr/src/uts/sparc/os/name_to_major
+++ b/usr/src/uts/sparc/os/name_to_major
@@ -200,3 +200,4 @@ mi2cv 249
 todds1337 250
 pic16f747 251
 mdesc 252
+zfs 253
diff --git a/usr/src/uts/sparc/v9/sys/prom_isa.h b/usr/src/uts/sparc/v9/sys/prom_isa.h
index 12a739987fb9..c2f3cb4c4fc7 100644
--- a/usr/src/uts/sparc/v9/sys/prom_isa.h
+++ b/usr/src/uts/sparc/v9/sys/prom_isa.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,7 +56,7 @@ typedef	unsigned long long cell_t;
 #define	p1275_uint2cell(u)	((cell_t)((unsigned int)(u)))
 #define	p1275_size2cell(u)	((cell_t)((size_t)(u)))
 #define	p1275_phandle2cell(ph)	((cell_t)((unsigned int)((phandle_t)(ph))))
-#define	p1275_dnode2cell(d)	((cell_t)((unsigned int)((dnode_t)(d))))
+#define	p1275_dnode2cell(d)	((cell_t)((unsigned int)((pnode_t)(d))))
 #define	p1275_ihandle2cell(ih)	((cell_t)((unsigned int)((ihandle_t)(ih))))
 #define	p1275_ull2cell_high(ll)	(0LL)
 #define	p1275_ull2cell_low(ll)	((cell_t)(ll))
@@ -67,7 +67,7 @@ typedef	unsigned long long cell_t;
 #define	p1275_cell2uint(u)	((unsigned int)((cell_t)(u)))
 #define	p1275_cell2size(u)	((size_t)((cell_t)(u)))
 #define	p1275_cell2phandle(ph)	((phandle_t)((cell_t)(ph)))
-#define	p1275_cell2dnode(d)	((dnode_t)((cell_t)(d)))
+#define	p1275_cell2dnode(d)	((pnode_t)((cell_t)(d)))
 #define	p1275_cell2ihandle(ih)	((ihandle_t)((cell_t)(ih)))
 #define	p1275_cells2ull(h, l)	((unsigned long long)(cell_t)(l))
 #define	p1275_cell2uintptr(i)	((uintptr_t)((cell_t)(i)))
diff --git a/usr/src/uts/sparc/zfs/Makefile b/usr/src/uts/sparc/zfs/Makefile
new file mode 100644
index 000000000000..84eeec044a11
--- /dev/null
+++ b/usr/src/uts/sparc/zfs/Makefile
@@ -0,0 +1,100 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the zfs file system
+#	kernel module.
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+ARCHDIR:sh = cd ..; basename `pwd`
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= zfs
+OBJECTS		= $(ZFS_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(ZFS_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_DRV_DIR)/$(MODULE)
+ROOTLINK	= $(ROOT_FS_DIR)/$(MODULE)
+CONF_SRCDIR	= $(UTSBASE)/common/fs/zfs
+
+#
+#	Include common rules.
+#
+include ../Makefile.$(ARCHDIR)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY) $(SRC_CONFILE)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
+
+#
+#	Overrides and depends_on (ld -N below: fs/specfs, drv/random)
+#
+MODSTUBS_DIR	 = $(OBJS_DIR)
+LDFLAGS		+= -dy -Nfs/specfs -Ndrv/random
+
+INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
+INC_PATH	+= -I$(SRC)/common
+INC_PATH	+= -I$(COMMONBASE)/zfs
+
+C99MODE=	-xc99=%all
+C99LMODE=	-Xc99=%all
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+$(ROOTLINK):	$(ROOT_FS_DIR) $(ROOTMODULE)
+	-$(RM) $@; ln $(ROOTMODULE) $@
+
+#
+#	Include common targets.
+#
+include ../Makefile.targ
diff --git a/usr/src/uts/sun/io/ttymux/ttymux_ioctl.c b/usr/src/uts/sun/io/ttymux/ttymux_ioctl.c
index e3aa089ced21..dd03a8a84229 100644
--- a/usr/src/uts/sun/io/ttymux/ttymux_ioctl.c
+++ b/usr/src/uts/sun/io/ttymux/ttymux_ioctl.c
@@ -126,7 +126,7 @@ mblk2assoc(mblk_t *mp, ttymux_assoc_t *assoc)
  * Given a device path return an OBP alias for it if it exists.
  */
 static char *
-val2alias(dnode_t node, char *path)
+val2alias(pnode_t node, char *path)
 {
 	char *buf1;
 	char *buf2;
@@ -291,7 +291,7 @@ device_fini_impl(sm_mux_state_t *ms, sm_console_t *cn, sm_lqi_t *plqi)
 }
 
 static int
-read_prop(dnode_t node, char *propname, char **propval)
+read_prop(pnode_t node, char *propname, char **propval)
 {
 	int	proplen = prom_getproplen(node, propname);
 
@@ -347,7 +347,7 @@ sm_strtok_r(char *p, char *sep, char **lasts)
 static int
 upd_config(boolean_t append, char *pname, char *path)
 {
-	dnode_t		onode, anode;
+	pnode_t		onode, anode;
 	size_t		plen;		/* length of property name */
 	char		*pval;		/* value of property */
 	char		*tok, *lasts;
diff --git a/usr/src/uts/sun/sys/bootconf.h b/usr/src/uts/sun/sys/bootconf.h
index d7143836a948..40e7d25facc8 100644
--- a/usr/src/uts/sun/sys/bootconf.h
+++ b/usr/src/uts/sun/sys/bootconf.h
@@ -171,7 +171,7 @@ typedef unsigned long long boot_cell_t;
 #define	boot_offt2cell(u)	((boot_cell_t)((off_t)(u)))
 #define	boot_size2cell(u)	((boot_cell_t)((size_t)(u)))
 #define	boot_phandle2cell(ph)	((boot_cell_t)((unsigned)((phandle_t)(ph))))
-#define	boot_dnode2cell(d)	((boot_cell_t)((unsigned)((dnode_t)(d))))
+#define	boot_dnode2cell(d)	((boot_cell_t)((unsigned)((pnode_t)(d))))
 #define	boot_ihandle2cell(ih)	((boot_cell_t)((unsigned)((ihandle_t)(ih))))
 
 #define	boot_cell2ptr(p)	((void *)(uintptr_t)((boot_cell_t)(p)))
@@ -181,7 +181,7 @@ typedef unsigned long long boot_cell_t;
 #define	boot_cell2offt(u)	((off_t)((boot_cell_t)(u)))
 #define	boot_cell2size(u)	((size_t)((boot_cell_t)(u)))
 #define	boot_cell2phandle(ph)	((phandle_t)((boot_cell_t)(ph)))
-#define	boot_cell2dnode(d)	((dnode_t)((boot_cell_t)(d)))
+#define	boot_cell2dnode(d)	((pnode_t)((boot_cell_t)(d)))
 #define	boot_cell2ihandle(ih)	((ihandle_t)((boot_cell_t)(ih)))
 #define	boot_cells2ull(h, l)	((unsigned long long)(boot_cell_t)(l))
 
diff --git a/usr/src/uts/sun/sys/dada/impl/identify.h b/usr/src/uts/sun/sys/dada/impl/identify.h
index d394d8833c01..fe52d7ea2142 100644
--- a/usr/src/uts/sun/sys/dada/impl/identify.h
+++ b/usr/src/uts/sun/sys/dada/impl/identify.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -71,11 +71,15 @@ struct	dcd_identify {
 	ushort_t dcd_recmwdma;	/*  66  rec multi-word dma cycle info    */
 	ushort_t dcd_minpio;	/*  67  min PIO cycle info */
 	ushort_t dcd_minpioflow;	/*  68  min PIO cycle info w/flow ctl */
-	ushort_t dcd_padding1[19]; /* 69 pad to 87 */
+	ushort_t dcd_padding1[11];	/* 69 pad to 79 */
+	ushort_t dcd_majvers;	/*  80  ATA major version supported */
+	ushort_t dcd_padding2[4];	/* 81 pad to 84 */
+	ushort_t dcd_features85;	/*  85  feature enabled bits */
+	ushort_t dcd_padding3[2];	/* 86 pad to 87 */
 	ushort_t dcd_ultra_dma;	/*  88	Ultra dma capability */
-	ushort_t dcd_padding2[37]; /* 89 pad to 125 */
+	ushort_t dcd_padding4[37];	/* 89 pad to 125 */
 	ushort_t dcd_lastlun;	/* 126 last logical unit number */
-	ushort_t dcd_padding3[129];	/* pad to 255 */
+	ushort_t dcd_padding5[129];	/* pad to 255 */
 };
 
 
@@ -97,6 +101,16 @@ struct	dcd_identify {
 #define	PIO_MODE4_MASK		0x02
 #define	PIO_MODE3_MASK		0x01
 
+/*
+ * The following are bits for dcd_majvers, word 80
+ */
+#define	IDENTIFY_80_ATAPI_4	0x0010
+
+/*
+ * The following are the bits for dcd_features85, word 85
+ */
+#define	IDENTIFY_85_WCE		(1 << 5)
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/usr/src/uts/sun/sys/obpdefs.h b/usr/src/uts/sun/sys/obpdefs.h
index 2689ba7d6df5..8a975c86569c 100644
--- a/usr/src/uts/sun/sys/obpdefs.h
+++ b/usr/src/uts/sun/sys/obpdefs.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1991-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,14 +37,14 @@ extern "C" {
 
 typedef	int		ihandle_t;
 typedef	int		phandle_t;
-typedef	phandle_t	dnode_t;
+typedef	phandle_t	pnode_t;
 
 /*
  * Device type matching
  */
 
-#define	OBP_NONODE	((dnode_t)0)
-#define	OBP_BADNODE	((dnode_t)-1)
+#define	OBP_NONODE	((pnode_t)0)
+#define	OBP_BADNODE	((pnode_t)-1)
 
 /*
  * Property Defines
diff --git a/usr/src/uts/sun/sys/promif.h b/usr/src/uts/sun/sys/promif.h
index eecd400f712b..b07835fa08dc 100644
--- a/usr/src/uts/sun/sys/promif.h
+++ b/usr/src/uts/sun/sys/promif.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -86,13 +86,13 @@ extern	void		prom_free(caddr_t virt, size_t size);
 /*
  * Device tree and property group: OBP and IEEE 1275-1994.
  */
-extern	dnode_t		prom_childnode(dnode_t nodeid);
-extern	dnode_t		prom_nextnode(dnode_t nodeid);
-extern	dnode_t		prom_parentnode(dnode_t nodeid);
-extern	dnode_t		prom_rootnode(void);
-extern	dnode_t		prom_chosennode(void);
-extern	dnode_t		prom_alias_node(void);
-extern	dnode_t		prom_optionsnode(void);
+extern	pnode_t		prom_childnode(pnode_t nodeid);
+extern	pnode_t		prom_nextnode(pnode_t nodeid);
+extern	pnode_t		prom_parentnode(pnode_t nodeid);
+extern	pnode_t		prom_rootnode(void);
+extern	pnode_t		prom_chosennode(void);
+extern	pnode_t		prom_alias_node(void);
+extern	pnode_t		prom_optionsnode(void);
 
 extern	int		prom_asr_list_keys_len();
 extern	int		prom_asr_list_keys(caddr_t value);
@@ -102,16 +102,16 @@ extern	int		prom_asr_disable(char *keystr, int keystr_len,
 			    char *reason, int reason_len);
 extern	int		prom_asr_enable(char *keystr, int keystr_len);
 
-extern	int		prom_getproplen(dnode_t nodeid, caddr_t name);
-extern	int		prom_getprop(dnode_t nodeid, caddr_t name,
+extern	int		prom_getproplen(pnode_t nodeid, caddr_t name);
+extern	int		prom_getprop(pnode_t nodeid, caddr_t name,
 			    caddr_t value);
-extern	caddr_t		prom_nextprop(dnode_t nodeid, caddr_t previous,
+extern	caddr_t		prom_nextprop(pnode_t nodeid, caddr_t previous,
 			    caddr_t next);
-extern	int		prom_setprop(dnode_t nodeid, caddr_t name,
+extern	int		prom_setprop(pnode_t nodeid, caddr_t name,
 			    caddr_t value, int len);
 
-extern	int		prom_getnode_byname(dnode_t id, char *name);
-extern	int		prom_devicetype(dnode_t id, char *type);
+extern	int		prom_getnode_byname(pnode_t id, char *name);
+extern	int		prom_devicetype(pnode_t id, char *type);
 
 extern	char		*prom_decode_composite_string(void *buf,
 			    size_t buflen, char *prev);
@@ -119,9 +119,9 @@ extern	char		*prom_decode_composite_string(void *buf,
 /*
  * Device tree and property group: IEEE 1275-1994 Only.
  */
-extern	dnode_t		prom_finddevice(char *path);	/* Also on obp2.x */
+extern	pnode_t		prom_finddevice(char *path);	/* Also on obp2.x */
 
-extern	int		prom_bounded_getprop(dnode_t nodeid,
+extern	int		prom_bounded_getprop(pnode_t nodeid,
 			    caddr_t name, caddr_t buffer, int buflen);
 
 extern	phandle_t	prom_getphandle(ihandle_t i);
@@ -148,8 +148,8 @@ extern	int		prom_phandle_to_path(phandle_t, char *buf,
  */
 extern	ihandle_t	prom_stdin_ihandle(void);
 extern	ihandle_t	prom_stdout_ihandle(void);
-extern	dnode_t		prom_stdin_node(void);
-extern	dnode_t		prom_stdout_node(void);
+extern	pnode_t		prom_stdin_node(void);
+extern	pnode_t		prom_stdout_node(void);
 extern	char		*prom_stdinpath(void);
 extern	char		*prom_stdoutpath(void);
 extern	int		prom_stdin_devname(char *buffer);
@@ -179,7 +179,7 @@ extern	int		prom_getversion(void);
 extern	int		prom_is_openprom(void);
 extern	int		prom_is_p1275(void);
 extern	int		prom_version_name(char *buf, int buflen);
-extern	int		prom_version_check(char *buf, size_t len, dnode_t *n);
+extern	int		prom_version_check(char *buf, size_t len, pnode_t *n);
 
 extern	void		*prom_mon_id(void);	/* SMCC/OBP platform centric */
 
@@ -223,7 +223,7 @@ extern void		prom_set_symbol_lookup(void *sym2val, void *val2sym);
  * Administrative group: IEEE 1275 only.
  */
 extern	int		prom_test(char *service);
-extern	int		prom_test_method(char *method, dnode_t node);
+extern	int		prom_test_method(char *method, pnode_t node);
 
 /*
  * Promif support group: Generic.
@@ -264,7 +264,7 @@ extern  ssize_t		prom_write(ihandle_t fd, caddr_t buf, size_t len,
 extern	int		prom_seek(int fd, u_longlong_t offset);
 
 extern	void		prom_writestr(const char *buf, size_t bufsize);
-extern	void		prom_dnode_to_pathname(dnode_t, char *);
+extern	void		prom_pnode_to_pathname(pnode_t, char *);
 
 /*PRINTFLIKE1*/
 extern	void		prom_printf(const char *fmt, ...)
@@ -284,12 +284,12 @@ extern	char		*prom_vsprintf(char *s, const char *fmt, __va_list adx)
 #define	PROM_WALK_CONTINUE	0	/* keep walking to next node */
 #define	PROM_WALK_TERMINATE	1	/* abort walk now */
 
-extern	void		prom_walk_devs(dnode_t node,
-			    int (*f)(dnode_t, void *, void *),
+extern	void		prom_walk_devs(pnode_t node,
+			    int (*f)(pnode_t, void *, void *),
 			    void *arg, void *result);
 
-extern	dnode_t		prom_findnode_byname(dnode_t id, char *name);
-extern	dnode_t		prom_findnode_bydevtype(dnode_t id, char *devtype);
+extern	pnode_t		prom_findnode_byname(pnode_t id, char *name);
+extern	pnode_t		prom_findnode_bydevtype(pnode_t id, char *devtype);
 
 #define	PROM_STOP	{	\
 	prom_printf("File %s line %d\n", __FILE__, __LINE__); \
diff --git a/usr/src/uts/sun4/os/ddi_impl.c b/usr/src/uts/sun4/os/ddi_impl.c
index da2931ba5bdf..64f2914874e5 100644
--- a/usr/src/uts/sun4/os/ddi_impl.c
+++ b/usr/src/uts/sun4/os/ddi_impl.c
@@ -1487,7 +1487,7 @@ configure(void)
 	 * page as a side-effect of devr_next(0) (which prom_nextnode calls),
 	 * so this *must* be executed early on. (XXX This is untrue for sun4u)
 	 */
-	(void) prom_nextnode((dnode_t)0);
+	(void) prom_nextnode((pnode_t)0);
 #endif
 
 	/*
@@ -1550,7 +1550,7 @@ status_okay(int id, char *buf, int buflen)
 	 * NB: proplen, if it's a string, includes the NULL in the
 	 * the size of the property, and fail_len does not.
 	 */
-	proplen = prom_getproplen((dnode_t)id, (caddr_t)status);
+	proplen = prom_getproplen((pnode_t)id, (caddr_t)status);
 	if (proplen <= fail_len)	/* nonexistent or uninteresting len */
 		return (1);
 
@@ -1570,7 +1570,7 @@ status_okay(int id, char *buf, int buflen)
 	 * a buffer was passed in and the caller wants to print the
 	 * value, but the buffer was too small).
 	 */
-	(void) prom_bounded_getprop((dnode_t)id, (caddr_t)status,
+	(void) prom_bounded_getprop((pnode_t)id, (caddr_t)status,
 	    (caddr_t)bufp, len);
 	*(bufp + len - 1) = (char)0;
 
diff --git a/usr/src/uts/sun4/os/mp_startup.c b/usr/src/uts/sun4/os/mp_startup.c
index f97ac1ecf214..1124e2f3fe95 100644
--- a/usr/src/uts/sun4/os/mp_startup.c
+++ b/usr/src/uts/sun4/os/mp_startup.c
@@ -248,9 +248,9 @@ start_cpu(int cpuid, void(*flag_func)(int))
 		    cpuid);
 	} else {
 		/* "by-cpuid" interface didn't exist.  Do it the old way */
-		dnode_t nodeid = cpunodes[cpuid].nodeid;
+		pnode_t nodeid = cpunodes[cpuid].nodeid;
 
-		ASSERT(nodeid != (dnode_t)0);
+		ASSERT(nodeid != (pnode_t)0);
 		(void) prom_startcpu(nodeid, (caddr_t)&cpu_startup, cpuid);
 	}
 
@@ -795,9 +795,9 @@ start_other_cpus(int flag)
 	 * launch all the slave cpus now
 	 */
 	for (cpuid = 0; cpuid < NCPU; cpuid++) {
-		dnode_t nodeid = cpunodes[cpuid].nodeid;
+		pnode_t nodeid = cpunodes[cpuid].nodeid;
 
-		if (nodeid == (dnode_t)0)
+		if (nodeid == (pnode_t)0)
 			continue;
 
 		if (cpuid == bootcpu) {
diff --git a/usr/src/uts/sun4/os/startup.c b/usr/src/uts/sun4/os/startup.c
index 7f3c8f04e700..d12d2388b6e7 100644
--- a/usr/src/uts/sun4/os/startup.c
+++ b/usr/src/uts/sun4/os/startup.c
@@ -2937,7 +2937,7 @@ static void
 do_prom_version_check(void)
 {
 	int i;
-	dnode_t node;
+	pnode_t node;
 	char buf[64];
 	static char drev[] = "Down-rev firmware detected%s\n"
 		"\tPlease upgrade to the following minimum version:\n"
diff --git a/usr/src/uts/sun4/vm/sfmmu.c b/usr/src/uts/sun4/vm/sfmmu.c
index b69cb54f0427..4b637f2a0b56 100644
--- a/usr/src/uts/sun4/vm/sfmmu.c
+++ b/usr/src/uts/sun4/vm/sfmmu.c
@@ -350,13 +350,13 @@ read_prom_mappings(size_t *ntransrootp)
 {
 	char *prop = "translations";
 	size_t translen;
-	dnode_t node;
+	pnode_t node;
 	struct translation *transroot;
 
 	/*
 	 * the "translations" property is associated with the mmu node
 	 */
-	node = (dnode_t)prom_getphandle(prom_mmu_ihandle());
+	node = (pnode_t)prom_getphandle(prom_mmu_ihandle());
 
 	/*
 	 * We use the TSB space to read in the prom mappings.  This space
diff --git a/usr/src/uts/sun4u/cherrystone/os/cherrystone.c b/usr/src/uts/sun4u/cherrystone/os/cherrystone.c
index 1a8d495213a2..01425c40302b 100644
--- a/usr/src/uts/sun4u/cherrystone/os/cherrystone.c
+++ b/usr/src/uts/sun4u/cherrystone/os/cherrystone.c
@@ -336,7 +336,7 @@ update_mem_bounds(int boardid, int cpuid, int bankid,
  * the cpu memory decoder registers at boot time.
  */
 void
-plat_fill_mc(dnode_t nodeid)
+plat_fill_mc(pnode_t nodeid)
 {
 	uint64_t	mc_addr, saf_addr;
 	uint64_t	mc_decode[CHERRYSTONE_BANKS_PER_MC];
diff --git a/usr/src/uts/sun4u/chicago/os/chicago.c b/usr/src/uts/sun4u/chicago/os/chicago.c
index 6b6c396afcd9..8898e2f307f2 100644
--- a/usr/src/uts/sun4u/chicago/os/chicago.c
+++ b/usr/src/uts/sun4u/chicago/os/chicago.c
@@ -206,7 +206,7 @@ plat_lgrp_cpu_to_hand(processorid_t id)
 void
 plat_lgrp_init(void)
 {
-	dnode_t		curnode;
+	pnode_t		curnode;
 	char		tmp_name[MAXSYSNAME];
 	int		portid;
 	int		cpucnt = 0;
@@ -309,7 +309,7 @@ plat_pfn_to_mem_node(pfn_t pfn)
  * Assign memnode to lgroups
  */
 void
-plat_fill_mc(dnode_t nodeid)
+plat_fill_mc(pnode_t nodeid)
 {
 	int		portid;
 
@@ -396,7 +396,7 @@ plat_shared_i2c_exit(dev_info_t *i2cnexus_dip)
 static void
 get_ebus_rtc_vaddr()
 {
-	dnode_t node;
+	pnode_t node;
 	int size;
 	uint32_t eaddr;
 
diff --git a/usr/src/uts/sun4u/cpu/mach_cpu_module.c b/usr/src/uts/sun4u/cpu/mach_cpu_module.c
index c9dc47c06188..6fb3b95d396c 100644
--- a/usr/src/uts/sun4u/cpu/mach_cpu_module.c
+++ b/usr/src/uts/sun4u/cpu/mach_cpu_module.c
@@ -31,7 +31,7 @@
 #include <vm/seg_map.h>
 
 void
-cpu_fiximp(dnode_t dnode)
+cpu_fiximp(pnode_t dnode)
 {}
 
 void
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c
index 79dc7f16f95d..35d97e6b2340 100644
--- a/usr/src/uts/sun4u/cpu/spitfire.c
+++ b/usr/src/uts/sun4u/cpu/spitfire.c
@@ -538,7 +538,7 @@ cpu_setup(void)
 }
 
 static int
-getintprop(dnode_t node, char *name, int deflt)
+getintprop(pnode_t node, char *name, int deflt)
 {
 	int	value;
 
@@ -563,7 +563,7 @@ getintprop(dnode_t node, char *name, int deflt)
  * Set the magic constants of the implementation.
  */
 void
-cpu_fiximp(dnode_t dnode)
+cpu_fiximp(pnode_t dnode)
 {
 	extern int vac_size, vac_shift;
 	extern uint_t vac_mask;
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetah.c b/usr/src/uts/sun4u/cpu/us3_cheetah.c
index 6ff125f311fe..b530b6754d79 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetah.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetah.c
@@ -83,7 +83,7 @@ cpu_init_trap(void)
 }
 
 static int
-getintprop(dnode_t node, char *name, int deflt)
+getintprop(pnode_t node, char *name, int deflt)
 {
 	int	value;
 
@@ -105,7 +105,7 @@ getintprop(dnode_t node, char *name, int deflt)
  */
 /*ARGSUSED*/
 void
-cpu_fiximp(dnode_t dnode)
+cpu_fiximp(pnode_t dnode)
 {
 	int i, a;
 
diff --git a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
index 7cf23b60984b..c56b4952ce07 100644
--- a/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
+++ b/usr/src/uts/sun4u/cpu/us3_cheetahplus.c
@@ -109,7 +109,7 @@ cpu_init_trap(void)
  */
 /*ARGSUSED*/
 void
-cpu_fiximp(dnode_t dnode)
+cpu_fiximp(pnode_t dnode)
 {
 	int i, a;
 	extern int vac_size, vac_shift;
diff --git a/usr/src/uts/sun4u/cpu/us3_jalapeno.c b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
index 016604efcd24..cd7184820037 100644
--- a/usr/src/uts/sun4u/cpu/us3_jalapeno.c
+++ b/usr/src/uts/sun4u/cpu/us3_jalapeno.c
@@ -120,7 +120,7 @@ cpu_init_trap(void)
 
 
 static int
-getintprop(dnode_t node, char *name, int deflt)
+getintprop(pnode_t node, char *name, int deflt)
 {
 	int	value;
 
@@ -142,7 +142,7 @@ getintprop(dnode_t node, char *name, int deflt)
  */
 /*ARGSUSED*/
 void
-cpu_fiximp(dnode_t dnode)
+cpu_fiximp(pnode_t dnode)
 {
 	int i, a;
 	extern int vac_size, vac_shift;
diff --git a/usr/src/uts/sun4u/daktari/os/daktari.c b/usr/src/uts/sun4u/daktari/os/daktari.c
index 1dda0e4d5ada..2f63f0be5022 100644
--- a/usr/src/uts/sun4u/daktari/os/daktari.c
+++ b/usr/src/uts/sun4u/daktari/os/daktari.c
@@ -336,7 +336,7 @@ update_mem_bounds(int boardid, int cpuid, int bankid,
  * the cpu memory decoder registers at boot time.
  */
 void
-plat_fill_mc(dnode_t nodeid)
+plat_fill_mc(pnode_t nodeid)
 {
 	uint64_t	mc_addr, saf_addr;
 	uint64_t	mc_decode[DAK_BANKS_PER_MC];
diff --git a/usr/src/uts/sun4u/enchilada/os/enchilada.c b/usr/src/uts/sun4u/enchilada/os/enchilada.c
index 5f761e7908b6..7415e77807ce 100644
--- a/usr/src/uts/sun4u/enchilada/os/enchilada.c
+++ b/usr/src/uts/sun4u/enchilada/os/enchilada.c
@@ -253,7 +253,7 @@ plat_lgrp_cpu_to_hand(processorid_t id)
 void
 plat_lgrp_init(void)
 {
-	dnode_t		curnode;
+	pnode_t		curnode;
 	char		tmp_name[MAXSYSNAME];
 	int		portid;
 	int		cpucnt = 0;
@@ -353,7 +353,7 @@ plat_pfn_to_mem_node(pfn_t pfn)
  * Assign memnode to lgroups
  */
 void
-plat_fill_mc(dnode_t nodeid)
+plat_fill_mc(pnode_t nodeid)
 {
 	int		portid;
 
diff --git a/usr/src/uts/sun4u/io/pcicfg.e.c b/usr/src/uts/sun4u/io/pcicfg.e.c
index 1109623f9e88..3fb3f387173b 100644
--- a/usr/src/uts/sun4u/io/pcicfg.e.c
+++ b/usr/src/uts/sun4u/io/pcicfg.e.c
@@ -745,7 +745,7 @@ pcicfg_configure_ntbridge(dev_info_t *new_device, uint_t bus, uint_t device)
 	for (devno = pcicfg_start_devno; devno < max_devs; devno++) {
 
 		if (ndi_devi_alloc(new_device, DEVI_PSEUDO_NEXNAME,
-		    (dnode_t)DEVI_SID_NODEID, &new_ntbridgechild)
+		    (pnode_t)DEVI_SID_NODEID, &new_ntbridgechild)
 							!= NDI_SUCCESS) {
 
 			DEBUG0("pcicfg: Failed to alloc test node\n");
@@ -1082,7 +1082,7 @@ pcicfg_ntbridge_unconfigure_child(dev_info_t *new_device, uint_t devno)
 	bus = pci_bus_range.lo; /* primary bus number of this bus node */
 
 	if (ndi_devi_alloc(new_device, DEVI_PSEUDO_NEXNAME,
-	    (dnode_t)DEVI_SID_NODEID, &new_ntbridgechild) != NDI_SUCCESS) {
+	    (pnode_t)DEVI_SID_NODEID, &new_ntbridgechild) != NDI_SUCCESS) {
 
 		DEBUG0("pcicfg: Failed to alloc test node\n");
 		return (PCICFG_FAILURE);
@@ -3240,7 +3240,7 @@ pcicfg_probe_children(dev_info_t *parent, uint_t bus,
 	 * ndi_devi_alloc() is called as ndi_devi_alloc_sleep()
 	 */
 	if (ndi_devi_alloc(parent, DEVI_PSEUDO_NEXNAME,
-		(dnode_t)DEVI_SID_NODEID, &new_child)
+		(pnode_t)DEVI_SID_NODEID, &new_child)
 		!= NDI_SUCCESS) {
 		DEBUG0("pcicfg_probe_children(): Failed to alloc child node\n");
 		ndi_devi_exit(parent, circ);
@@ -3485,7 +3485,7 @@ pcicfg_fcode_probe(dev_info_t *parent, uint_t bus,
 	 */
 
 	if (ndi_devi_alloc(parent, DEVI_PSEUDO_NEXNAME,
-		(dnode_t)DEVI_SID_NODEID, &new_child)
+		(pnode_t)DEVI_SID_NODEID, &new_child)
 		!= NDI_SUCCESS) {
 		DEBUG0("pcicfg_fcode_probe(): Failed to alloc child node\n");
 		return (PCICFG_FAILURE);
diff --git a/usr/src/uts/sun4u/io/px/px_tools_4u.c b/usr/src/uts/sun4u/io/px/px_tools_4u.c
index 1270cea459c1..a0a9eb597ae5 100644
--- a/usr/src/uts/sun4u/io/px/px_tools_4u.c
+++ b/usr/src/uts/sun4u/io/px/px_tools_4u.c
@@ -314,7 +314,7 @@ pxtool_dev_reg_ops_platchk(dev_info_t *dip, pcitool_reg_t *prg_p)
 	 * Guard against checking a root nexus which is empty.
 	 * On some systems this will result in a Fatal Reset.
 	 */
-	if ((int)prom_childnode((dnode_t)devi_nodeid) == OBP_NONODE) {
+	if ((int)prom_childnode((pnode_t)devi_nodeid) == OBP_NONODE) {
 		DBG(DBG_TOOLS, dip,
 		    "pxtool_dev_reg_ops set/get reg: nexus has no devs!\n");
 		prg_p->status = PCITOOL_IO_ERROR;
diff --git a/usr/src/uts/sun4u/io/todds1337.c b/usr/src/uts/sun4u/io/todds1337.c
index 5c5ace9b3c42..309c30010ad0 100644
--- a/usr/src/uts/sun4u/io/todds1337.c
+++ b/usr/src/uts/sun4u/io/todds1337.c
@@ -906,7 +906,7 @@ todds1337_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg,
 static int
 todds1337_setup_prom()
 {
-	dnode_t todnode;
+	pnode_t todnode;
 	char tod1337_devpath[MAXNAMELEN];
 
 	if ((todnode = prom_findnode_bydevtype(prom_rootnode(),
diff --git a/usr/src/uts/sun4u/montecarlo/io/ttymux_dacf/ttymux_dacf.c b/usr/src/uts/sun4u/montecarlo/io/ttymux_dacf/ttymux_dacf.c
index 2706bf0d7a8c..c02a9984e74f 100644
--- a/usr/src/uts/sun4u/montecarlo/io/ttymux_dacf/ttymux_dacf.c
+++ b/usr/src/uts/sun4u/montecarlo/io/ttymux_dacf/ttymux_dacf.c
@@ -339,7 +339,7 @@ get_device_list(ihandle_t ihdl, ihandle_t *ihdls, size_t maxi)
  * The caller is responsible for freeing the memory.
  */
 static int
-read_prop(dnode_t node, char *propname, char **propval)
+read_prop(pnode_t node, char *propname, char **propval)
 {
 	int	proplen = -1;
 
@@ -744,7 +744,7 @@ find_consoles(sm_mux_state_t *ms, dev_info_t *dip, dev_t dev)
 	int	len;
 	char	*propval;
 	char	devtype[32];
-	dnode_t	node;
+	pnode_t	node;
 	uint_t	flags;
 
 	/*
@@ -872,7 +872,7 @@ ttymux_config(dacf_infohdl_t info_hdl, dacf_arghdl_t arg_hdl, int flags)
 	ms->sm_cons_stdout.sm_o_ihdl = prom_stdout_ihandle();
 
 	if (prom_is_openprom()) {
-		dnode_t	node = prom_optionsnode();
+		pnode_t	node = prom_optionsnode();
 
 		if (prom_getproplen(node, INPUT_ALIAS) > 0) {
 			ms->sm_ialias = kmem_alloc(
diff --git a/usr/src/uts/sun4u/os/cpr_impl.c b/usr/src/uts/sun4u/os/cpr_impl.c
index 3c585ee30a58..a6c92fc07692 100644
--- a/usr/src/uts/sun4u/os/cpr_impl.c
+++ b/usr/src/uts/sun4u/os/cpr_impl.c
@@ -1769,7 +1769,7 @@ int
 i_cpr_is_supported(void)
 {
 	char es_prop[] = "energystar-v2";
-	dnode_t node;
+	pnode_t node;
 	int last;
 	extern int cpr_supported_override;
 	extern int cpr_platform_enable;
diff --git a/usr/src/uts/sun4u/os/ecc.c b/usr/src/uts/sun4u/os/ecc.c
index 773ae8033dc8..10b6cb523f7a 100644
--- a/usr/src/uts/sun4u/os/ecc.c
+++ b/usr/src/uts/sun4u/os/ecc.c
@@ -202,7 +202,7 @@ void
 error_init(void)
 {
 	char tmp_name[MAXSYSNAME];
-	dnode_t node;
+	pnode_t node;
 	size_t size = cpu_aflt_size();
 
 	/*
diff --git a/usr/src/uts/sun4u/os/fillsysinfo.c b/usr/src/uts/sun4u/os/fillsysinfo.c
index 700a61c9eb86..fcdfbf7fedf1 100644
--- a/usr/src/uts/sun4u/os/fillsysinfo.c
+++ b/usr/src/uts/sun4u/os/fillsysinfo.c
@@ -104,10 +104,10 @@ struct cpu_node cpunodes[NCPU];
 
 static void	check_cpus_ver(void);
 static void	check_cpus_set(void);
-void	fill_cpu(dnode_t);
+void	fill_cpu(pnode_t);
 void	fill_cpu_ddi(dev_info_t *);
 void	empty_cpu(int);
-void	plat_fill_mc(dnode_t);
+void	plat_fill_mc(pnode_t);
 #pragma weak plat_fill_mc
 
 uint64_t	system_clock_freq;
@@ -132,7 +132,7 @@ uint_t		niommu_tsbs = 0;
  */
 #define	CHOSEN_EEPROM	"eeprom"
 #define	WATCHDOG_ENABLE "watchdog-enable"
-static dnode_t 		chosen_eeprom;
+static pnode_t 		chosen_eeprom;
 
 /*
  * Appropriate tod module will be dynamically selected while booting
@@ -162,17 +162,17 @@ int cpr_platform_enable = 0;
 /*
  * Some nodes have functions that need to be called when they're seen.
  */
-static void	have_sbus(dnode_t);
-static void	have_pci(dnode_t);
-static void	have_eeprom(dnode_t);
-static void	have_auxio(dnode_t);
-static void	have_rtc(dnode_t);
-static void	have_tod(dnode_t);
-static void	have_pmc(dnode_t);
+static void	have_sbus(pnode_t);
+static void	have_pci(pnode_t);
+static void	have_eeprom(pnode_t);
+static void	have_auxio(pnode_t);
+static void	have_rtc(pnode_t);
+static void	have_tod(pnode_t);
+static void	have_pmc(pnode_t);
 
 static struct wkdevice {
 	char *wk_namep;
-	void (*wk_func)(dnode_t);
+	void (*wk_func)(pnode_t);
 	caddr_t *wk_vaddrp;
 	ushort_t wk_flags;
 #define	V_OPTIONAL	0x0000
@@ -189,14 +189,14 @@ static struct wkdevice {
 	{ 0, },
 };
 
-static void map_wellknown(dnode_t);
+static void map_wellknown(pnode_t);
 
 void
 map_wellknown_devices()
 {
 	struct wkdevice *wkp;
 	phandle_t	ieeprom;
-	dnode_t	root;
+	pnode_t	root;
 	uint_t	stick_freq;
 
 	/*
@@ -205,16 +205,16 @@ map_wellknown_devices()
 	if (GETPROPLEN(prom_chosennode(), CHOSEN_EEPROM) ==
 	    sizeof (phandle_t) &&
 	    GETPROP(prom_chosennode(), CHOSEN_EEPROM, (caddr_t)&ieeprom) != -1)
-		chosen_eeprom = (dnode_t)prom_decode_int(ieeprom);
+		chosen_eeprom = (pnode_t)prom_decode_int(ieeprom);
 
-	root = prom_nextnode((dnode_t)0);
+	root = prom_nextnode((pnode_t)0);
 	/*
 	 * Get System clock frequency from root node if it exists.
 	 */
 	if (GETPROP(root, "stick-frequency", (caddr_t)&stick_freq) != -1)
 		system_clock_freq = stick_freq;
 
-	map_wellknown(NEXT((dnode_t)0));
+	map_wellknown(NEXT((pnode_t)0));
 
 	/*
 	 * See if it worked
@@ -240,11 +240,11 @@ map_wellknown_devices()
  * map_wellknown - map known devices & registers
  */
 static void
-map_wellknown(dnode_t curnode)
+map_wellknown(pnode_t curnode)
 {
 	extern int status_okay(int, char *, int);
 	char tmp_name[MAXSYSNAME];
-	static void fill_address(dnode_t, char *);
+	static void fill_address(pnode_t, char *);
 	int sok;
 
 #ifdef VPRINTF
@@ -305,7 +305,7 @@ map_wellknown(dnode_t curnode)
 }
 
 static void
-fill_address(dnode_t curnode, char *namep)
+fill_address(pnode_t curnode, char *namep)
 {
 	struct wkdevice *wkp;
 	int size;
@@ -354,7 +354,7 @@ fill_address(dnode_t curnode, char *namep)
 }
 
 int
-get_portid(dnode_t node, dnode_t *cmpp)
+get_portid(pnode_t node, pnode_t *cmpp)
 {
 	int portid;
 	char dev_type[OBP_MAXPROPNAME];
@@ -429,7 +429,7 @@ adj_ecache_setsize(int ecsetsize)
 }
 
 void
-fill_cpu(dnode_t node)
+fill_cpu(pnode_t node)
 {
 	extern int cpu_get_cpu_unum(int, char *, int, int *);
 	struct cpu_node *cpunode;
@@ -438,7 +438,7 @@ fill_cpu(dnode_t node)
 	int tlbsize;
 	int size;
 	uint_t clk_freq;
-	dnode_t cmpnode;
+	pnode_t cmpnode;
 	char namebuf[OBP_MAXPROPNAME], unum[UNUM_NAMLEN];
 	char *namebufp;
 
@@ -484,7 +484,7 @@ fill_cpu(dnode_t node)
 		/*
 		 * If we didn't find it in the CPU node, look in the root node.
 		 */
-		dnode_t root = prom_nextnode((dnode_t)0);
+		pnode_t root = prom_nextnode((pnode_t)0);
 		if (GETPROP(root, "clock-frequency", (caddr_t)&clk_freq) == -1)
 			clk_freq = 0;
 	}
@@ -962,7 +962,7 @@ check_cpus_set(void)
  * handling purposes, referenced by v_sysio_addr in machdep.c.
  */
 static void
-have_sbus(dnode_t node)
+have_sbus(pnode_t node)
 {
 	int size;
 	uint_t portid;
@@ -989,7 +989,7 @@ have_sbus(dnode_t node)
  * handling purposes.
  */
 static void
-have_pci(dnode_t node)
+have_pci(pnode_t node)
 {
 	int size;
 	uint_t portid;
@@ -1026,7 +1026,7 @@ have_pci(dnode_t node)
  * by v_eeprom_addr in locore.s.
  */
 static void
-have_eeprom(dnode_t node)
+have_eeprom(pnode_t node)
 {
 	int size;
 	uint32_t eaddr;
@@ -1085,7 +1085,7 @@ have_eeprom(dnode_t node)
 }
 
 static void
-have_rtc(dnode_t node)
+have_rtc(pnode_t node)
 {
 	int size;
 	uint32_t eaddr;
@@ -1128,15 +1128,15 @@ have_rtc(dnode_t node)
 }
 
 static void
-have_pmc(dnode_t node)
+have_pmc(pnode_t node)
 {
 	uint32_t vaddr;
-	dnode_t root;
+	pnode_t root;
 
 	/*
 	 * Watchdog property is in the root node.
 	 */
-	root = prom_nextnode((dnode_t)0);
+	root = prom_nextnode((pnode_t)0);
 	if (GETPROPLEN(root, WATCHDOG_ENABLE) != -1) {
 		/*
 		 * The hardware watchdog timer resides within logical
@@ -1155,7 +1155,7 @@ have_pmc(dnode_t node)
 }
 
 static void
-have_auxio(dnode_t node)
+have_auxio(pnode_t node)
 {
 	size_t size, n;
 	uint32_t addr[5];
@@ -1189,7 +1189,7 @@ have_auxio(dnode_t node)
 }
 
 static void
-have_tod(dnode_t node)
+have_tod(pnode_t node)
 {
 	static char tod_name[MAXSYSNAME];
 
diff --git a/usr/src/uts/sun4u/os/mach_ddi_impl.c b/usr/src/uts/sun4u/os/mach_ddi_impl.c
index 6b90f9b4320e..05d18b5ca8e5 100644
--- a/usr/src/uts/sun4u/os/mach_ddi_impl.c
+++ b/usr/src/uts/sun4u/os/mach_ddi_impl.c
@@ -143,7 +143,7 @@ get_boardnum(int nid, dev_info_t *par)
 {
 	int board_num;
 
-	if (prom_getprop((dnode_t)nid, OBP_BOARDNUM,
+	if (prom_getprop((pnode_t)nid, OBP_BOARDNUM,
 	    (caddr_t)&board_num) != -1)
 		return (board_num);
 
@@ -154,7 +154,7 @@ get_boardnum(int nid, dev_info_t *par)
 	while (par) {
 		nid = ddi_get_nodeid(par);
 
-		if (prom_getprop((dnode_t)nid, OBP_BOARDNUM,
+		if (prom_getprop((pnode_t)nid, OBP_BOARDNUM,
 		    (caddr_t)&board_num) != -1)
 			return (board_num);
 
@@ -172,11 +172,11 @@ getlongprop_buf(int id, char *name, char *buf, int maxlen)
 {
 	int size;
 
-	size = prom_getproplen((dnode_t)id, name);
+	size = prom_getproplen((pnode_t)id, name);
 	if (size <= 0 || (size > maxlen - 1))
 		return (-1);
 
-	if (-1 == prom_getprop((dnode_t)id, name, buf))
+	if (-1 == prom_getprop((pnode_t)id, name, buf))
 		return (-1);
 
 	/*
@@ -385,10 +385,10 @@ pf_is_dmacapable(pfn_t pfn)
 int
 dip_to_cpu_id(dev_info_t *dip, processorid_t *cpu_id)
 {
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	int		i;
 
-	nodeid = (dnode_t)ddi_get_nodeid(dip);
+	nodeid = (pnode_t)ddi_get_nodeid(dip);
 	for (i = 0; i < NCPU; i++) {
 		if (cpunodes[i].nodeid == nodeid) {
 			*cpu_id = i;
@@ -434,7 +434,7 @@ ndi2errno(int n)
  * Prom tree node list
  */
 struct ptnode {
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	struct ptnode	*next;
 };
 
@@ -450,10 +450,10 @@ struct pta {
 };
 
 static void
-visit_node(dnode_t nodeid, struct pta *ap)
+visit_node(pnode_t nodeid, struct pta *ap)
 {
 	struct ptnode	**nextp;
-	int		(*select)(dnode_t, void *, uint_t);
+	int		(*select)(pnode_t, void *, uint_t);
 
 	ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE);
 
@@ -501,7 +501,7 @@ create_prom_branch(void *arg, int has_changed)
 {
 	int		circ, c;
 	int		exists, rv;
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	struct ptnode	*tnp;
 	dev_info_t	*dip;
 	struct pta	*ap = arg;
diff --git a/usr/src/uts/sun4u/os/mach_startup.c b/usr/src/uts/sun4u/os/mach_startup.c
index a88f0c774e31..a4b7bede3a01 100644
--- a/usr/src/uts/sun4u/os/mach_startup.c
+++ b/usr/src/uts/sun4u/os/mach_startup.c
@@ -236,7 +236,7 @@ iam_positron(void)
 {
 	char model[32];
 	const char proto_model[] = "SUNW,501-2732";
-	dnode_t root = prom_rootnode();
+	pnode_t root = prom_rootnode();
 
 	if (prom_getproplen(root, "model") != sizeof (proto_model))
 		return (0);
diff --git a/usr/src/uts/sun4u/schumacher/os/schumacher.c b/usr/src/uts/sun4u/schumacher/os/schumacher.c
index e6800cfd0414..c8463c42266d 100644
--- a/usr/src/uts/sun4u/schumacher/os/schumacher.c
+++ b/usr/src/uts/sun4u/schumacher/os/schumacher.c
@@ -236,7 +236,7 @@ plat_lgrp_cpu_to_hand(processorid_t id)
 void
 plat_lgrp_init(void)
 {
-	dnode_t		curnode;
+	pnode_t		curnode;
 	char		tmp_name[MAXSYSNAME];
 	int		portid;
 	int		cpucnt = 0;
@@ -336,7 +336,7 @@ plat_pfn_to_mem_node(pfn_t pfn)
  * Assign memnode to lgroups
  */
 void
-plat_fill_mc(dnode_t nodeid)
+plat_fill_mc(pnode_t nodeid)
 {
 	int		portid;
 
diff --git a/usr/src/uts/sun4u/snowbird/io/todds1307/todds1307.c b/usr/src/uts/sun4u/snowbird/io/todds1307/todds1307.c
index 6eccb146e00b..8f8bdfa2781c 100644
--- a/usr/src/uts/sun4u/snowbird/io/todds1307/todds1307.c
+++ b/usr/src/uts/sun4u/snowbird/io/todds1307/todds1307.c
@@ -676,7 +676,7 @@ bcd2int(unsigned char num) {
 static int
 todds1307_setup_prom()
 {
-	dnode_t todnode;
+	pnode_t todnode;
 	char tod1307_devpath[MAXNAMELEN];
 
 	if ((todnode = prom_findnode_bydevtype(prom_rootnode(),
diff --git a/usr/src/uts/sun4u/starfire/io/drmach.c b/usr/src/uts/sun4u/starfire/io/drmach.c
index 8739fb0c83a2..7d7ba4346a14 100644
--- a/usr/src/uts/sun4u/starfire/io/drmach.c
+++ b/usr/src/uts/sun4u/starfire/io/drmach.c
@@ -88,11 +88,11 @@ extern uint_t		ldphysio_il(uint64_t physaddr);
 extern void		stphysio_il(uint64_t physaddr, uint_t value);
 
 extern uint64_t		mc_get_mem_alignment(void);
-extern uint64_t		mc_get_asr_addr(dnode_t);
-extern uint64_t		mc_get_idle_addr(dnode_t);
-extern uint64_t		mc_get_alignment_mask(dnode_t);
-extern int		mc_read_asr(dnode_t, uint_t *);
-extern int		mc_write_asr(dnode_t, uint_t);
+extern uint64_t		mc_get_asr_addr(pnode_t);
+extern uint64_t		mc_get_idle_addr(pnode_t);
+extern uint64_t		mc_get_alignment_mask(pnode_t);
+extern int		mc_read_asr(pnode_t, uint_t *);
+extern int		mc_write_asr(pnode_t, uint_t);
 extern uint64_t		mc_asr_to_pa(uint_t);
 extern uint_t		mc_pa_to_asr(uint_t, uint64_t);
 
@@ -106,7 +106,7 @@ typedef struct {
 typedef struct drmach_node {
 	void		*here;
 
-	dnode_t		 (*get_dnode)(struct drmach_node *node);
+	pnode_t		 (*get_dnode)(struct drmach_node *node);
 	int		 (*walk)(struct drmach_node *node, void *data,
 				int (*cb)(drmach_node_walk_args_t *args));
 } drmach_node_t;
@@ -363,17 +363,17 @@ _info(struct modinfo *modinfop)
 	return (mod_info(&modlinkage, modinfop));
 }
 
-static dnode_t
+static pnode_t
 drmach_node_obp_get_dnode(drmach_node_t *np)
 {
-	return ((dnode_t)np->here);
+	return ((pnode_t)np->here);
 }
 
 static int
 drmach_node_obp_walk(drmach_node_t *np, void *data,
 		int (*cb)(drmach_node_walk_args_t *args))
 {
-	dnode_t			nodeid;
+	pnode_t			nodeid;
 	int			rv;
 	drmach_node_walk_args_t	args;
 
@@ -423,7 +423,7 @@ drmach_node_dispose(drmach_node_t *np)
 static dev_info_t *
 drmach_node_get_dip(drmach_node_t *np)
 {
-	dnode_t nodeid;
+	pnode_t nodeid;
 
 	nodeid = np->get_dnode(np);
 	if (nodeid == OBP_NONODE)
@@ -447,7 +447,7 @@ drmach_node_get_dip(drmach_node_t *np)
 	/*NOTREACHED*/
 }
 
-static dnode_t
+static pnode_t
 drmach_node_get_dnode(drmach_node_t *np)
 {
 	return (np->get_dnode(np));
@@ -463,7 +463,7 @@ drmach_node_walk(drmach_node_t *np, void *param,
 static int
 drmach_node_get_prop(drmach_node_t *np, char *name, void *buf)
 {
-	dnode_t	nodeid;
+	pnode_t	nodeid;
 	int	rv;
 
 	nodeid = np->get_dnode(np);
@@ -482,7 +482,7 @@ drmach_node_get_prop(drmach_node_t *np, char *name, void *buf)
 static int
 drmach_node_get_proplen(drmach_node_t *np, char *name, int *len)
 {
-	dnode_t	 nodeid;
+	pnode_t	 nodeid;
 	int	 rv;
 
 	nodeid = np->get_dnode(np);
@@ -599,10 +599,10 @@ drmach_array_dispose(drmach_array_t *arr, void (*disposer)(drmachid_t))
 
 /*ARGSUSED*/
 static int
-drmach_prom_select(dnode_t nodeid, void *arg, uint_t flags)
+drmach_prom_select(pnode_t nodeid, void *arg, uint_t flags)
 {
 	int			rprop[64];
-	dnode_t			saved;
+	pnode_t			saved;
 	drmach_config_args_t	*ap = (drmach_config_args_t *)arg;
 	drmach_device_t		*dp = ap->dp;
 	sbd_error_t		*err;
@@ -927,7 +927,7 @@ hold_rele_branch(dev_info_t *rdip, void *arg)
 static int
 drmach_init(void)
 {
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	dev_info_t	*rdip;
 	int		hold, circ;
 
@@ -1057,7 +1057,7 @@ static sbd_error_t *
 drmach_get_mc_asr_addr(drmachid_t id, uint64_t *pa)
 {
 	drmach_device_t	*dp;
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	uint64_t	addr;
 
 	if (!DRMACH_IS_MEM_ID(id))
@@ -1080,7 +1080,7 @@ static sbd_error_t *
 drmach_get_mc_idle_addr(drmachid_t id, uint64_t *pa)
 {
 	drmach_device_t	*dp;
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	uint64_t	addr;
 
 	if (!DRMACH_IS_MEM_ID(id))
@@ -1103,7 +1103,7 @@ static sbd_error_t *
 drmach_read_mc_asr(drmachid_t id, uint_t *mcregp)
 {
 	drmach_device_t	*dp;
-	dnode_t		 nodeid;
+	pnode_t		 nodeid;
 	sbd_error_t	*err;
 
 	if (!DRMACH_IS_MEM_ID(id))
@@ -1125,7 +1125,7 @@ static sbd_error_t *
 drmach_write_mc_asr(drmachid_t id, uint_t mcreg)
 {
 	drmach_device_t	*dp;
-	dnode_t		 nodeid;
+	pnode_t		 nodeid;
 	sbd_error_t	*err;
 
 	if (!DRMACH_IS_MEM_ID(id))
@@ -1912,7 +1912,7 @@ drmach_remove_counter_nodes(drmachid_t id)
 {
 	int		num;
 	char		name[OBP_MAXDRVNAME];
-	dnode_t		child;
+	pnode_t		child;
 	dev_info_t	*dip;
 	sbd_error_t	*err;
 	drmach_status_t	stat;
@@ -2481,7 +2481,7 @@ drmach_cpu_start(struct cpu *cp)
 	extern void	restart_other_cpu(int);
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
-	ASSERT(cpunodes[cpuid].nodeid != (dnode_t)0);
+	ASSERT(cpunodes[cpuid].nodeid != (pnode_t)0);
 
 	cp->cpu_flags &= ~CPU_POWEROFF;
 
@@ -3197,7 +3197,7 @@ drmach_mem_get_alignment(drmachid_t id, uint64_t *mask)
 {
 	drmach_device_t	*mem;
 	sbd_error_t	*err;
-	dnode_t		 nodeid;
+	pnode_t		 nodeid;
 
 	if (!DRMACH_IS_MEM_ID(id))
 		return (drerr_new(0, ESTF_INAPPROP, NULL));
@@ -3695,7 +3695,7 @@ sbd_error_t *
 drmach_unconfigure(drmachid_t id, int flags)
 {
 	drmach_device_t	*dp;
-	dnode_t		 nodeid;
+	pnode_t		 nodeid;
 	dev_info_t	*dip, *fdip = NULL;
 	uint_t 		ddi_flags;
 
diff --git a/usr/src/uts/sun4u/starfire/io/idn.c b/usr/src/uts/sun4u/starfire/io/idn.c
index 7d4e9edcf918..4aeb9a135a55 100644
--- a/usr/src/uts/sun4u/starfire/io/idn.c
+++ b/usr/src/uts/sun4u/starfire/io/idn.c
@@ -5524,7 +5524,7 @@ idnxdc_broadcast(domainset_t domset, idn_msgtype_t *mtp,
 static int
 idn_prom_getsmr(uint_t *smrsz, uint64_t *paddrp, uint64_t *sizep)
 {
-	dnode_t		nodeid;
+	pnode_t		nodeid;
 	int		found = 0;
 	int		len;
 	uint_t		smrsize = 0;
diff --git a/usr/src/uts/sun4u/starfire/io/memctrl.c b/usr/src/uts/sun4u/starfire/io/memctrl.c
index 6826436aacfe..adc02672bd40 100644
--- a/usr/src/uts/sun4u/starfire/io/memctrl.c
+++ b/usr/src/uts/sun4u/starfire/io/memctrl.c
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1998, 2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -85,7 +85,7 @@ mc_get_mem_alignment()
 }
 
 uint64_t
-mc_get_asr_addr(dnode_t nodeid)
+mc_get_asr_addr(pnode_t nodeid)
 {
 	int		rlen;
 	uint64_t	psi_addr;
@@ -105,7 +105,7 @@ mc_get_asr_addr(dnode_t nodeid)
 }
 
 uint64_t
-mc_get_idle_addr(dnode_t nodeid)
+mc_get_idle_addr(pnode_t nodeid)
 {
 	int		rlen;
 	uint64_t	psi_addr;
@@ -125,7 +125,7 @@ mc_get_idle_addr(dnode_t nodeid)
 }
 
 int
-mc_get_dimm_size(dnode_t nodeid)
+mc_get_dimm_size(pnode_t nodeid)
 {
 	uint64_t	psi_addr;
 	uint_t		dimmtype;
@@ -157,7 +157,7 @@ mc_get_dimm_size(dnode_t nodeid)
 }
 
 uint64_t
-mc_get_alignment_mask(dnode_t nodeid)
+mc_get_alignment_mask(pnode_t nodeid)
 {
 	uint64_t	psi_addr, seg_sz;
 	uint_t		mcreg, seg_sz_mask;
@@ -199,7 +199,7 @@ mc_get_alignment_mask(dnode_t nodeid)
 }
 
 int
-mc_read_asr(dnode_t nodeid, uint_t *mcregp)
+mc_read_asr(pnode_t nodeid, uint_t *mcregp)
 {
 	uint64_t	psi_addr;
 
@@ -215,7 +215,7 @@ mc_read_asr(dnode_t nodeid, uint_t *mcregp)
 }
 
 int
-mc_write_asr(dnode_t nodeid, uint_t mcreg)
+mc_write_asr(pnode_t nodeid, uint_t mcreg)
 {
 	uint_t		mcreg_rd;
 	uint64_t	psi_addr;
diff --git a/usr/src/uts/sun4u/sys/cpr_impl.h b/usr/src/uts/sun4u/sys/cpr_impl.h
index 37cb2b50fb4c..40d36129ddb8 100644
--- a/usr/src/uts/sun4u/sys/cpr_impl.h
+++ b/usr/src/uts/sun4u/sys/cpr_impl.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -89,7 +89,7 @@ typedef struct sun4u_tlb sutlb_t;
  * processor info
  */
 struct sun4u_cpu_info {
-	dnode_t node;
+	pnode_t node;
 	processorid_t cpu_id;
 };
 
diff --git a/usr/src/uts/sun4u/sys/cpu_module.h b/usr/src/uts/sun4u/sys/cpu_module.h
index d6e6ddf407cc..8cd0361d3f5b 100644
--- a/usr/src/uts/sun4u/sys/cpu_module.h
+++ b/usr/src/uts/sun4u/sys/cpu_module.h
@@ -65,7 +65,7 @@ void	cpu_kdi_init(struct kdi *);
  *	The default 4 page sizes to 6 page sizes for Panther-only domains,
  *	and is called from fillsysinfo.c:check_cpus_set at early bootup time.
  */
-void	cpu_fiximp(dnode_t dnode);
+void	cpu_fiximp(pnode_t dnode);
 #pragma weak cpu_fix_allpanther
 void	cpu_fix_allpanther(void);
 #pragma weak mmu_init_mmu_page_sizes
diff --git a/usr/src/uts/sun4u/sys/fc_plat.h b/usr/src/uts/sun4u/sys/fc_plat.h
index 4cfd8de931cf..77211e30f388 100644
--- a/usr/src/uts/sun4u/sys/fc_plat.h
+++ b/usr/src/uts/sun4u/sys/fc_plat.h
@@ -20,8 +20,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 1998 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef	_SYS_FC_PLAT_H
@@ -71,7 +71,7 @@ typedef uint32_t fc_phandle_t;
 #define	fc_size2cell(u)		((fc_cell_t)((size_t)(u)))
 #define	fc_ssize2cell(i)	((fc_cell_t)((ssize_t)(i)))
 #define	fc_phandle2cell(ph)	((fc_cell_t)((unsigned int)((phandle_t)(ph))))
-#define	fc_dnode2cell(d)	((fc_cell_t)((unsigned int)((dnode_t)(d))))
+#define	fc_dnode2cell(d)	((fc_cell_t)((unsigned int)((pnode_t)(d))))
 #define	fc_ull2cell_high(ll)	(0LL)
 #define	fc_ull2cell_low(ll)	((fc_cell_t)(ll))
 #define	fc_uintptr2cell(i)	((fc_cell_t)((uintptr_t)(i)))
@@ -88,7 +88,7 @@ typedef uint32_t fc_phandle_t;
 #define	fc_cell2size(u)		((size_t)((fc_cell_t)(u)))
 #define	fc_cell2ssize(i)	((ssize_t)((fc_cell_t)(i)))
 #define	fc_cell2phandle(ph)	((phandle_t)((fc_cell_t)(ph)))
-#define	fc_cell2dnode(d)	((dnode_t)((fc_cell_t)(d)))
+#define	fc_cell2dnode(d)	((pnode_t)((fc_cell_t)(d)))
 #define	fc_cells2ull(h, l)	((unsigned long long)(fc_cell_t)(l))
 #define	fc_cell2uintptr(i)	((uintptr_t)((fc_cell_t)(i)))
 #define	fc_cell2uchar(c)	((unsigned char)(fc_cell_t)(c))
diff --git a/usr/src/uts/sun4u/sys/machcpuvar.h b/usr/src/uts/sun4u/sys/machcpuvar.h
index 1953374a651a..ef0d98e8b55d 100644
--- a/usr/src/uts/sun4u/sys/machcpuvar.h
+++ b/usr/src/uts/sun4u/sys/machcpuvar.h
@@ -166,7 +166,7 @@ struct cpu_node {
 	int	implementation;
 	int	version;
 	int	portid;
-	dnode_t	nodeid;
+	pnode_t	nodeid;
 	uint64_t	clock_freq;
 	uint_t	tick_nsec_scale;
 	union {
diff --git a/usr/src/uts/sun4u/sys/machsystm.h b/usr/src/uts/sun4u/sys/machsystm.h
index f69d2a6f1e25..2150c563347c 100644
--- a/usr/src/uts/sun4u/sys/machsystm.h
+++ b/usr/src/uts/sun4u/sys/machsystm.h
@@ -374,7 +374,7 @@ typedef struct devi_branch {
 	void		(*devi_branch_callback)(dev_info_t *, void *, uint_t);
 	int		type;
 	union {
-		int	(*prom_branch_select)(dnode_t, void *, uint_t);
+		int	(*prom_branch_select)(pnode_t, void *, uint_t);
 		int	(*sid_branch_create)(dev_info_t *, void *, uint_t);
 	} create;
 } devi_branch_t;
diff --git a/usr/src/uts/sun4u/sys/prom_plat.h b/usr/src/uts/sun4u/sys/prom_plat.h
index 9a6380311193..c557eee0a1b4 100644
--- a/usr/src/uts/sun4u/sys/prom_plat.h
+++ b/usr/src/uts/sun4u/sys/prom_plat.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -204,17 +204,17 @@ extern	int		prom_getmacaddr(ihandle_t hd, caddr_t ea);
 /*
  * CPU Control Group: MP's only.
  */
-extern	int		prom_startcpu(dnode_t node, caddr_t pc, int arg);
+extern	int		prom_startcpu(pnode_t node, caddr_t pc, int arg);
 extern	int		prom_startcpu_bycpuid(int cpuid, caddr_t pc, int arg);
 extern	int		prom_stopcpu_bycpuid(int);
 extern	int		prom_sunfire_cpu_off(void);	/* SunFire only */
-extern	int		prom_wakeupcpu(dnode_t node);
-extern	int		prom_serengeti_wakeupcpu(dnode_t node);
+extern	int		prom_wakeupcpu(pnode_t node);
+extern	int		prom_serengeti_wakeupcpu(pnode_t node);
 extern	int		prom_hotaddcpu(int cpuid);
 extern	int		prom_hotremovecpu(int cpuid);
 extern	void		promsafe_pause_cpus(void);
 extern	void		promsafe_xc_attention(cpuset_t cpuset);
-extern	int		prom_serengeti_cpu_off(dnode_t node);
+extern	int		prom_serengeti_cpu_off(pnode_t node);
 
 /*
  * Set trap table
diff --git a/usr/src/uts/sun4u/sys/sbd.h b/usr/src/uts/sun4u/sys/sbd.h
index 0ec8c2e5b8cb..44e075e4be58 100644
--- a/usr/src/uts/sun4u/sys/sbd.h
+++ b/usr/src/uts/sun4u/sys/sbd.h
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -66,7 +66,7 @@ typedef struct {
 } sbd_devattr_t;
 
 typedef struct {
-	dnode_t		dnodeid;
+	pnode_t		dnodeid;
 	uint64_t	*basepa;
 } sbd_basephys_t;
 
diff --git a/usr/src/uts/sun4v/ontario/os/ontario.c b/usr/src/uts/sun4v/ontario/os/ontario.c
index 5c646600acd8..c807992b78ac 100644
--- a/usr/src/uts/sun4v/ontario/os/ontario.c
+++ b/usr/src/uts/sun4v/ontario/os/ontario.c
@@ -72,7 +72,7 @@ void
 load_platform_drivers(void)
 {
 	dev_info_t 		*dip;		/* dip of the isa driver */
-	dnode_t 		nodeid;
+	pnode_t 		nodeid;
 
 	/*
 	 * Install ISA driver. This is required for the southbridge IDE
diff --git a/usr/src/uts/sun4v/os/error.c b/usr/src/uts/sun4v/os/error.c
index fd073c834f75..956142692ed6 100644
--- a/usr/src/uts/sun4v/os/error.c
+++ b/usr/src/uts/sun4v/os/error.c
@@ -699,7 +699,7 @@ void
 error_init(void)
 {
 	char tmp_name[MAXSYSNAME];
-	dnode_t node;
+	pnode_t node;
 	size_t size = cpu_aflt_size();
 
 	/*
diff --git a/usr/src/uts/sun4v/os/fillsysinfo.c b/usr/src/uts/sun4v/os/fillsysinfo.c
index 80297ac315b5..7cfb68fe7fac 100644
--- a/usr/src/uts/sun4v/os/fillsysinfo.c
+++ b/usr/src/uts/sun4v/os/fillsysinfo.c
@@ -98,8 +98,8 @@ int	debug_fillsysinfo = 0;
 int ncpunode;
 struct cpu_node cpunodes[NCPU];
 
-void	fill_cpu(dnode_t);
-void	plat_fill_mc(dnode_t);
+void	fill_cpu(pnode_t);
+void	plat_fill_mc(pnode_t);
 #pragma weak plat_fill_mc
 
 uint64_t	system_clock_freq;
@@ -110,7 +110,7 @@ uint_t		niommu_tsbs = 0;
  * Hardware watchdog support.
  */
 #define	CHOSEN_EEPROM	"eeprom"
-static dnode_t 		chosen_eeprom;
+static pnode_t 		chosen_eeprom;
 
 /*
  * If this variable is non-zero, cpr should return "not supported" when
@@ -129,11 +129,11 @@ int cpr_platform_enable = 0;
 /*
  * Some nodes have functions that need to be called when they're seen.
  */
-static void	have_pci(dnode_t);
+static void	have_pci(pnode_t);
 
 static struct wkdevice {
 	char *wk_namep;
-	void (*wk_func)(dnode_t);
+	void (*wk_func)(pnode_t);
 	caddr_t *wk_vaddrp;
 	ushort_t wk_flags;
 #define	V_OPTIONAL	0x0000
@@ -145,14 +145,14 @@ static struct wkdevice {
 	{ 0, },
 };
 
-static void map_wellknown(dnode_t);
+static void map_wellknown(pnode_t);
 
 void
 map_wellknown_devices()
 {
 	struct wkdevice *wkp;
 	phandle_t	ieeprom;
-	dnode_t	root;
+	pnode_t	root;
 	uint_t	stick_freq;
 
 	/*
@@ -161,16 +161,16 @@ map_wellknown_devices()
 	if (GETPROPLEN(prom_chosennode(), CHOSEN_EEPROM) ==
 	    sizeof (phandle_t) &&
 	    GETPROP(prom_chosennode(), CHOSEN_EEPROM, (caddr_t)&ieeprom) != -1)
-		chosen_eeprom = (dnode_t)prom_decode_int(ieeprom);
+		chosen_eeprom = (pnode_t)prom_decode_int(ieeprom);
 
-	root = prom_nextnode((dnode_t)0);
+	root = prom_nextnode((pnode_t)0);
 	/*
 	 * Get System clock frequency from root node if it exists.
 	 */
 	if (GETPROP(root, "stick-frequency", (caddr_t)&stick_freq) != -1)
 		system_clock_freq = stick_freq;
 
-	map_wellknown(NEXT((dnode_t)0));
+	map_wellknown(NEXT((pnode_t)0));
 
 	/*
 	 * See if it worked
@@ -187,11 +187,11 @@ map_wellknown_devices()
  * map_wellknown - map known devices & registers
  */
 static void
-map_wellknown(dnode_t curnode)
+map_wellknown(pnode_t curnode)
 {
 	extern int status_okay(int, char *, int);
 	char tmp_name[MAXSYSNAME];
-	static void fill_address(dnode_t, char *);
+	static void fill_address(pnode_t, char *);
 	int sok;
 
 #ifdef VPRINTF
@@ -251,7 +251,7 @@ map_wellknown(dnode_t curnode)
 }
 
 static void
-fill_address(dnode_t curnode, char *namep)
+fill_address(pnode_t curnode, char *namep)
 {
 	struct wkdevice *wkp;
 	int size;
@@ -300,7 +300,7 @@ fill_address(dnode_t curnode, char *namep)
 }
 
 void
-fill_cpu(dnode_t node)
+fill_cpu(pnode_t node)
 {
 	struct cpu_node *cpunode;
 	processorid_t cpuid;
@@ -337,7 +337,7 @@ fill_cpu(dnode_t node)
 		/*
 		 * If we didn't find it in the CPU node, look in the root node.
 		 */
-		dnode_t root = prom_nextnode((dnode_t)0);
+		pnode_t root = prom_nextnode((pnode_t)0);
 		if (GETPROP(root, "clock-frequency", (caddr_t)&clk_freq) == -1)
 			clk_freq = 0;
 	}
@@ -368,7 +368,7 @@ fill_cpu(dnode_t node)
  * handling purposes.
  */
 static void
-have_pci(dnode_t node)
+have_pci(pnode_t node)
 {
 	int size;
 	uint_t portid;
diff --git a/usr/src/uts/sun4v/os/mach_cpu_states.c b/usr/src/uts/sun4v/os/mach_cpu_states.c
index 0e7c2d9041d2..5e6a757cfae3 100644
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c
+++ b/usr/src/uts/sun4v/os/mach_cpu_states.c
@@ -543,7 +543,7 @@ update_hvdump_buffer(void)
 
 
 static int
-getintprop(dnode_t node, char *name, int deflt)
+getintprop(pnode_t node, char *name, int deflt)
 {
 	int	value;
 
diff --git a/usr/src/uts/sun4v/sys/machcpuvar.h b/usr/src/uts/sun4v/sys/machcpuvar.h
index a88f48a076cd..2ed0b9dddc52 100644
--- a/usr/src/uts/sun4v/sys/machcpuvar.h
+++ b/usr/src/uts/sun4v/sys/machcpuvar.h
@@ -193,7 +193,7 @@ struct cpu_node {
 	char	name[MAXSYSNAME];
 	char	fru_fmri[sizeof (CPU_FRU_FMRI) + UNUM_NAMLEN];
 	int	cpuid;
-	dnode_t	nodeid;
+	pnode_t	nodeid;
 	uint64_t	clock_freq;
 	uint_t	tick_nsec_scale;
 	union {
diff --git a/usr/src/uts/sun4v/sys/machsystm.h b/usr/src/uts/sun4v/sys/machsystm.h
index 8db9dca1ffc3..01e5699f8b81 100644
--- a/usr/src/uts/sun4v/sys/machsystm.h
+++ b/usr/src/uts/sun4v/sys/machsystm.h
@@ -362,7 +362,7 @@ typedef struct devi_branch {
 	void		(*devi_branch_callback)(dev_info_t *, void *, uint_t);
 	int		type;
 	union {
-		int	(*prom_branch_select)(dnode_t, void *, uint_t);
+		int	(*prom_branch_select)(pnode_t, void *, uint_t);
 		int	(*sid_branch_create)(dev_info_t *, void *, uint_t);
 	} create;
 } devi_branch_t;
diff --git a/usr/src/uts/sun4v/sys/prom_plat.h b/usr/src/uts/sun4v/sys/prom_plat.h
index 37bb0ca4f51e..d46c9825d27d 100644
--- a/usr/src/uts/sun4v/sys/prom_plat.h
+++ b/usr/src/uts/sun4v/sys/prom_plat.h
@@ -199,7 +199,7 @@ extern	int		prom_getmacaddr(ihandle_t hd, caddr_t ea);
 /*
  * CPU Control Group: MP's only.
  */
-extern	int		prom_startcpu(dnode_t node, caddr_t pc, int arg);
+extern	int		prom_startcpu(pnode_t node, caddr_t pc, int arg);
 extern	int		prom_startcpu_bycpuid(int cpuid, caddr_t pc, int arg);
 extern	int		prom_stopcpu_bycpuid(int);
 extern	void		promsafe_pause_cpus(void);