diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 6b5ec4201994..f88291386407 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2488,6 +2488,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
+	if (bp == NULL)
+		return (0);
+
 	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
 		char blkbuf[BP_SPRINTF_LEN];
 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
@@ -2984,7 +2987,7 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	avl_index_t where;
 	zdb_ddt_entry_t *zdde, zdde_search;
 
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
diff --git a/cmd/zdb/zdb.c.orig b/cmd/zdb/zdb.c.orig
new file mode 100644
index 000000000000..6b5ec4201994
--- /dev/null
+++ b/cmd/zdb/zdb.c.orig
@@ -0,0 +1,3890 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdio_ext.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfs_fuid.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/zfeature.h>
+#include <zfs_comutil.h>
+#undef ZFS_MAXNAMELEN
+#include <libzfs.h>
+
+#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
+	zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
+	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
+	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
+	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
+#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
+	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
+	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
+
+#ifndef lint
+extern int zfs_recover;
+extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
+#else
+int zfs_recover;
+uint64_t zfs_arc_max, zfs_arc_meta_limit;
+int zfs_vdev_async_read_max_active;
+#endif
+
+const char cmdname[] = "zdb";
+uint8_t dump_opt[256];
+
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+extern void dump_intent_log(zilog_t *);
+uint64_t *zopt_object = NULL;
+int zopt_objects = 0;
+libzfs_handle_t *g_zfs;
+uint64_t max_inflight = 1000;
+
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
+	    "[-U config] [-I inflight I/Os] poolname [object...]\n"
+	    "       %s [-divPA] [-e -p path...] [-U config] dataset "
+	    "[object...]\n"
+	    "       %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
+	    "poolname [vdev [metaslab...]]\n"
+	    "       %s -R [-A] [-e [-p path...]] poolname "
+	    "vdev:offset:size[:flags]\n"
+	    "       %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
+	    "       %s -l [-uA] device\n"
+	    "       %s -C [-A] [-U config]\n\n",
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, "    Dataset name must include at least one "
+	    "separator character '/' or '@'\n");
+	(void) fprintf(stderr, "    If dataset name is specified, only that "
+	    "dataset is dumped\n");
+	(void) fprintf(stderr, "    If object numbers are specified, only "
+	    "those objects are dumped\n\n");
+	(void) fprintf(stderr, "    Options to control amount of output:\n");
+	(void) fprintf(stderr, "        -u uberblock\n");
+	(void) fprintf(stderr, "        -d dataset(s)\n");
+	(void) fprintf(stderr, "        -i intent logs\n");
+	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
+	(void) fprintf(stderr, "        -h pool history\n");
+	(void) fprintf(stderr, "        -b block statistics\n");
+	(void) fprintf(stderr, "        -m metaslabs\n");
+	(void) fprintf(stderr, "        -M metaslab groups\n");
+	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
+	    "all data) blocks\n");
+	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, "        -D dedup statistics\n");
+	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
+	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
+	(void) fprintf(stderr, "        -l dump label contents\n");
+	(void) fprintf(stderr, "        -L disable leak tracking (do not "
+	    "load spacemaps)\n");
+	(void) fprintf(stderr, "        -R read and display block from a "
+	    "device\n\n");
+	(void) fprintf(stderr, "    Below options are intended for use "
+	    "with other options (except -l):\n");
+	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
+	    "panic recovery (-AA) or both (-AAA)\n");
+	(void) fprintf(stderr, "        -F attempt automatic rewind within "
+	    "safe range of transaction groups\n");
+	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
+	    "cachefile\n");
+	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
+	    "work with dataset)\n");
+	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
+	    "has altroot/not in a cachefile\n");
+	(void) fprintf(stderr, "        -p <path> -- use one or more with "
+	    "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
+	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
+	    "searching for uberblocks\n");
+	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
+	    "specify the maximum number of checksumming I/Os "
+	    "[default is 200]\n");
+	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+	    "to make only that option verbose\n");
+	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+	exit(1);
+}
+
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
+ */
+
+static void
+fatal(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+
+	exit(1);
+}
+
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	nvlist_t *nv;
+	size_t nvsize = *(uint64_t *)data;
+	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
+
+	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+	umem_free(packed, nvsize);
+
+	dump_nvlist(nv, 8);
+
+	nvlist_free(nv);
+}
+
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	spa_history_phys_t *shp = data;
+
+	if (shp == NULL)
+		return;
+
+	(void) printf("\t\tpool_create_len = %llu\n",
+	    (u_longlong_t)shp->sh_pool_create_len);
+	(void) printf("\t\tphys_max_off = %llu\n",
+	    (u_longlong_t)shp->sh_phys_max_off);
+	(void) printf("\t\tbof = %llu\n",
+	    (u_longlong_t)shp->sh_bof);
+	(void) printf("\t\teof = %llu\n",
+	    (u_longlong_t)shp->sh_eof);
+	(void) printf("\t\trecords_lost = %llu\n",
+	    (u_longlong_t)shp->sh_records_lost);
+}
+
+static void
+zdb_nicenum(uint64_t num, char *buf)
+{
+	if (dump_opt['P'])
+		(void) sprintf(buf, "%llu", (longlong_t)num);
+	else
+		nicenum(num, buf);
+}
+
+const char histo_stars[] = "****************************************";
+const int histo_width = sizeof (histo_stars) - 1;
+
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
+{
+	int i;
+	int minidx = size - 1;
+	int maxidx = 0;
+	uint64_t max = 0;
+
+	for (i = 0; i < size; i++) {
+		if (histo[i] > max)
+			max = histo[i];
+		if (histo[i] > 0 && i > maxidx)
+			maxidx = i;
+		if (histo[i] > 0 && i < minidx)
+			minidx = i;
+	}
+
+	if (max < histo_width)
+		max = histo_width;
+
+	for (i = minidx; i <= maxidx; i++) {
+		(void) printf("\t\t\t%3u: %6llu %s\n",
+		    i + offset, (u_longlong_t)histo[i],
+		    &histo_stars[(max - histo[i]) * histo_width / max]);
+	}
+}
+
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+	int error;
+	zap_stats_t zs;
+
+	error = zap_get_stats(os, object, &zs);
+	if (error)
+		return;
+
+	if (zs.zs_ptrtbl_len == 0) {
+		ASSERT(zs.zs_num_blocks == 1);
+		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+		    (u_longlong_t)zs.zs_blocksize,
+		    (u_longlong_t)zs.zs_num_entries);
+		return;
+	}
+
+	(void) printf("\tFat ZAP stats:\n");
+
+	(void) printf("\t\tPointer table:\n");
+	(void) printf("\t\t\t%llu elements\n",
+	    (u_longlong_t)zs.zs_ptrtbl_len);
+	(void) printf("\t\t\tzt_blk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
+	(void) printf("\t\t\tzt_numblks: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
+	(void) printf("\t\t\tzt_shift: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
+	(void) printf("\t\t\tzt_blks_copied: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
+	(void) printf("\t\t\tzt_nextblk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
+
+	(void) printf("\t\tZAP entries: %llu\n",
+	    (u_longlong_t)zs.zs_num_entries);
+	(void) printf("\t\tLeaf blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_leafs);
+	(void) printf("\t\tTotal blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_blocks);
+	(void) printf("\t\tzap_block_type: 0x%llx\n",
+	    (u_longlong_t)zs.zs_block_type);
+	(void) printf("\t\tzap_magic: 0x%llx\n",
+	    (u_longlong_t)zs.zs_magic);
+	(void) printf("\t\tzap_salt: 0x%llx\n",
+	    (u_longlong_t)zs.zs_salt);
+
+	(void) printf("\t\tLeafs with 2^n pointers:\n");
+	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBlocks with n*5 entries:\n");
+	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBlocks n/10 full:\n");
+	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tEntries with n chunks:\n");
+	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBuckets with n entries:\n");
+	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
+}
+
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	(void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/*ARGSUSED*/
+void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	void *prop;
+	int i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		prop = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+		(void) zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length, attr.za_num_integers, prop);
+		if (attr.za_integer_length == 1) {
+			(void) printf("%s", (char *)prop);
+		} else {
+			for (i = 0; i < attr.za_num_integers; i++) {
+				switch (attr.za_integer_length) {
+				case 2:
+					(void) printf("%u ",
+					    ((uint16_t *)prop)[i]);
+					break;
+				case 4:
+					(void) printf("%u ",
+					    ((uint32_t *)prop)[i]);
+					break;
+				case 8:
+					(void) printf("%lld ",
+					    (u_longlong_t)((int64_t *)prop)[i]);
+					break;
+				}
+			}
+		}
+		(void) printf("\n");
+		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	bpobj_phys_t *bpop = data;
+	uint64_t i;
+	char bytes[32], comp[32], uncomp[32];
+
+	if (bpop == NULL)
+		return;
+
+	zdb_nicenum(bpop->bpo_bytes, bytes);
+	zdb_nicenum(bpop->bpo_comp, comp);
+	zdb_nicenum(bpop->bpo_uncomp, uncomp);
+
+	(void) printf("\t\tnum_blkptrs = %llu\n",
+	    (u_longlong_t)bpop->bpo_num_blkptrs);
+	(void) printf("\t\tbytes = %s\n", bytes);
+	if (size >= BPOBJ_SIZE_V1) {
+		(void) printf("\t\tcomp = %s\n", comp);
+		(void) printf("\t\tuncomp = %s\n", uncomp);
+	}
+	if (size >= sizeof (*bpop)) {
+		(void) printf("\t\tsubobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_subobjs);
+		(void) printf("\t\tnum_subobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_num_subobjs);
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
+		char blkbuf[BP_SPRINTF_LEN];
+		blkptr_t bp;
+
+		int err = dmu_read(os, object,
+		    i * sizeof (bp), sizeof (bp), &bp, 0);
+		if (err != 0) {
+			(void) printf("got error %u from dmu_read\n", err);
+			break;
+		}
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
+		(void) printf("\t%s\n", blkbuf);
+	}
+}
+
+/* ARGSUSED */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dmu_object_info_t doi;
+	int64_t i;
+
+	VERIFY0(dmu_object_info(os, object, &doi));
+	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+	if (err != 0) {
+		(void) printf("got error %u from dmu_read\n", err);
+		kmem_free(subobjs, doi.doi_max_offset);
+		return;
+	}
+
+	int64_t last_nonzero = -1;
+	for (i = 0; i < doi.doi_max_offset / 8; i++) {
+		if (subobjs[i] != 0)
+			last_nonzero = i;
+	}
+
+	for (i = 0; i <= last_nonzero; i++) {
+		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
+	}
+	kmem_free(subobjs, doi.doi_max_offset);
+}
+
+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dump_zap_stats(os, object);
+	/* contents are printed elsewhere, properly decoded */
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		(void) printf(" %llx : [%d:%d:%d]\n",
+		    (u_longlong_t)attr.za_first_integer,
+		    (int)ATTR_LENGTH(attr.za_first_integer),
+		    (int)ATTR_BSWAP(attr.za_first_integer),
+		    (int)ATTR_NUM(attr.za_first_integer));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	uint16_t *layout_attrs;
+	int i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = [", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+
+		VERIFY(attr.za_integer_length == 2);
+		layout_attrs = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+
+		VERIFY(zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length,
+		    attr.za_num_integers, layout_attrs) == 0);
+
+		for (i = 0; i != attr.za_num_integers; i++)
+			(void) printf(" %d ", (int)layout_attrs[i]);
+		(void) printf("]\n");
+		umem_free(layout_attrs,
+		    attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	const char *typenames[] = {
+		/* 0 */ "not specified",
+		/* 1 */ "FIFO",
+		/* 2 */ "Character Device",
+		/* 3 */ "3 (invalid)",
+		/* 4 */ "Directory",
+		/* 5 */ "5 (invalid)",
+		/* 6 */ "Block Device",
+		/* 7 */ "7 (invalid)",
+		/* 8 */ "Regular File",
+		/* 9 */ "9 (invalid)",
+		/* 10 */ "Symbolic Link",
+		/* 11 */ "11 (invalid)",
+		/* 12 */ "Socket",
+		/* 13 */ "Door",
+		/* 14 */ "Event Port",
+		/* 15 */ "15 (invalid)",
+	};
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = %lld (type: %s)\n",
+		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
+		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
+	}
+	zap_cursor_fini(&zc);
+}
+
+int
+get_dtl_refcount(vdev_t *vd)
+{
+	int refcount = 0;
+	int c;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		space_map_t *sm = vd->vdev_dtl_sm;
+
+		if (sm != NULL &&
+		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+			return (1);
+		return (0);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++)
+		refcount += get_dtl_refcount(vd->vdev_child[c]);
+	return (refcount);
+}
+
+int
+get_metaslab_refcount(vdev_t *vd)
+{
+	int refcount = 0;
+	int c, m;
+
+	if (vd->vdev_top == vd && !vd->vdev_removing) {
+		for (m = 0; m < vd->vdev_ms_count; m++) {
+			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+			if (sm != NULL &&
+			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+				refcount++;
+		}
+	}
+	for (c = 0; c < vd->vdev_children; c++)
+		refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+	return (refcount);
+}
+
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+	uint64_t expected_refcount = 0;
+	uint64_t actual_refcount;
+
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
+	    &expected_refcount);
+	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
+	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+
+	if (expected_refcount != actual_refcount) {
+		(void) printf("space map refcount mismatch: expected %lld != "
+		    "actual %lld\n",
+		    (longlong_t)expected_refcount,
+		    (longlong_t)actual_refcount);
+		return (2);
+	}
+	return (0);
+}
+
+static void
+dump_spacemap(objset_t *os, space_map_t *sm)
+{
+	uint64_t alloc, offset, entry;
+	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+			    "INVALID", "INVALID", "INVALID", "INVALID" };
+
+	if (sm == NULL)
+		return;
+
+	/*
+	 * Print out the freelist entries in both encoded and decoded form.
+	 */
+	alloc = 0;
+	for (offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (entry)) {
+		uint8_t mapshift = sm->sm_shift;
+
+		VERIFY0(dmu_read(os, space_map_object(sm), offset,
+		    sizeof (entry), &entry, DMU_READ_PREFETCH));
+		if (SM_DEBUG_DECODE(entry)) {
+
+			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+		} else {
+			(void) printf("\t    [%6llu]    %c  range:"
+			    " %010llx-%010llx  size: %06llx\n",
+			    (u_longlong_t)(offset / sizeof (entry)),
+			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
+			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
+			    mapshift) + sm->sm_start),
+			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
+			    mapshift) + sm->sm_start +
+			    (SM_RUN_DECODE(entry) << mapshift)),
+			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
+			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
+				alloc += SM_RUN_DECODE(entry) << mapshift;
+			else
+				alloc -= SM_RUN_DECODE(entry) << mapshift;
+		}
+	}
+	if (alloc != space_map_allocated(sm)) {
+		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
+		    "with space map summary (%llu)\n",
+		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
+	}
+}
+
+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[32];
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &msp->ms_size_tree;
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+
+	zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
+
+	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+	(void) printf("\tIn-memory histogram:\n");
+	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+static void
+dump_metaslab(metaslab_t *msp)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = msp->ms_sm;
+	char freebuf[32];
+
+	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
+
+	(void) printf(
+	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
+	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
+	    (u_longlong_t)space_map_object(sm), freebuf);
+
+	if (dump_opt['m'] > 2 && !dump_opt['L']) {
+		mutex_enter(&msp->ms_lock);
+		metaslab_load_wait(msp);
+		if (!msp->ms_loaded) {
+			VERIFY0(metaslab_load(msp));
+			range_tree_stat_verify(msp->ms_tree);
+		}
+		dump_metaslab_stats(msp);
+		metaslab_unload(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt['m'] > 1 && sm != NULL &&
+	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		/*
+		 * The space map histogram represents free space in chunks
+		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
+		 */
+		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
+		    (u_longlong_t)msp->ms_fragmentation);
+		dump_histogram(sm->sm_phys->smp_histogram,
+		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
+	}
+
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
+		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
+		mutex_exit(&msp->ms_lock);
+	}
+}
+
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
+	    (u_longlong_t)vd->vdev_id,
+	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+	    "offset", "spacemap", "free");
+	(void) printf("\t%15s   %19s   %15s   %10s\n",
+	    "---------------", "-------------------",
+	    "---------------", "-------------");
+}
+
+static void
+dump_metaslab_groups(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	metaslab_class_t *mc = spa_normal_class(spa);
+	uint64_t fragmentation;
+	int c;
+
+	metaslab_class_histogram_verify(mc);
+
+	for (c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (mg->mg_class != mc)
+			continue;
+
+		metaslab_group_histogram_verify(mg);
+		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
+		    "fragmentation",
+		    (u_longlong_t)tvd->vdev_id,
+		    (u_longlong_t)tvd->vdev_ms_count);
+		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+			(void) printf("%3s\n", "-");
+		} else {
+			(void) printf("%3llu%%\n",
+			    (u_longlong_t)mg->mg_fragmentation);
+		}
+		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+	}
+
+	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
+	fragmentation = metaslab_class_fragmentation(mc);
+	if (fragmentation == ZFS_FRAG_INVALID)
+		(void) printf("\t%3s\n", "-");
+	else
+		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
+	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+static void
+dump_metaslabs(spa_t *spa)
+{
+	vdev_t *vd, *rvd = spa->spa_root_vdev;
+	uint64_t m, c = 0, children = rvd->vdev_children;
+
+	(void) printf("\nMetaslabs:\n");
+
+	if (!dump_opt['d'] && zopt_objects > 0) {
+		c = zopt_object[0];
+
+		if (c >= children)
+			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
+
+		if (zopt_objects > 1) {
+			vd = rvd->vdev_child[c];
+			print_vdev_metaslab_header(vd);
+
+			for (m = 1; m < zopt_objects; m++) {
+				if (zopt_object[m] < vd->vdev_ms_count)
+					dump_metaslab(
+					    vd->vdev_ms[zopt_object[m]]);
+				else
+					(void) fprintf(stderr, "bad metaslab "
+					    "number %llu\n",
+					    (u_longlong_t)zopt_object[m]);
+			}
+			(void) printf("\n");
+			return;
+		}
+		children = c + 1;
+	}
+	for (; c < children; c++) {
+		vd = rvd->vdev_child[c];
+		print_vdev_metaslab_header(vd);
+
+		for (m = 0; m < vd->vdev_ms_count; m++)
+			dump_metaslab(vd->vdev_ms[m]);
+		(void) printf("\n");
+	}
+}
+
+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const ddt_phys_t *ddp = dde->dde_phys;
+	const ddt_key_t *ddk = &dde->dde_key;
+	char *types[4] = { "ditto", "single", "double", "triple" };
+	char blkbuf[BP_SPRINTF_LEN];
+	blkptr_t blk;
+	int p;
+
+	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    types[p], blkbuf);
+	}
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	double rL, rP, rD, D, dedup, compress, copies;
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	rL = (double)dds->dds_ref_lsize;
+	rP = (double)dds->dds_ref_psize;
+	rD = (double)dds->dds_ref_dsize;
+	D = (double)dds->dds_dsize;
+
+	dedup = rD / D;
+	compress = rL / rP;
+	copies = rD / rP;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	ddt_entry_t dde;
+	uint64_t walk = 0;
+	dmu_object_info_t doi;
+	uint64_t count, dspace, mspace;
+	int error;
+
+	error = ddt_object_info(ddt, type, class, &doi);
+
+	if (error == ENOENT)
+		return;
+	ASSERT(error == 0);
+
+	error = ddt_object_count(ddt, type, class, &count);
+	ASSERT(error == 0);
+	if (count == 0)
+		return;
+
+	dspace = doi.doi_physical_blocks_512 << 9;
+	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	ddt_object_name(ddt, type, class, name);
+
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    name,
+	    (u_longlong_t)count,
+	    (u_longlong_t)(dspace / count),
+	    (u_longlong_t)(mspace / count));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4)
+		return;
+
+	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+		return;
+
+	(void) printf("%s contents:\n\n", name);
+
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+		dump_dde(ddt, &dde, walk);
+
+	ASSERT(error == ENOENT);
+
+	(void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t ddh_total;
+	ddt_stat_t dds_total;
+	enum zio_checksum c;
+	enum ddt_type type;
+	enum ddt_class class;
+
+	bzero(&ddh_total, sizeof (ddt_histogram_t));
+	bzero(&dds_total, sizeof (ddt_stat_t));
+
+	for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (type = 0; type < DDT_TYPES; type++) {
+			for (class = 0; class < DDT_CLASSES;
+			    class++) {
+				dump_ddt(ddt, type, class);
+			}
+		}
+	}
+
+	ddt_get_dedup_stats(spa, &dds_total);
+
+	if (dds_total.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		ddt_get_dedup_histogram(spa, &ddh_total);
+		zpool_dump_ddt(&dds_total, &ddh_total);
+	}
+
+	dump_dedup_ratio(&dds_total);
+}
+
+static void
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
+{
+	char *prefix = arg;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    prefix,
+	    (u_longlong_t)start,
+	    (u_longlong_t)(start + size),
+	    (u_longlong_t)(size));
+}
+
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+	spa_t *spa = vd->vdev_spa;
+	boolean_t required;
+	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+	char prefix[256];
+	int c, t;
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+	required = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	if (indent == 0)
+		(void) printf("\nDirty time logs:\n\n");
+
+	(void) printf("\t%*s%s [%s]\n", indent, "",
+	    vd->vdev_path ? vd->vdev_path :
+	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+	    required ? "DTL-required" : "DTL-expendable");
+
+	for (t = 0; t < DTL_TYPES; t++) {
+		range_tree_t *rt = vd->vdev_dtl[t];
+		if (range_tree_space(rt) == 0)
+			continue;
+		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
+		    indent + 2, "", name[t]);
+		mutex_enter(rt->rt_lock);
+		range_tree_walk(rt, dump_dtl_seg, prefix);
+		mutex_exit(rt->rt_lock);
+		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
+			dump_spacemap(spa->spa_meta_objset,
+			    vd->vdev_dtl_sm);
+	}
+
+	for (c = 0; c < vd->vdev_children; c++)
+		dump_dtl(vd->vdev_child[c], indent + 4);
+}
+
+static void
+dump_history(spa_t *spa)
+{
+	nvlist_t **events = NULL;
+	char *buf;
+	uint64_t resid, len, off = 0;
+	uint_t num = 0;
+	int error;
+	time_t tsec;
+	struct tm t;
+	char tbuf[30];
+	char internalstr[MAXPATHLEN];
+	int i;
+
+	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
+		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
+		    __func__);
+		return;
+	}
+
+	do {
+		len = SPA_OLD_MAXBLOCKSIZE;
+
+		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+			(void) fprintf(stderr, "Unable to read history: "
+			    "error %d\n", error);
+			free(buf);
+			return;
+		}
+
+		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+			break;
+
+		off -= resid;
+	} while (len != 0);
+
+	(void) printf("\nHistory:\n");
+	for (i = 0; i < num; i++) {
+		uint64_t time, txg, ievent;
+		char *cmd, *intstr;
+		boolean_t printed = B_FALSE;
+
+		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+		    &time) != 0)
+			goto next;
+		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+		    &cmd) != 0) {
+			if (nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+				goto next;
+			verify(nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_TXG, &txg) == 0);
+			verify(nvlist_lookup_string(events[i],
+			    ZPOOL_HIST_INT_STR, &intstr) == 0);
+			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
+				goto next;
+
+			(void) snprintf(internalstr,
+			    sizeof (internalstr),
+			    "[internal %s txg:%lld] %s",
+			    zfs_history_event_names[ievent],
+			    (longlong_t)txg, intstr);
+			cmd = internalstr;
+		}
+		tsec = time;
+		(void) localtime_r(&tsec, &t);
+		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		(void) printf("%s %s\n", tbuf, cmd);
+		printed = B_TRUE;
+
+next:
+		if (dump_opt['h'] > 1) {
+			if (!printed)
+				(void) printf("unrecognized record:\n");
+			dump_nvlist(events[i], 2);
+		}
+	}
+	free(buf);
+}
+
+/*ARGSUSED*/
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+static uint64_t
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_phys_t *zb)
+{
+	if (dnp == NULL) {
+		ASSERT(zb->zb_level < 0);
+		if (zb->zb_object == 0)
+			return (zb->zb_blkid);
+		return (zb->zb_blkid * BP_GET_LSIZE(bp));
+	}
+
+	ASSERT(zb->zb_level >= 0);
+
+	return ((zb->zb_blkid <<
+	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+}
+
+static void
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+	int i;
+
+	if (dump_opt['b'] >= 6) {
+		snprintf_blkptr(blkbuf, buflen, bp);
+		return;
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		(void) sprintf(blkbuf,
+		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
+		    (int)BPE_GET_ETYPE(bp),
+		    (u_longlong_t)BPE_GET_LSIZE(bp),
+		    (u_longlong_t)BPE_GET_PSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+		return;
+	}
+
+	blkbuf[0] = '\0';
+
+	for (i = 0; i < ndvas; i++)
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+	if (BP_IS_HOLE(bp)) {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL B=%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+	} else {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL/%llxP F=%llu B=%llu/%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)BP_GET_PSIZE(bp),
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    (u_longlong_t)bp->blk_birth,
+		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+	}
+}
+
+static void
+print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
+    const dnode_phys_t *dnp)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+	int l;
+
+	if (!BP_IS_EMBEDDED(bp)) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
+
+	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
+
+	ASSERT(zb->zb_level >= 0);
+
+	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+		if (l == zb->zb_level) {
+			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
+		} else {
+			(void) printf(" ");
+		}
+	}
+
+	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
+	(void) printf("%s\n", blkbuf);
+}
+
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+    blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	int err = 0;
+
+	if (bp->blk_birth == 0)
+		return (0);
+
+	print_indirect(bp, zb, dnp);
+
+	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
+		uint64_t fill = 0;
+
+		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+		ASSERT(buf->b_data);
+
+		/* recursively visit blocks below this */
+		cbp = buf->b_data;
+		for (i = 0; i < epb; i++, cbp++) {
+			zbookmark_phys_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			err = visit_indirect(spa, dnp, cbp, &czb);
+			if (err)
+				break;
+			fill += BP_GET_FILL(cbp);
+		}
+		if (!err)
+			ASSERT3U(fill, ==, BP_GET_FILL(bp));
+		(void) arc_buf_remove_ref(buf, &buf);
+	}
+
+	return (err);
+}
+
+/*ARGSUSED*/
+static void
+dump_indirect(dnode_t *dn)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+	int j;
+	zbookmark_phys_t czb;
+
+	(void) printf("Indirect blocks:\n");
+
+	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
+	    dn->dn_object, dnp->dn_nlevels - 1, 0);
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		czb.zb_blkid = j;
+		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
+		    &dnp->dn_blkptr[j], &czb);
+	}
+
+	(void) printf("\n");
+}
+
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dir_phys_t *dd = data;
+	time_t crtime;
+	char nice[32];
+
+	if (dd == NULL)
+		return;
+
+	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
+
+	crtime = dd->dd_creation_time;
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\thead_dataset_obj = %llu\n",
+	    (u_longlong_t)dd->dd_head_dataset_obj);
+	(void) printf("\t\tparent_dir_obj = %llu\n",
+	    (u_longlong_t)dd->dd_parent_obj);
+	(void) printf("\t\torigin_obj = %llu\n",
+	    (u_longlong_t)dd->dd_origin_obj);
+	(void) printf("\t\tchild_dir_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_child_dir_zapobj);
+	zdb_nicenum(dd->dd_used_bytes, nice);
+	(void) printf("\t\tused_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_compressed_bytes, nice);
+	(void) printf("\t\tcompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_uncompressed_bytes, nice);
+	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_quota, nice);
+	(void) printf("\t\tquota = %s\n", nice);
+	zdb_nicenum(dd->dd_reserved, nice);
+	(void) printf("\t\treserved = %s\n", nice);
+	(void) printf("\t\tprops_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_props_zapobj);
+	(void) printf("\t\tdeleg_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_deleg_zapobj);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)dd->dd_flags);
+
+#define	DO(which) \
+	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
+	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
+	DO(HEAD);
+	DO(SNAP);
+	DO(CHILD);
+	DO(CHILD_RSRV);
+	DO(REFRSRV);
+#undef DO
+}
+
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dataset_phys_t *ds = data;
+	time_t crtime;
+	char used[32], compressed[32], uncompressed[32], unique[32];
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (ds == NULL)
+		return;
+
+	ASSERT(size == sizeof (*ds));
+	crtime = ds->ds_creation_time;
+	zdb_nicenum(ds->ds_referenced_bytes, used);
+	zdb_nicenum(ds->ds_compressed_bytes, compressed);
+	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+	zdb_nicenum(ds->ds_unique_bytes, unique);
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
+
+	(void) printf("\t\tdir_obj = %llu\n",
+	    (u_longlong_t)ds->ds_dir_obj);
+	(void) printf("\t\tprev_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_obj);
+	(void) printf("\t\tprev_snap_txg = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_txg);
+	(void) printf("\t\tnext_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_snap_obj);
+	(void) printf("\t\tsnapnames_zapobj = %llu\n",
+	    (u_longlong_t)ds->ds_snapnames_zapobj);
+	(void) printf("\t\tnum_children = %llu\n",
+	    (u_longlong_t)ds->ds_num_children);
+	(void) printf("\t\tuserrefs_obj = %llu\n",
+	    (u_longlong_t)ds->ds_userrefs_obj);
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\tcreation_txg = %llu\n",
+	    (u_longlong_t)ds->ds_creation_txg);
+	(void) printf("\t\tdeadlist_obj = %llu\n",
+	    (u_longlong_t)ds->ds_deadlist_obj);
+	(void) printf("\t\tused_bytes = %s\n", used);
+	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
+	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+	(void) printf("\t\tunique = %s\n", unique);
+	(void) printf("\t\tfsid_guid = %llu\n",
+	    (u_longlong_t)ds->ds_fsid_guid);
+	(void) printf("\t\tguid = %llu\n",
+	    (u_longlong_t)ds->ds_guid);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)ds->ds_flags);
+	(void) printf("\t\tnext_clones_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_clones_obj);
+	(void) printf("\t\tprops_obj = %llu\n",
+	    (u_longlong_t)ds->ds_props_obj);
+	(void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/* ARGSUSED */
+static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (bp->blk_birth != 0) {
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("\t%s\n", blkbuf);
+	}
+	return (0);
+}
+
+static void
+dump_bptree(objset_t *os, uint64_t obj, char *name)
+{
+	char bytes[32];
+	bptree_phys_t *bt;
+	dmu_buf_t *db;
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	zdb_nicenum(bt->bt_bytes, bytes);
+	(void) printf("\n    %s: %llu datasets, %s\n",
+	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
+	dmu_buf_rele(db, FTAG);
+
+	if (dump_opt['d'] < 5)
+		return;
+
+	(void) printf("\n");
+
+	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
+}
+
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	ASSERT(bp->blk_birth != 0);
+	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
+	(void) printf("\t%s\n", blkbuf);
+	return (0);
+}
+
+static void
+dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+	uint64_t i;
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+		(void) printf("    %*s: object %llu, %llu local blkptrs, "
+		    "%llu subobjs in object, %llu, %s (%s/%s comp)\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+		    bytes, comp, uncomp);
+
+		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+			uint64_t subobj;
+			bpobj_t subbpo;
+			int error;
+			VERIFY0(dmu_read(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs,
+			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+			if (error != 0) {
+				(void) printf("ERROR %u while trying to open "
+				    "subobj id %llu\n",
+				    error, (u_longlong_t)subobj);
+				continue;
+			}
+			dump_full_bpobj(&subbpo, "subobj", indent + 1);
+		}
+	} else {
+		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    bytes);
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+
+	if (indent == 0) {
+		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+		(void) printf("\n");
+	}
+}
+
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t unused;
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	if (dl->dl_oldfmt) {
+		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+		return;
+	}
+
+	zdb_nicenum(dl->dl_phys->dl_used, bytes);
+	zdb_nicenum(dl->dl_phys->dl_comp, comp);
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
+	    bytes, comp, uncomp);
+
+	if (dump_opt['d'] < 4)
+		return;
+
+	(void) printf("\n");
+
+	/* force the tree to be loaded */
+	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		if (dump_opt['d'] >= 5) {
+			char buf[128];
+			(void) snprintf(buf, sizeof (buf),
+			    "mintxg %llu -> obj %llu",
+			    (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
+
+			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+		} else {
+			(void) printf("mintxg %llu -> obj %llu\n",
+			    (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
+
+		}
+	}
+}
+
+static avl_tree_t idx_tree;
+static avl_tree_t domain_tree;
+static boolean_t fuid_table_loaded;
+static boolean_t sa_loaded;
+sa_attr_type_t *sa_attr_table;
+
+static void
+fuid_table_destroy(void)
+{
+	if (fuid_table_loaded) {
+		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
+		fuid_table_loaded = B_FALSE;
+	}
+}
+
+/*
+ * print uid or gid information.
+ * For normal POSIX id just the id is printed in decimal format.
+ * For CIFS files with FUID the fuid is printed in hex followed by
+ * the domain-rid string.
+ */
+static void
+print_idstr(uint64_t id, const char *id_type)
+{
+	if (FUID_INDEX(id)) {
+		char *domain;
+
+		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
+		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
+		    (u_longlong_t)id, domain, (int)FUID_RID(id));
+	} else {
+		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
+	}
+
+}
+
+static void
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
+{
+	uint32_t uid_idx, gid_idx;
+
+	uid_idx = FUID_INDEX(uid);
+	gid_idx = FUID_INDEX(gid);
+
+	/* Load domain table, if not already loaded */
+	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
+		uint64_t fuid_obj;
+
+		/* first find the fuid object.  It lives in the master node */
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
+		    8, 1, &fuid_obj) == 0);
+		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
+		(void) zfs_fuid_table_load(os, fuid_obj,
+		    &idx_tree, &domain_tree);
+		fuid_table_loaded = B_TRUE;
+	}
+
+	print_idstr(uid, "uid");
+	print_idstr(gid, "gid");
+}
+
+static void
+dump_znode_sa_xattr(sa_handle_t *hdl)
+{
+	nvlist_t *sa_xattr;
+	nvpair_t *elem = NULL;
+	int sa_xattr_size = 0;
+	int sa_xattr_entries = 0;
+	int error;
+	char *sa_xattr_packed;
+
+	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
+	if (error || sa_xattr_size == 0)
+		return;
+
+	sa_xattr_packed = malloc(sa_xattr_size);
+	if (sa_xattr_packed == NULL)
+		return;
+
+	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
+	    sa_xattr_packed, sa_xattr_size);
+	if (error) {
+		free(sa_xattr_packed);
+		return;
+	}
+
+	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
+	if (error) {
+		free(sa_xattr_packed);
+		return;
+	}
+
+	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
+		sa_xattr_entries++;
+
+	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
+	    sa_xattr_size, sa_xattr_entries);
+	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
+		uchar_t *value;
+		uint_t cnt, idx;
+
+		(void) printf("\t\t%s = ", nvpair_name(elem));
+		nvpair_value_byte_array(elem, &value, &cnt);
+		for (idx = 0; idx < cnt; ++idx) {
+			if (isprint(value[idx]))
+				(void) putchar(value[idx]);
+			else
+				(void) printf("\\%3.3o", value[idx]);
+		}
+		(void) putchar('\n');
+	}
+
+	nvlist_free(sa_xattr);
+	free(sa_xattr_packed);
+}
+
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	sa_handle_t *hdl;
+	uint64_t xattr, rdev, gen;
+	uint64_t uid, gid, mode, fsize, parent, links;
+	uint64_t pflags;
+	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+	time_t z_crtime, z_atime, z_mtime, z_ctime;
+	sa_bulk_attr_t bulk[12];
+	int idx = 0;
+	int error;
+
+	if (!sa_loaded) {
+		uint64_t sa_attrs = 0;
+		uint64_t version;
+
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+		    8, 1, &version) == 0);
+		if (version >= ZPL_VERSION_SA) {
+			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+			    8, 1, &sa_attrs) == 0);
+		}
+		if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
+		    ZPL_END, &sa_attr_table)) != 0) {
+			(void) printf("sa_setup failed errno %d, can't "
+			    "display znode contents\n", error);
+			return;
+		}
+		sa_loaded = B_TRUE;
+	}
+
+	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+		(void) printf("Failed to get handle for SA znode\n");
+		return;
+	}
+
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+	    &links, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+	    &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+	    NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+	    &fsize, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+	    acctm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+	    modtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+	    crtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+	    chgtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+	    &pflags, 8);
+
+	if (sa_bulk_lookup(hdl, bulk, idx)) {
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
+
+	error = zfs_obj_to_path(os, object, path, sizeof (path));
+	if (error != 0) {
+		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
+		    (u_longlong_t)object);
+	}
+	if (dump_opt['d'] < 3) {
+		(void) printf("\t%s\n", path);
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
+
+	z_crtime = (time_t)crtm[0];
+	z_atime = (time_t)acctm[0];
+	z_mtime = (time_t)modtm[0];
+	z_ctime = (time_t)chgtm[0];
+
+	(void) printf("\tpath	%s\n", path);
+	dump_uidgid(os, uid, gid);
+	(void) printf("\tatime	%s", ctime(&z_atime));
+	(void) printf("\tmtime	%s", ctime(&z_mtime));
+	(void) printf("\tctime	%s", ctime(&z_ctime));
+	(void) printf("\tcrtime	%s", ctime(&z_crtime));
+	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
+	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
+	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
+	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
+	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
+	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
+	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
+	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
+	dump_znode_sa_xattr(hdl);
+	sa_handle_destroy(hdl);
+}
+
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
+	dump_none,		/* unallocated			*/
+	dump_zap,		/* object directory		*/
+	dump_uint64,		/* object array			*/
+	dump_none,		/* packed nvlist		*/
+	dump_packed_nvlist,	/* packed nvlist size		*/
+	dump_none,		/* bpobj			*/
+	dump_bpobj,		/* bpobj header			*/
+	dump_none,		/* SPA space map header		*/
+	dump_none,		/* SPA space map		*/
+	dump_none,		/* ZIL intent log		*/
+	dump_dnode,		/* DMU dnode			*/
+	dump_dmu_objset,	/* DMU objset			*/
+	dump_dsl_dir,		/* DSL directory		*/
+	dump_zap,		/* DSL directory child map	*/
+	dump_zap,		/* DSL dataset snap map		*/
+	dump_zap,		/* DSL props			*/
+	dump_dsl_dataset,	/* DSL dataset			*/
+	dump_znode,		/* ZFS znode			*/
+	dump_acl,		/* ZFS V0 ACL			*/
+	dump_uint8,		/* ZFS plain file		*/
+	dump_zpldir,		/* ZFS directory		*/
+	dump_zap,		/* ZFS master node		*/
+	dump_zap,		/* ZFS delete queue		*/
+	dump_uint8,		/* zvol object			*/
+	dump_zap,		/* zvol prop			*/
+	dump_uint8,		/* other uint8[]		*/
+	dump_uint64,		/* other uint64[]		*/
+	dump_zap,		/* other ZAP			*/
+	dump_zap,		/* persistent error log		*/
+	dump_uint8,		/* SPA history			*/
+	dump_history_offsets,	/* SPA history offsets		*/
+	dump_zap,		/* Pool properties		*/
+	dump_zap,		/* DSL permissions		*/
+	dump_acl,		/* ZFS ACL			*/
+	dump_uint8,		/* ZFS SYSACL			*/
+	dump_none,		/* FUID nvlist			*/
+	dump_packed_nvlist,	/* FUID nvlist size		*/
+	dump_zap,		/* DSL dataset next clones	*/
+	dump_zap,		/* DSL scrub queue		*/
+	dump_zap,		/* ZFS user/group used		*/
+	dump_zap,		/* ZFS user/group quota		*/
+	dump_zap,		/* snapshot refcount tags	*/
+	dump_ddt_zap,		/* DDT ZAP object		*/
+	dump_zap,		/* DDT statistics		*/
+	dump_znode,		/* SA object			*/
+	dump_zap,		/* SA Master Node		*/
+	dump_sa_attrs,		/* SA attribute registration	*/
+	dump_sa_layouts,	/* SA attribute layouts		*/
+	dump_zap,		/* DSL scrub translations	*/
+	dump_none,		/* fake dedup BP		*/
+	dump_zap,		/* deadlist			*/
+	dump_none,		/* deadlist hdr			*/
+	dump_zap,		/* dsl clones			*/
+	dump_bpobj_subobjs,	/* bpobj subobjs		*/
+	dump_unknown,		/* Unknown type, must be last	*/
+};
+
+static void
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
+{
+	dmu_buf_t *db = NULL;
+	dmu_object_info_t doi;
+	dnode_t *dn;
+	void *bonus = NULL;
+	size_t bsize = 0;
+	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+	char bonus_size[32];
+	char aux[50];
+	int error;
+
+	if (*print_header) {
+		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
+		    "%full", "type");
+		*print_header = 0;
+	}
+
+	if (object == 0) {
+		dn = DMU_META_DNODE(os);
+	} else {
+		error = dmu_bonus_hold(os, object, FTAG, &db);
+		if (error)
+			fatal("dmu_bonus_hold(%llu) failed, errno %u",
+			    object, error);
+		bonus = db->db_data;
+		bsize = db->db_size;
+		dn = DB_DNODE((dmu_buf_impl_t *)db);
+	}
+	dmu_object_info_from_dnode(dn, &doi);
+
+	zdb_nicenum(doi.doi_metadata_block_size, iblk);
+	zdb_nicenum(doi.doi_data_block_size, dblk);
+	zdb_nicenum(doi.doi_max_offset, lsize);
+	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
+	zdb_nicenum(doi.doi_bonus_size, bonus_size);
+	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+	    doi.doi_max_offset);
+
+	aux[0] = '\0';
+
+	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
+		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
+		    ZDB_CHECKSUM_NAME(doi.doi_checksum));
+	}
+
+	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
+		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
+		    ZDB_COMPRESS_NAME(doi.doi_compress));
+	}
+
+	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
+	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
+
+	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
+		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
+		    "", "", "", "", "", bonus_size, "bonus",
+		    ZDB_OT_NAME(doi.doi_bonus_type));
+	}
+
+	if (verbosity >= 4) {
+		(void) printf("\tdnode flags: %s%s%s\n",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
+		    "USED_BYTES " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
+		    "USERUSED_ACCOUNTED " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+		    "SPILL_BLKPTR" : "");
+		(void) printf("\tdnode maxblkid: %llu\n",
+		    (longlong_t)dn->dn_phys->dn_maxblkid);
+
+		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
+		    bonus, bsize);
+		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
+		*print_header = 1;
+	}
+
+	if (verbosity >= 5)
+		dump_indirect(dn);
+
+	if (verbosity >= 5) {
+		/*
+		 * Report the list of segments that comprise the object.
+		 */
+		uint64_t start = 0;
+		uint64_t end;
+		uint64_t blkfill = 1;
+		int minlvl = 1;
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			minlvl = 0;
+			blkfill = DNODES_PER_BLOCK;
+		}
+
+		for (;;) {
+			char segsize[32];
+			error = dnode_next_offset(dn,
+			    0, &start, minlvl, blkfill, 0);
+			if (error)
+				break;
+			end = start;
+			error = dnode_next_offset(dn,
+			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
+			zdb_nicenum(end - start, segsize);
+			(void) printf("\t\tsegment [%016llx, %016llx)"
+			    " size %5s\n", (u_longlong_t)start,
+			    (u_longlong_t)end, segsize);
+			if (error)
+				break;
+			start = end;
+		}
+	}
+
+	if (db != NULL)
+		dmu_buf_rele(db, FTAG);
+}
+
+static char *objset_types[DMU_OST_NUMTYPES] = {
+	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+static void
+dump_dir(objset_t *os)
+{
+	dmu_objset_stats_t dds;
+	uint64_t object, object_count;
+	uint64_t refdbytes, usedobjs, scratch;
+	char numbuf[32];
+	char blkbuf[BP_SPRINTF_LEN + 20];
+	char osname[MAXNAMELEN];
+	char *type = "UNKNOWN";
+	int verbosity = dump_opt['d'];
+	int print_header = 1;
+	int i, error;
+
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+
+	if (dds.dds_type < DMU_OST_NUMTYPES)
+		type = objset_types[dds.dds_type];
+
+	if (dds.dds_type == DMU_OST_META) {
+		dds.dds_creation_txg = TXG_INITIAL;
+		usedobjs = BP_GET_FILL(os->os_rootbp);
+		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+		    dd_used_bytes;
+	} else {
+		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
+	}
+
+	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
+
+	zdb_nicenum(refdbytes, numbuf);
+
+	if (verbosity >= 4) {
+		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
+		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
+		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
+	} else {
+		blkbuf[0] = '\0';
+	}
+
+	dmu_objset_name(os, osname);
+
+	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
+	    "%s, %llu objects%s\n",
+	    osname, type, (u_longlong_t)dmu_objset_id(os),
+	    (u_longlong_t)dds.dds_creation_txg,
+	    numbuf, (u_longlong_t)usedobjs, blkbuf);
+
+	if (zopt_objects != 0) {
+		for (i = 0; i < zopt_objects; i++)
+			dump_object(os, zopt_object[i], verbosity,
+			    &print_header);
+		(void) printf("\n");
+		return;
+	}
+
+	if (dump_opt['i'] != 0 || verbosity >= 2)
+		dump_intent_log(dmu_objset_zil(os));
+
+	if (dmu_objset_ds(os) != NULL)
+		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
+
+	if (verbosity < 2)
+		return;
+
+	if (BP_IS_HOLE(os->os_rootbp))
+		return;
+
+	dump_object(os, 0, verbosity, &print_header);
+	object_count = 0;
+	if (DMU_USERUSED_DNODE(os) != NULL &&
+	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
+		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
+		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
+	}
+
+	object = 0;
+	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+		dump_object(os, object, verbosity, &print_header);
+		object_count++;
+	}
+
+	ASSERT3U(object_count, ==, usedobjs);
+
+	(void) printf("\n");
+
+	if (error != ESRCH) {
+		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
+		abort();
+	}
+}
+
+static void
+dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
+{
+	time_t timestamp = ub->ub_timestamp;
+
+	(void) printf("%s", header ? header : "");
+	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
+	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
+	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
+	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
+	(void) printf("\ttimestamp = %llu UTC = %s",
+	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
+	if (dump_opt['u'] >= 3) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
+		(void) printf("\trootbp = %s\n", blkbuf);
+	}
+	(void) printf("%s", footer ? footer : "");
+}
+
+static void
+dump_config(spa_t *spa)
+{
+	dmu_buf_t *db;
+	size_t nvsize = 0;
+	int error = 0;
+
+
+	error = dmu_bonus_hold(spa->spa_meta_objset,
+	    spa->spa_config_object, FTAG, &db);
+
+	if (error == 0) {
+		nvsize = *(uint64_t *)db->db_data;
+		dmu_buf_rele(db, FTAG);
+
+		(void) printf("\nMOS Configuration:\n");
+		dump_packed_nvlist(spa->spa_meta_objset,
+		    spa->spa_config_object, (void *)&nvsize, 1);
+	} else {
+		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
+		    (u_longlong_t)spa->spa_config_object, error);
+	}
+}
+
+static void
+dump_cachefile(const char *cachefile)
+{
+	int fd;
+	struct stat64 statbuf;
+	char *buf;
+	nvlist_t *config;
+
+	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", cachefile,
+		    strerror(errno));
+		exit(1);
+	}
+
+	if (fstat64(fd, &statbuf) != 0) {
+		(void) printf("failed to stat '%s': %s\n", cachefile,
+		    strerror(errno));
+		exit(1);
+	}
+
+	if ((buf = malloc(statbuf.st_size)) == NULL) {
+		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
+		    (u_longlong_t)statbuf.st_size);
+		exit(1);
+	}
+
+	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+		(void) fprintf(stderr, "failed to read %llu bytes\n",
+		    (u_longlong_t)statbuf.st_size);
+		exit(1);
+	}
+
+	(void) close(fd);
+
+	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
+		(void) fprintf(stderr, "failed to unpack nvlist\n");
+		exit(1);
+	}
+
+	free(buf);
+
+	dump_nvlist(config, 0);
+
+	nvlist_free(config);
+}
+
+#define	ZDB_MAX_UB_HEADER_SIZE 32
+
+static void
+dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
+{
+	vdev_t vd;
+	vdev_t *vdp = &vd;
+	char header[ZDB_MAX_UB_HEADER_SIZE];
+	int i;
+
+	vd.vdev_ashift = ashift;
+	vdp->vdev_top = vdp;
+
+	for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
+		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
+		uberblock_t *ub = (void *)((char *)lbl + uoff);
+
+		if (uberblock_verify(ub))
+			continue;
+		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+		    "Uberblock[%d]\n", i);
+		dump_uberblock(ub, header, "");
+	}
+}
+
+static void
+dump_label(const char *dev)
+{
+	int fd;
+	vdev_label_t label;
+	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
+	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
+	struct stat64 statbuf;
+	uint64_t psize, ashift;
+	int len = strlen(dev) + 1;
+	int l;
+
+	if (strncmp(dev, "/dev/dsk/", 9) == 0) {
+		len++;
+		path = malloc(len);
+		(void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
+	} else {
+		path = strdup(dev);
+	}
+
+	if ((fd = open64(path, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
+		free(path);
+		exit(1);
+	}
+
+	if (fstat64_blk(fd, &statbuf) != 0) {
+		(void) printf("failed to stat '%s': %s\n", path,
+		    strerror(errno));
+		free(path);
+		(void) close(fd);
+		exit(1);
+	}
+
+	psize = statbuf.st_size;
+	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		nvlist_t *config = NULL;
+
+		(void) printf("--------------------------------------------\n");
+		(void) printf("LABEL %d\n", l);
+		(void) printf("--------------------------------------------\n");
+
+		if (pread64(fd, &label, sizeof (label),
+		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
+			(void) printf("failed to read label %d\n", l);
+			continue;
+		}
+
+		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
+			(void) printf("failed to unpack label %d\n", l);
+			ashift = SPA_MINBLOCKSHIFT;
+		} else {
+			nvlist_t *vdev_tree = NULL;
+
+			dump_nvlist(config, 4);
+			if ((nvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+			    (nvlist_lookup_uint64(vdev_tree,
+			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+				ashift = SPA_MINBLOCKSHIFT;
+			nvlist_free(config);
+		}
+		if (dump_opt['u'])
+			dump_label_uberblocks(&label, ashift);
+	}
+
+	free(path);
+	(void) close(fd);
+}
+
+static uint64_t dataset_feature_count[SPA_FEATURES];
+
+/*ARGSUSED*/
+static int
+dump_one_dir(const char *dsname, void *arg)
+{
+	int error;
+	objset_t *os;
+	spa_feature_t f;
+
+	error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
+	if (error) {
+		(void) printf("Could not open %s, error %d\n", dsname, error);
+		return (0);
+	}
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (!dmu_objset_ds(os)->ds_feature_inuse[f])
+			continue;
+		ASSERT(spa_feature_table[f].fi_flags &
+		    ZFEATURE_FLAG_PER_DATASET);
+		dataset_feature_count[f]++;
+	}
+
+	dump_dir(os);
+	dmu_objset_disown(os, FTAG);
+	fuid_table_destroy();
+	sa_loaded = B_FALSE;
+	return (0);
+}
+
+/*
+ * Block statistics.
+ */
+#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
+typedef struct zdb_blkstats {
+	uint64_t zb_asize;
+	uint64_t zb_lsize;
+	uint64_t zb_psize;
+	uint64_t zb_count;
+	uint64_t zb_gangs;
+	uint64_t zb_ditto_samevdev;
+	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
+#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
+#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
+#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
+
+static char *zdb_ot_extname[] = {
+	"deferred free",
+	"dedup ditto",
+	"other",
+	"Total",
+};
+
+#define	ZB_TOTAL	DN_MAX_LEVELS
+
+typedef struct zdb_cb {
+	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+	uint64_t	zcb_dedup_asize;
+	uint64_t	zcb_dedup_blocks;
+	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+	    [BPE_PAYLOAD_SIZE];
+	uint64_t	zcb_start;
+	uint64_t	zcb_lastprint;
+	uint64_t	zcb_totalasize;
+	uint64_t	zcb_errors[256];
+	int		zcb_readfails;
+	int		zcb_haderrors;
+	spa_t		*zcb_spa;
+} zdb_cb_t;
+
+static void
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+    dmu_object_type_t type)
+{
+	uint64_t refcnt = 0;
+	int i;
+
+	ASSERT(type < ZDB_OT_TOTAL);
+
+	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+		int t = (i & 1) ? type : ZDB_OT_TOTAL;
+		int equal;
+		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_count++;
+
+		/*
+		 * The histogram is only big enough to record blocks up to
+		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+		 * "other", bucket.
+		 */
+		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+		zb->zb_psize_histogram[idx]++;
+
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal != 0)
+				zb->zb_ditto_samevdev++;
+			break;
+		}
+
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+		    [BPE_GET_PSIZE(bp)]++;
+		return;
+	}
+
+	if (dump_opt['L'])
+		return;
+
+	if (BP_GET_DEDUP(bp)) {
+		ddt_t *ddt;
+		ddt_entry_t *dde;
+
+		ddt = ddt_select(zcb->zcb_spa, bp);
+		ddt_enter(ddt);
+		dde = ddt_lookup(ddt, bp, B_FALSE);
+
+		if (dde == NULL) {
+			refcnt = 0;
+		} else {
+			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+			ddt_phys_decref(ddp);
+			refcnt = ddp->ddp_refcnt;
+			if (ddt_phys_total_refcnt(dde) == 0)
+				ddt_remove(ddt, dde);
+		}
+		ddt_exit(ddt);
+	}
+
+	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
+static void
+zdb_blkptr_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	int ioerr = zio->io_error;
+	zdb_cb_t *zcb = zio->io_private;
+	zbookmark_phys_t *zb = &zio->io_bookmark;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		char blkbuf[BP_SPRINTF_LEN];
+
+		zcb->zcb_haderrors = 1;
+		zcb->zcb_errors[ioerr]++;
+
+		if (dump_opt['b'] >= 2)
+			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		else
+			blkbuf[0] = '\0';
+
+		(void) printf("zdb_blkptr_cb: "
+		    "Got error %d reading "
+		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+		    ioerr,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (u_longlong_t)zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid,
+		    blkbuf);
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	zdb_cb_t *zcb = arg;
+	dmu_object_type_t type;
+	boolean_t is_metadata;
+
+	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("objset %llu object %llu "
+		    "level %lld offset 0x%llx %s\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level,
+		    (u_longlong_t)blkid2offset(dnp, bp, zb),
+		    blkbuf);
+	}
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	type = BP_GET_TYPE(bp);
+
+	zdb_count_block(zcb, zilog, bp,
+	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
+
+	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
+
+	if (!BP_IS_EMBEDDED(bp) &&
+	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
+		size_t size = BP_GET_PSIZE(bp);
+		void *data = zio_data_buf_alloc(size);
+		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+		/* If it's an intent log block, failure is expected. */
+		if (zb->zb_level == ZB_ZIL_LEVEL)
+			flags |= ZIO_FLAG_SPECULATIVE;
+
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > max_inflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+	}
+
+	zcb->zcb_readfails = 0;
+
+	/* only call gethrtime() every 100 blocks */
+	static int iters;
+	if (++iters > 100)
+		iters = 0;
+	else
+		return (0);
+
+	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+		uint64_t now = gethrtime();
+		char buf[10];
+		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
+		int kb_per_sec =
+		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
+		int sec_remaining =
+		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
+
+		zfs_nicenum(bytes, buf, sizeof (buf));
+		(void) fprintf(stderr,
+		    "\r%5s completed (%4dMB/s) "
+		    "estimated time remaining: %uhr %02umin %02usec        ",
+		    buf, kb_per_sec / 1024,
+		    sec_remaining / 60 / 60,
+		    sec_remaining / 60 % 60,
+		    sec_remaining % 60);
+
+		zcb->zcb_lastprint = now;
+	}
+
+	return (0);
+}
+
+static void
+zdb_leak(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_t *vd = arg;
+
+	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+}
+
+static metaslab_ops_t zdb_metaslab_ops = {
+	NULL	/* alloc */
+};
+
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	ddt_bookmark_t ddb = { 0 };
+	ddt_entry_t dde;
+	int error;
+	int p;
+
+	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+		blkptr_t blk;
+		ddt_phys_t *ddp = dde.dde_phys;
+
+		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+			return;
+
+		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+			if (ddp->ddp_phys_birth == 0)
+				continue;
+			ddt_bp_create(ddb.ddb_checksum,
+			    &dde.dde_key, ddp, &blk);
+			if (p == DDT_PHYS_DITTO) {
+				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+			} else {
+				zcb->zcb_dedup_asize +=
+				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+				zcb->zcb_dedup_blocks++;
+			}
+		}
+		if (!dump_opt['L']) {
+			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+			ddt_enter(ddt);
+			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+			ddt_exit(ddt);
+		}
+	}
+
+	ASSERT(error == ENOENT);
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	zcb->zcb_spa = spa;
+	uint64_t c, m;
+
+	if (!dump_opt['L']) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		for (c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *vd = rvd->vdev_child[c];
+			for (m = 0; m < vd->vdev_ms_count; m++) {
+				metaslab_t *msp = vd->vdev_ms[m];
+				mutex_enter(&msp->ms_lock);
+				metaslab_unload(msp);
+
+				/*
+				 * For leak detection, we overload the metaslab
+				 * ms_tree to contain allocated segments
+				 * instead of free segments. As a result,
+				 * we can't use the normal metaslab_load/unload
+				 * interfaces.
+				 */
+				if (msp->ms_sm != NULL) {
+					(void) fprintf(stderr,
+					    "\rloading space map for "
+					    "vdev %llu of %llu, "
+					    "metaslab %llu of %llu ...",
+					    (longlong_t)c,
+					    (longlong_t)rvd->vdev_children,
+					    (longlong_t)m,
+					    (longlong_t)vd->vdev_ms_count);
+
+					msp->ms_ops = &zdb_metaslab_ops;
+
+					/*
+					 * We don't want to spend the CPU
+					 * manipulating the size-ordered
+					 * tree, so clear the range_tree
+					 * ops.
+					 */
+					msp->ms_tree->rt_ops = NULL;
+					VERIFY0(space_map_load(msp->ms_sm,
+					    msp->ms_tree, SM_ALLOC));
+					msp->ms_loaded = B_TRUE;
+				}
+				mutex_exit(&msp->ms_lock);
+			}
+		}
+		(void) fprintf(stderr, "\n");
+	}
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	zdb_ddt_leak_init(spa, zcb);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static void
+zdb_leak_fini(spa_t *spa)
+{
+	int c, m;
+
+	if (!dump_opt['L']) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		for (c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *vd = rvd->vdev_child[c];
+			for (m = 0; m < vd->vdev_ms_count; m++) {
+				metaslab_t *msp = vd->vdev_ms[m];
+				mutex_enter(&msp->ms_lock);
+
+				/*
+				 * The ms_tree has been overloaded to
+				 * contain allocated segments. Now that we
+				 * finished traversing all blocks, any
+				 * block that remains in the ms_tree
+				 * represents an allocated block that we
+				 * did not claim during the traversal.
+				 * Claimed blocks would have been removed
+				 * from the ms_tree.
+				 */
+				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
+				msp->ms_loaded = B_FALSE;
+
+				mutex_exit(&msp->ms_lock);
+			}
+		}
+	}
+}
+
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zdb_cb_t *zcb = arg;
+
+	if (dump_opt['b'] >= 5) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("[%s] %s\n",
+		    "deferred free", blkbuf);
+	}
+	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+	return (0);
+}
+
+static int
+dump_block_stats(spa_t *spa)
+{
+	zdb_cb_t zcb;
+	zdb_blkstats_t *zb, *tzb;
+	uint64_t norm_alloc, norm_space, total_alloc, total_found;
+	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
+	boolean_t leaks = B_FALSE;
+	int e, c;
+	bp_embedded_type_t i;
+
+	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
+	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+	    (dump_opt['c'] == 1) ? "metadata " : "",
+	    dump_opt['c'] ? "checksums " : "",
+	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+	    !dump_opt['L'] ? "nothing leaked " : "");
+
+	/*
+	 * Load all space maps as SM_ALLOC maps, then traverse the pool
+	 * claiming each block we discover.  If the pool is perfectly
+	 * consistent, the space maps will be empty when we're done.
+	 * Anything left over is a leak; any block we can't claim (because
+	 * it's not part of any space map) is a double allocation,
+	 * reference to a freed block, or an unclaimed log block.
+	 */
+	bzero(&zcb, sizeof (zdb_cb_t));
+	zdb_leak_init(spa, &zcb);
+
+	/*
+	 * If there's a deferred-free bplist, process that first.
+	 */
+	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+	    count_block_cb, &zcb, NULL);
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+		    count_block_cb, &zcb, NULL);
+	}
+	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+		    &zcb, NULL));
+	}
+
+	if (dump_opt['c'] > 1)
+		flags |= TRAVERSE_PREFETCH_DATA;
+
+	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
+	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
+
+	/*
+	 * If we've traversed the data blocks then we need to wait for those
+	 * I/Os to complete. We leverage "The Godfather" zio to wait on
+	 * all async I/Os to complete.
+	 */
+	if (dump_opt['c']) {
+		for (c = 0; c < max_ncpus; c++) {
+			(void) zio_wait(spa->spa_async_zio_root[c]);
+			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+			    ZIO_FLAG_GODFATHER);
+		}
+	}
+
+	if (zcb.zcb_haderrors) {
+		(void) printf("\nError counts:\n\n");
+		(void) printf("\t%5s  %s\n", "errno", "count");
+		for (e = 0; e < 256; e++) {
+			if (zcb.zcb_errors[e] != 0) {
+				(void) printf("\t%5d  %llu\n",
+				    e, (u_longlong_t)zcb.zcb_errors[e]);
+			}
+		}
+	}
+
+	/*
+	 * Report any leaked segments.
+	 */
+	zdb_leak_fini(spa);
+
+	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
+
+	if (total_found == total_alloc) {
+		if (!dump_opt['L'])
+			(void) printf("\n\tNo leaks (block sum matches space"
+			    " maps exactly)\n");
+	} else {
+		(void) printf("block traversal size %llu != alloc %llu "
+		    "(%s %lld)\n",
+		    (u_longlong_t)total_found,
+		    (u_longlong_t)total_alloc,
+		    (dump_opt['L']) ? "unreachable" : "leaked",
+		    (longlong_t)(total_alloc - total_found));
+		leaks = B_TRUE;
+	}
+
+	if (tzb->zb_count == 0)
+		return (2);
+
+	(void) printf("\n");
+	(void) printf("\tbp count:      %10llu\n",
+	    (u_longlong_t)tzb->zb_count);
+	(void) printf("\tganged count:  %10llu\n",
+	    (longlong_t)tzb->zb_gangs);
+	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
+	    (u_longlong_t)tzb->zb_lsize,
+	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+	(void) printf("\tbp physical:   %10llu      avg:"
+	    " %6llu     compression: %6.2f\n",
+	    (u_longlong_t)tzb->zb_psize,
+	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_psize);
+	(void) printf("\tbp allocated:  %10llu      avg:"
+	    " %6llu     compression: %6.2f\n",
+	    (u_longlong_t)tzb->zb_asize,
+	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_asize);
+	(void) printf("\tbp deduped:    %10llu    ref>1:"
+	    " %6llu   deduplication: %6.2f\n",
+	    (u_longlong_t)zcb.zcb_dedup_asize,
+	    (u_longlong_t)zcb.zcb_dedup_blocks,
+	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
+	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+
+	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+		if (zcb.zcb_embedded_blocks[i] == 0)
+			continue;
+		(void) printf("\n");
+		(void) printf("\tadditional, non-pointer bps of type %u: "
+		    "%10llu\n",
+		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+		if (dump_opt['b'] >= 3) {
+			(void) printf("\t number of (compressed) bytes:  "
+			    "number of bps\n");
+			dump_histogram(zcb.zcb_embedded_histogram[i],
+			    sizeof (zcb.zcb_embedded_histogram[i]) /
+			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+		}
+	}
+
+	if (tzb->zb_ditto_samevdev != 0) {
+		(void) printf("\tDittoed blocks on same vdev: %llu\n",
+		    (longlong_t)tzb->zb_ditto_samevdev);
+	}
+
+	if (dump_opt['b'] >= 2) {
+		int l, t, level;
+		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+		    "\t  avg\t comp\t%%Total\tType\n");
+
+		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+			char csize[32], lsize[32], psize[32], asize[32];
+			char avg[32], gang[32];
+			char *typename;
+
+			if (t < DMU_OT_NUMTYPES)
+				typename = dmu_ot[t].ot_name;
+			else
+				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
+
+			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+				(void) printf("%6s\t%5s\t%5s\t%5s"
+				    "\t%5s\t%5s\t%6s\t%s\n",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    typename);
+				continue;
+			}
+
+			for (l = ZB_TOTAL - 1; l >= -1; l--) {
+				level = (l == -1 ? ZB_TOTAL : l);
+				zb = &zcb.zcb_type[level][t];
+
+				if (zb->zb_asize == 0)
+					continue;
+
+				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+					continue;
+
+				if (level == 0 && zb->zb_asize ==
+				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+					continue;
+
+				zdb_nicenum(zb->zb_count, csize);
+				zdb_nicenum(zb->zb_lsize, lsize);
+				zdb_nicenum(zb->zb_psize, psize);
+				zdb_nicenum(zb->zb_asize, asize);
+				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
+				zdb_nicenum(zb->zb_gangs, gang);
+
+				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+				    "\t%5.2f\t%6.2f\t",
+				    csize, lsize, psize, asize, avg,
+				    (double)zb->zb_lsize / zb->zb_psize,
+				    100.0 * zb->zb_asize / tzb->zb_asize);
+
+				if (level == ZB_TOTAL)
+					(void) printf("%s\n", typename);
+				else
+					(void) printf("    L%d %s\n",
+					    level, typename);
+
+				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+					(void) printf("\t number of ganged "
+					    "blocks: %s\n", gang);
+				}
+
+				if (dump_opt['b'] >= 4) {
+					(void) printf("psize "
+					    "(in 512-byte sectors): "
+					    "number of blocks\n");
+					dump_histogram(zb->zb_psize_histogram,
+					    PSIZE_HISTO_SIZE, 0);
+				}
+			}
+		}
+	}
+
+	(void) printf("\n");
+
+	if (leaks)
+		return (2);
+
+	if (zcb.zcb_haderrors)
+		return (3);
+
+	return (0);
+}
+
+typedef struct zdb_ddt_entry {
+	ddt_key_t	zdde_key;
+	uint64_t	zdde_ref_blocks;
+	uint64_t	zdde_ref_lsize;
+	uint64_t	zdde_ref_psize;
+	uint64_t	zdde_ref_dsize;
+	avl_node_t	zdde_node;
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	avl_tree_t *t = arg;
+	avl_index_t where;
+	zdb_ddt_entry_t *zdde, zdde_search;
+
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+
+	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+		(void) printf("traversing objset %llu, %llu objects, "
+		    "%lu blocks so far\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    avl_numnodes(t));
+	}
+
+	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+		return (0);
+
+	ddt_key_fill(&zdde_search.zdde_key, bp);
+
+	zdde = avl_find(t, &zdde_search, &where);
+
+	if (zdde == NULL) {
+		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+		zdde->zdde_key = zdde_search.zdde_key;
+		avl_insert(t, zdde, where);
+	}
+
+	zdde->zdde_ref_blocks += 1;
+	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+	return (0);
+}
+
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+	avl_tree_t t;
+	void *cookie = NULL;
+	zdb_ddt_entry_t *zdde;
+	ddt_histogram_t ddh_total;
+	ddt_stat_t dds_total;
+
+	bzero(&ddh_total, sizeof (ddt_histogram_t));
+	bzero(&dds_total, sizeof (ddt_stat_t));
+
+	avl_create(&t, ddt_entry_compare,
+	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+	    zdb_ddt_add_cb, &t);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+		ddt_stat_t dds;
+		uint64_t refcnt = zdde->zdde_ref_blocks;
+		ASSERT(refcnt != 0);
+
+		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+		dds.dds_ref_psize = zdde->zdde_ref_psize;
+		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+		    &dds, 0);
+
+		umem_free(zdde, sizeof (*zdde));
+	}
+
+	avl_destroy(&t);
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	(void) printf("Simulated DDT histogram:\n");
+
+	zpool_dump_ddt(&dds_total, &ddh_total);
+
+	dump_dedup_ratio(&dds_total);
+}
+
+static void
+dump_zpool(spa_t *spa)
+{
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	int rc = 0;
+
+	if (dump_opt['S']) {
+		dump_simulated_ddt(spa);
+		return;
+	}
+
+	if (!dump_opt['e'] && dump_opt['C'] > 1) {
+		(void) printf("\nCached configuration:\n");
+		dump_nvlist(spa->spa_config, 8);
+	}
+
+	if (dump_opt['C'])
+		dump_config(spa);
+
+	if (dump_opt['u'])
+		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
+
+	if (dump_opt['D'])
+		dump_all_ddts(spa);
+
+	if (dump_opt['d'] > 2 || dump_opt['m'])
+		dump_metaslabs(spa);
+	if (dump_opt['M'])
+		dump_metaslab_groups(spa);
+
+	if (dump_opt['d'] || dump_opt['i']) {
+		spa_feature_t f;
+
+		dump_dir(dp->dp_meta_objset);
+		if (dump_opt['d'] >= 3) {
+			dump_full_bpobj(&spa->spa_deferred_bpobj,
+			    "Deferred frees", 0);
+			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+				dump_full_bpobj(
+				    &spa->spa_dsl_pool->dp_free_bpobj,
+				    "Pool snapshot frees", 0);
+			}
+
+			if (spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY)) {
+				dump_bptree(spa->spa_meta_objset,
+				    spa->spa_dsl_pool->dp_bptree_obj,
+				    "Pool dataset frees");
+			}
+			dump_dtl(spa->spa_root_vdev, 0);
+		}
+		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
+		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		for (f = 0; f < SPA_FEATURES; f++) {
+			uint64_t refcount;
+
+			if (!(spa_feature_table[f].fi_flags &
+			    ZFEATURE_FLAG_PER_DATASET)) {
+				ASSERT0(dataset_feature_count[f]);
+				continue;
+			}
+			if (feature_get_refcount(spa, &spa_feature_table[f],
+			    &refcount) == ENOTSUP)
+				continue;
+			if (dataset_feature_count[f] != refcount) {
+				(void) printf("%s feature refcount mismatch: "
+				    "%lld datasets != %lld refcount\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)dataset_feature_count[f],
+				    (longlong_t)refcount);
+				rc = 2;
+			} else {
+				(void) printf("Verified %s feature refcount "
+				    "of %llu is correct\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)refcount);
+			}
+		}
+	}
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
+		rc = dump_block_stats(spa);
+
+	if (rc == 0)
+		rc = verify_spacemap_refcounts(spa);
+
+	if (dump_opt['s'])
+		show_pool_stats(spa);
+
+	if (dump_opt['h'])
+		dump_history(spa);
+
+	if (rc != 0)
+		exit(rc);
+}
+
+#define	ZDB_FLAG_CHECKSUM	0x0001
+#define	ZDB_FLAG_DECOMPRESS	0x0002
+#define	ZDB_FLAG_BSWAP		0x0004
+#define	ZDB_FLAG_GBH		0x0008
+#define	ZDB_FLAG_INDIRECT	0x0010
+#define	ZDB_FLAG_PHYS		0x0020
+#define	ZDB_FLAG_RAW		0x0040
+#define	ZDB_FLAG_PRINT_BLKPTR	0x0080
+
+int flagbits[256];
+
+static void
+zdb_print_blkptr(blkptr_t *bp, int flags)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+	(void) printf("%s\n", blkbuf);
+}
+
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	int i;
+
+	for (i = 0; i < nbps; i++)
+		zdb_print_blkptr(&bp[i], flags);
+}
+
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array(buf, size);
+	VERIFY(write(fileno(stdout), buf, size) == size);
+}
+
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	int nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	int i, j;
+	char *hdr, *c;
+
+
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
+
+#ifdef _LITTLE_ENDIAN
+	/* correct the endianess */
+	do_bswap = !do_bswap;
+#endif
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx:  %016llx  %016llx  ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		c = (char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*    - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the heirarchy.  For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, char *path)
+{
+	char *s, *p, *q;
+	int i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = (int)strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i < 0 || i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (*s == '\0')
+		return (vdev);
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:size[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		*c: Calculate and display checksums
+ *		 d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		 g: Display data as a gang block header
+ *		 i: Display as an indirect block
+ *		 p: Do I/O to physical offset
+ *		 r: Dump raw data to stdout
+ *
+ *              * = not yet implemented
+ */
+static void
+zdb_read_block(char *thing, spa_t *spa)
+{
+	blkptr_t blk, *bp = &blk;
+	dva_t *dva = bp->blk_dva;
+	int flags = 0;
+	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	void *pbuf, *lbuf, *buf;
+	char *s, *p, *dup, *vdev, *flagstr;
+	int i, error;
+
+	dup = strdup(thing);
+	s = strtok(dup, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	size = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	flagstr = s ? s : "";
+
+	s = NULL;
+	if (size == 0)
+		s = "size must not be zero";
+	if (!IS_P2ALIGNED(size, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
+		free(dup);
+		return;
+	}
+
+	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+		for (i = 0; flagstr[i]; i++) {
+			int bit = flagbits[(uchar_t)flagstr[i]];
+
+			if (bit == 0) {
+				(void) printf("***Invalid flag: %c\n",
+				    flagstr[i]);
+				continue;
+			}
+			flags |= bit;
+
+			/* If it's not something with an argument, keep going */
+			if ((bit & (ZDB_FLAG_CHECKSUM |
+			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
+				continue;
+
+			p = &flagstr[i + 1];
+			if (bit == ZDB_FLAG_PRINT_BLKPTR)
+				blkptr_offset = strtoull(p, &p, 16);
+			if (*p != ':' && *p != '\0') {
+				(void) printf("***Invalid flag arg: '%s'\n", s);
+				free(dup);
+				return;
+			}
+		}
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		free(dup);
+		return;
+	} else {
+		if (vd->vdev_path)
+			(void) fprintf(stderr, "Found vdev: %s\n",
+			    vd->vdev_path);
+		else
+			(void) fprintf(stderr, "Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	psize = size;
+	lsize = size;
+
+	pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL);
+	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+	BP_ZERO(bp);
+
+	DVA_SET_VDEV(&dva[0], vd->vdev_id);
+	DVA_SET_OFFSET(&dva[0], offset);
+	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+	BP_SET_LSIZE(bp, lsize);
+	BP_SET_PSIZE(bp, psize);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	zio = zio_root(spa, NULL, NULL, 0);
+
+	if (vd == vd->vdev_top) {
+		/*
+		 * Treat this as a normal block read.
+		 */
+		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+	} else {
+		/*
+		 * Treat this as a vdev child I/O.
+		 */
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+	}
+
+	error = zio_wait(zio);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	if (flags & ZDB_FLAG_DECOMPRESS) {
+		/*
+		 * We don't know how the data was compressed, so just try
+		 * every decompress function at every inflated blocksize.
+		 */
+		enum zio_compress c;
+		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+		bcopy(pbuf, pbuf2, psize);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+		    SPA_MAXBLOCKSIZE - psize) == 0);
+
+		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+		    lsize -= SPA_MINBLOCKSIZE) {
+			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+				if (zio_decompress_data(c, pbuf, lbuf,
+				    psize, lsize) == 0 &&
+				    zio_decompress_data(c, pbuf2, lbuf2,
+				    psize, lsize) == 0 &&
+				    bcmp(lbuf, lbuf2, lsize) == 0)
+					break;
+			}
+			if (c != ZIO_COMPRESS_FUNCTIONS)
+				break;
+			lsize -= SPA_MINBLOCKSIZE;
+		}
+
+		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+		if (lsize <= psize) {
+			(void) printf("Decompress of %s failed\n", thing);
+			goto out;
+		}
+		buf = lbuf;
+		size = lsize;
+	} else {
+		buf = pbuf;
+		size = psize;
+	}
+
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+	else if (flags & ZDB_FLAG_RAW)
+		zdb_dump_block_raw(buf, size, flags);
+	else if (flags & ZDB_FLAG_INDIRECT)
+		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
+		    flags);
+	else if (flags & ZDB_FLAG_GBH)
+		zdb_dump_gbh(buf, flags);
+	else
+		zdb_dump_block(thing, buf, size, flags);
+
+out:
+	umem_free(pbuf, SPA_MAXBLOCKSIZE);
+	umem_free(lbuf, SPA_MAXBLOCKSIZE);
+	free(dup);
+}
+
+static boolean_t
+pool_match(nvlist_t *cfg, char *tgt)
+{
+	uint64_t v, guid = strtoull(tgt, NULL, 0);
+	char *s;
+
+	if (guid != 0) {
+		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
+			return (v == guid);
+	} else {
+		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
+			return (strcmp(s, tgt) == 0);
+	}
+	return (B_FALSE);
+}
+
+static char *
+find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
+{
+	nvlist_t *pools;
+	nvlist_t *match = NULL;
+	char *name = NULL;
+	char *sepp = NULL;
+	char sep = 0;
+	int count = 0;
+	importargs_t args = { 0 };
+
+	args.paths = dirc;
+	args.path = dirv;
+	args.can_be_active = B_TRUE;
+
+	if ((sepp = strpbrk(*target, "/@")) != NULL) {
+		sep = *sepp;
+		*sepp = '\0';
+	}
+
+	pools = zpool_search_import(g_zfs, &args);
+
+	if (pools != NULL) {
+		nvpair_t *elem = NULL;
+		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+			verify(nvpair_value_nvlist(elem, configp) == 0);
+			if (pool_match(*configp, *target)) {
+				count++;
+				if (match != NULL) {
+					/* print previously found config */
+					if (name != NULL) {
+						(void) printf("%s\n", name);
+						dump_nvlist(match, 8);
+						name = NULL;
+					}
+					(void) printf("%s\n",
+					    nvpair_name(elem));
+					dump_nvlist(*configp, 8);
+				} else {
+					match = *configp;
+					name = nvpair_name(elem);
+				}
+			}
+		}
+	}
+	if (count > 1)
+		(void) fatal("\tMatched %d pools - use pool GUID "
+		    "instead of pool name or \n"
+		    "\tpool name part of a dataset name to select pool", count);
+
+	if (sepp)
+		*sepp = sep;
+	/*
+	 * If pool GUID was specified for pool id, replace it with pool name
+	 */
+	if (name && (strstr(*target, name) != *target)) {
+		int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
+
+		*target = umem_alloc(sz, UMEM_NOFAIL);
+		(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
+	}
+
+	*configp = name ? match : NULL;
+
+	return (name);
+}
+
+int
+main(int argc, char **argv)
+{
+	int i, c;
+	struct rlimit rl = { 1024, 1024 };
+	spa_t *spa = NULL;
+	objset_t *os = NULL;
+	int dump_all = 1;
+	int verbose = 0;
+	int error = 0;
+	char **searchdirs = NULL;
+	int nsearch = 0;
+	char *target;
+	nvlist_t *policy = NULL;
+	uint64_t max_txg = UINT64_MAX;
+	int flags = ZFS_IMPORT_MISSING_LOG;
+	int rewind = ZPOOL_NEVER_REWIND;
+	char *spa_config_path_env;
+	const char *opts = "bcdhilmMI:suCDRSAFLXevp:t:U:P";
+	boolean_t target_is_spa = B_TRUE;
+
+	(void) setrlimit(RLIMIT_NOFILE, &rl);
+	(void) enable_extended_FILE_stdio(-1, -1);
+
+	dprintf_setup(&argc, argv);
+
+	/*
+	 * If there is an environment variable SPA_CONFIG_PATH it overrides
+	 * default spa_config_path setting. If -U flag is specified it will
+	 * override this environment variable settings once again.
+	 */
+	spa_config_path_env = getenv("SPA_CONFIG_PATH");
+	if (spa_config_path_env != NULL)
+		spa_config_path = spa_config_path_env;
+
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'b':
+		case 'c':
+		case 'd':
+		case 'h':
+		case 'i':
+		case 'l':
+		case 'm':
+		case 's':
+		case 'u':
+		case 'C':
+		case 'D':
+		case 'M':
+		case 'R':
+		case 'S':
+			dump_opt[c]++;
+			dump_all = 0;
+			break;
+		case 'A':
+		case 'F':
+		case 'L':
+		case 'X':
+		case 'e':
+		case 'P':
+			dump_opt[c]++;
+			break;
+		case 'V':
+			flags = ZFS_IMPORT_VERBATIM;
+			break;
+		case 'I':
+			max_inflight = strtoull(optarg, NULL, 0);
+			if (max_inflight == 0) {
+				(void) fprintf(stderr, "maximum number "
+				    "of inflight I/Os must be greater "
+				    "than 0\n");
+				usage();
+			}
+			break;
+		case 'p':
+			if (searchdirs == NULL) {
+				searchdirs = umem_alloc(sizeof (char *),
+				    UMEM_NOFAIL);
+			} else {
+				char **tmp = umem_alloc((nsearch + 1) *
+				    sizeof (char *), UMEM_NOFAIL);
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				umem_free(searchdirs,
+				    nsearch * sizeof (char *));
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = optarg;
+			break;
+		case 't':
+			max_txg = strtoull(optarg, NULL, 0);
+			if (max_txg < TXG_INITIAL) {
+				(void) fprintf(stderr, "incorrect txg "
+				    "specified: %s\n", optarg);
+				usage();
+			}
+			break;
+		case 'U':
+			spa_config_path = optarg;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	if (!dump_opt['e'] && searchdirs != NULL) {
+		(void) fprintf(stderr, "-p option requires use of -e\n");
+		usage();
+	}
+
+#if defined(_LP64)
+	/*
+	 * ZDB does not typically re-read blocks; therefore limit the ARC
+	 * to 256 MB, which can be used entirely for metadata.
+	 */
+	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+#endif
+
+	/*
+	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+	 * "zdb -b" uses traversal prefetch which uses async reads.
+	 * For good performance, let several of them be active at once.
+	 */
+	zfs_vdev_async_read_max_active = 10;
+
+	kernel_init(FREAD);
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s", libzfs_error_init(errno));
+		return (1);
+	}
+
+	if (dump_all)
+		verbose = MAX(verbose, 1);
+
+	for (c = 0; c < 256; c++) {
+		if (dump_all && !strchr("elAFLRSXP", c))
+			dump_opt[c] = 1;
+		if (dump_opt[c])
+			dump_opt[c] += verbose;
+	}
+
+	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+	zfs_recover = (dump_opt['A'] > 1);
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 2 && dump_opt['R'])
+		usage();
+	if (argc < 1) {
+		if (!dump_opt['e'] && dump_opt['C']) {
+			dump_cachefile(spa_config_path);
+			return (0);
+		}
+		usage();
+	}
+
+	if (dump_opt['l']) {
+		dump_label(argv[0]);
+		return (0);
+	}
+
+	if (dump_opt['X'] || dump_opt['F'])
+		rewind = ZPOOL_DO_REWIND |
+		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
+
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
+		fatal("internal error: %s", strerror(ENOMEM));
+
+	error = 0;
+	target = argv[0];
+
+	if (dump_opt['e']) {
+		nvlist_t *cfg = NULL;
+		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
+
+		error = ENOENT;
+		if (name) {
+			if (dump_opt['C'] > 1) {
+				(void) printf("\nConfiguration for import:\n");
+				dump_nvlist(cfg, 8);
+			}
+			if (nvlist_add_nvlist(cfg,
+			    ZPOOL_REWIND_POLICY, policy) != 0) {
+				fatal("can't open '%s': %s",
+				    target, strerror(ENOMEM));
+			}
+			error = spa_import(name, cfg, NULL, flags);
+		}
+	}
+
+	if (strpbrk(target, "/@") != NULL) {
+		size_t targetlen;
+
+		target_is_spa = B_FALSE;
+		targetlen = strlen(target);
+		if (targetlen && target[targetlen - 1] == '/')
+			target[targetlen - 1] = '\0';
+	}
+
+	if (error == 0) {
+		if (target_is_spa || dump_opt['R']) {
+			error = spa_open_rewind(target, &spa, FTAG, policy,
+			    NULL);
+			if (error) {
+				/*
+				 * If we're missing the log device then
+				 * try opening the pool after clearing the
+				 * log state.
+				 */
+				mutex_enter(&spa_namespace_lock);
+				if ((spa = spa_lookup(target)) != NULL &&
+				    spa->spa_log_state == SPA_LOG_MISSING) {
+					spa->spa_log_state = SPA_LOG_CLEAR;
+					error = 0;
+				}
+				mutex_exit(&spa_namespace_lock);
+
+				if (!error) {
+					error = spa_open_rewind(target, &spa,
+					    FTAG, policy, NULL);
+				}
+			}
+		} else {
+			error = dmu_objset_own(target, DMU_OST_ANY,
+			    B_TRUE, FTAG, &os);
+		}
+	}
+	nvlist_free(policy);
+
+	if (error)
+		fatal("can't open '%s': %s", target, strerror(error));
+
+	argv++;
+	argc--;
+	if (!dump_opt['R']) {
+		if (argc > 0) {
+			zopt_objects = argc;
+			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
+			for (i = 0; i < zopt_objects; i++) {
+				errno = 0;
+				zopt_object[i] = strtoull(argv[i], NULL, 0);
+				if (zopt_object[i] == 0 && errno != 0)
+					fatal("bad number %s: %s",
+					    argv[i], strerror(errno));
+			}
+		}
+		if (os != NULL) {
+			dump_dir(os);
+		} else if (zopt_objects > 0 && !dump_opt['m']) {
+			dump_dir(spa->spa_meta_objset);
+		} else {
+			dump_zpool(spa);
+		}
+	} else {
+		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+		flagbits['c'] = ZDB_FLAG_CHECKSUM;
+		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+		flagbits['e'] = ZDB_FLAG_BSWAP;
+		flagbits['g'] = ZDB_FLAG_GBH;
+		flagbits['i'] = ZDB_FLAG_INDIRECT;
+		flagbits['p'] = ZDB_FLAG_PHYS;
+		flagbits['r'] = ZDB_FLAG_RAW;
+
+		for (i = 0; i < argc; i++)
+			zdb_read_block(argv[i], spa);
+	}
+
+	(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
+
+	fuid_table_destroy();
+	sa_loaded = B_FALSE;
+
+	libzfs_fini(g_zfs);
+	kernel_fini();
+
+	return (0);
+}
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 356bf9aae4f6..beb6c7c87c4c 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -248,8 +248,9 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
 		return (gettext("\treceive [-vnFu] <filesystem|volume|"
-		"snapshot>\n"
-		"\treceive [-vnFu] [-d | -e] <filesystem>\n"));
+		    "snapshot>\n"
+		    "\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] "
+		    "<filesystem>\n"));
 	case HELP_RENAME:
 		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
 		    "<filesystem|volume|snapshot>\n"
@@ -792,7 +793,7 @@ zfs_do_create(int argc, char **argv)
 				nomem();
 			break;
 		case 'o':
-			if (parseprop(props, optarg))
+			if (parseprop(props, optarg) != 0)
 				goto error;
 			break;
 		case 's':
@@ -3622,7 +3623,7 @@ zfs_do_snapshot(int argc, char **argv)
 	while ((c = getopt(argc, argv, "ro:")) != -1) {
 		switch (c) {
 		case 'o':
-			if (parseprop(props, optarg))
+			if (parseprop(props, optarg) != 0)
 				return (1);
 			break;
 		case 'r':
@@ -3881,10 +3882,19 @@ zfs_do_receive(int argc, char **argv)
 {
 	int c, err;
 	recvflags_t flags = { 0 };
+	nvlist_t *props;
+	nvpair_t *nvp = NULL;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":denuvF")) != -1) {
+	while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
 		switch (c) {
+		case 'o':
+			if (parseprop(props, optarg) != 0)
+				return (1);
+			break;
 		case 'd':
 			flags.isprefix = B_TRUE;
 			break;
@@ -3929,6 +3939,13 @@ zfs_do_receive(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
+	while ((nvp = nvlist_next_nvpair(props, nvp))) {
+		if (strcmp(nvpair_name(nvp), "origin") != 0) {
+			(void) fprintf(stderr, gettext("invalid option"));
+			usage(B_FALSE);
+		}
+	}
+
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
@@ -3937,7 +3954,7 @@ zfs_do_receive(int argc, char **argv)
 		return (1);
 	}
 
-	err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL);
+	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
 
 	return (err != 0);
 }
diff --git a/cmd/zfs/zfs_main.c.orig b/cmd/zfs/zfs_main.c.orig
new file mode 100644
index 000000000000..356bf9aae4f6
--- /dev/null
+++ b/cmd/zfs/zfs_main.c.orig
@@ -0,0 +1,6766 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland.  All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <libnvpair.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <zone.h>
+#include <grp.h>
+#include <pwd.h>
+#include <signal.h>
+#include <sys/list.h>
+#include <sys/mkdev.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include <time.h>
+
+#include <libzfs.h>
+#include <libzfs_core.h>
+#include <zfs_prop.h>
+#include <zfs_deleg.h>
+#include <libuutil.h>
+#ifdef HAVE_IDMAP
+#include <aclutils.h>
+#include <directory.h>
+#endif /* HAVE_IDMAP */
+
+#include "zfs_iter.h"
+#include "zfs_util.h"
+#include "zfs_comutil.h"
+#include "libzfs_impl.h"
+
+libzfs_handle_t *g_zfs;
+
+static FILE *mnttab_file;
+static char history_str[HIS_MAX_RECORD_LEN];
+static boolean_t log_history = B_TRUE;
+
+static int zfs_do_clone(int argc, char **argv);
+static int zfs_do_create(int argc, char **argv);
+static int zfs_do_destroy(int argc, char **argv);
+static int zfs_do_get(int argc, char **argv);
+static int zfs_do_inherit(int argc, char **argv);
+static int zfs_do_list(int argc, char **argv);
+static int zfs_do_mount(int argc, char **argv);
+static int zfs_do_rename(int argc, char **argv);
+static int zfs_do_rollback(int argc, char **argv);
+static int zfs_do_set(int argc, char **argv);
+static int zfs_do_upgrade(int argc, char **argv);
+static int zfs_do_snapshot(int argc, char **argv);
+static int zfs_do_unmount(int argc, char **argv);
+static int zfs_do_share(int argc, char **argv);
+static int zfs_do_unshare(int argc, char **argv);
+static int zfs_do_send(int argc, char **argv);
+static int zfs_do_receive(int argc, char **argv);
+static int zfs_do_promote(int argc, char **argv);
+static int zfs_do_userspace(int argc, char **argv);
+static int zfs_do_allow(int argc, char **argv);
+static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_holds(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
+static int zfs_do_bookmark(int argc, char **argv);
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+typedef enum {
+	HELP_CLONE,
+	HELP_CREATE,
+	HELP_DESTROY,
+	HELP_GET,
+	HELP_INHERIT,
+	HELP_UPGRADE,
+	HELP_LIST,
+	HELP_MOUNT,
+	HELP_PROMOTE,
+	HELP_RECEIVE,
+	HELP_RENAME,
+	HELP_ROLLBACK,
+	HELP_SEND,
+	HELP_SET,
+	HELP_SHARE,
+	HELP_SNAPSHOT,
+	HELP_UNMOUNT,
+	HELP_UNSHARE,
+	HELP_ALLOW,
+	HELP_UNALLOW,
+	HELP_USERSPACE,
+	HELP_GROUPSPACE,
+	HELP_HOLD,
+	HELP_HOLDS,
+	HELP_RELEASE,
+	HELP_DIFF,
+	HELP_BOOKMARK,
+} zfs_help_t;
+
+typedef struct zfs_command {
+	const char	*name;
+	int		(*func)(int argc, char **argv);
+	zfs_help_t	usage;
+} zfs_command_t;
+
+/*
+ * Master command table.  Each ZFS command has a name, associated function, and
+ * usage message.  The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message.  An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zfs_command_t command_table[] = {
+	{ "create",	zfs_do_create,		HELP_CREATE		},
+	{ "destroy",	zfs_do_destroy,		HELP_DESTROY		},
+	{ NULL },
+	{ "snapshot",	zfs_do_snapshot,	HELP_SNAPSHOT		},
+	{ "rollback",	zfs_do_rollback,	HELP_ROLLBACK		},
+	{ "clone",	zfs_do_clone,		HELP_CLONE		},
+	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
+	{ "rename",	zfs_do_rename,		HELP_RENAME		},
+	{ "bookmark",	zfs_do_bookmark,	HELP_BOOKMARK		},
+	{ NULL },
+	{ "list",	zfs_do_list,		HELP_LIST		},
+	{ NULL },
+	{ "set",	zfs_do_set,		HELP_SET		},
+	{ "get",	zfs_do_get,		HELP_GET		},
+	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
+	{ "upgrade",	zfs_do_upgrade,		HELP_UPGRADE		},
+	{ "userspace",	zfs_do_userspace,	HELP_USERSPACE		},
+	{ "groupspace",	zfs_do_userspace,	HELP_GROUPSPACE		},
+	{ NULL },
+	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
+	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
+	{ "share",	zfs_do_share,		HELP_SHARE		},
+	{ "unshare",	zfs_do_unshare,		HELP_UNSHARE		},
+	{ NULL },
+	{ "send",	zfs_do_send,		HELP_SEND		},
+	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
+	{ NULL },
+	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
+	{ NULL },
+	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
+	{ NULL },
+	{ "hold",	zfs_do_hold,		HELP_HOLD		},
+	{ "holds",	zfs_do_holds,		HELP_HOLDS		},
+	{ "release",	zfs_do_release,		HELP_RELEASE		},
+	{ "diff",	zfs_do_diff,		HELP_DIFF		},
+};
+
+#define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
+
+zfs_command_t *current_command;
+
+static const char *
+get_usage(zfs_help_t idx)
+{
+	switch (idx) {
+	case HELP_CLONE:
+		return (gettext("\tclone [-p] [-o property=value] ... "
+		    "<snapshot> <filesystem|volume>\n"));
+	case HELP_CREATE:
+		return (gettext("\tcreate [-p] [-o property=value] ... "
+		    "<filesystem>\n"
+		    "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
+		    "-V <size> <volume>\n"));
+	case HELP_DESTROY:
+		return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
+		    "\tdestroy [-dnpRrv] "
+		    "<filesystem|volume>@<snap>[%<snap>][,...]\n"
+		    "\tdestroy <filesystem|volume>#<bookmark>\n"));
+	case HELP_GET:
+		return (gettext("\tget [-rHp] [-d max] "
+		    "[-o \"all\" | field[,...]]\n"
+		    "\t    [-t type[,...]] [-s source[,...]]\n"
+		    "\t    <\"all\" | property[,...]> "
+		    "[filesystem|volume|snapshot] ...\n"));
+	case HELP_INHERIT:
+		return (gettext("\tinherit [-rS] <property> "
+		    "<filesystem|volume|snapshot> ...\n"));
+	case HELP_UPGRADE:
+		return (gettext("\tupgrade [-v]\n"
+		    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
+	case HELP_LIST:
+		return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
+		    "[-s property]...\n\t    [-S property]... [-t type[,...]] "
+		    "[filesystem|volume|snapshot] ...\n"));
+	case HELP_MOUNT:
+		return (gettext("\tmount\n"
+		    "\tmount [-vO] [-o opts] <-a | filesystem>\n"));
+	case HELP_PROMOTE:
+		return (gettext("\tpromote <clone-filesystem>\n"));
+	case HELP_RECEIVE:
+		return (gettext("\treceive [-vnFu] <filesystem|volume|"
+		"snapshot>\n"
+		"\treceive [-vnFu] [-d | -e] <filesystem>\n"));
+	case HELP_RENAME:
+		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
+		    "<filesystem|volume|snapshot>\n"
+		    "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
+		    "\trename -r <snapshot> <snapshot>\n"));
+	case HELP_ROLLBACK:
+		return (gettext("\trollback [-rRf] <snapshot>\n"));
+	case HELP_SEND:
+		return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
+		    "<snapshot>\n"
+		    "\tsend [-Le] [-i snapshot|bookmark] "
+		    "<filesystem|volume|snapshot>\n"));
+	case HELP_SET:
+		return (gettext("\tset <property=value> ... "
+		    "<filesystem|volume|snapshot> ...\n"));
+	case HELP_SHARE:
+		return (gettext("\tshare <-a | filesystem>\n"));
+	case HELP_SNAPSHOT:
+		return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
+		    "<filesystem|volume>@<snap> ...\n"));
+	case HELP_UNMOUNT:
+		return (gettext("\tunmount [-f] "
+		    "<-a | filesystem|mountpoint>\n"));
+	case HELP_UNSHARE:
+		return (gettext("\tunshare "
+		    "<-a | filesystem|mountpoint>\n"));
+	case HELP_ALLOW:
+		return (gettext("\tallow <filesystem|volume>\n"
+		    "\tallow [-ldug] "
+		    "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
+		    "\t    <filesystem|volume>\n"
+		    "\tallow [-ld] -e <perm|@setname>[,...] "
+		    "<filesystem|volume>\n"
+		    "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
+		    "\tallow -s @setname <perm|@setname>[,...] "
+		    "<filesystem|volume>\n"));
+	case HELP_UNALLOW:
+		return (gettext("\tunallow [-rldug] "
+		    "<\"everyone\"|user|group>[,...]\n"
+		    "\t    [<perm|@setname>[,...]] <filesystem|volume>\n"
+		    "\tunallow [-rld] -e [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"
+		    "\tunallow [-r] -c [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"
+		    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"));
+	case HELP_USERSPACE:
+		return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
+	case HELP_GROUPSPACE:
+		return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
+	case HELP_HOLD:
+		return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+	case HELP_HOLDS:
+		return (gettext("\tholds [-r] <snapshot> ...\n"));
+	case HELP_RELEASE:
+		return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+	case HELP_DIFF:
+		return (gettext("\tdiff [-FHt] <snapshot> "
+		    "[snapshot|filesystem]\n"));
+	case HELP_BOOKMARK:
+		return (gettext("\tbookmark <snapshot> <bookmark>\n"));
+	}
+
+	abort();
+	/* NOTREACHED */
+}
+
+void
+nomem(void)
+{
+	(void) fprintf(stderr, gettext("internal error: out of memory\n"));
+	exit(1);
+}
+
+/*
+ * Utility function to guarantee malloc() success.
+ */
+
+void *
+safe_malloc(size_t size)
+{
+	void *data;
+
+	if ((data = calloc(1, size)) == NULL)
+		nomem();
+
+	return (data);
+}
+
+static char *
+safe_strdup(char *str)
+{
+	char *dupstr = strdup(str);
+
+	if (dupstr == NULL)
+		nomem();
+
+	return (dupstr);
+}
+
+/*
+ * Callback routine that will print out information for each of
+ * the properties.
+ */
+static int
+usage_prop_cb(int prop, void *cb)
+{
+	FILE *fp = cb;
+
+	(void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop));
+
+	if (zfs_prop_readonly(prop))
+		(void) fprintf(fp, " NO    ");
+	else
+		(void) fprintf(fp, "YES    ");
+
+	if (zfs_prop_inheritable(prop))
+		(void) fprintf(fp, "  YES   ");
+	else
+		(void) fprintf(fp, "   NO   ");
+
+	if (zfs_prop_values(prop) == NULL)
+		(void) fprintf(fp, "-\n");
+	else
+		(void) fprintf(fp, "%s\n", zfs_prop_values(prop));
+
+	return (ZPROP_CONT);
+}
+
+/*
+ * Display usage message.  If we're inside a command, display only the usage for
+ * that command.  Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ */
+static void
+usage(boolean_t requested)
+{
+	int i;
+	boolean_t show_properties = B_FALSE;
+	FILE *fp = requested ? stdout : stderr;
+
+	if (current_command == NULL) {
+
+		(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
+		(void) fprintf(fp,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		for (i = 0; i < NCOMMAND; i++) {
+			if (command_table[i].name == NULL)
+				(void) fprintf(fp, "\n");
+			else
+				(void) fprintf(fp, "%s",
+				    get_usage(command_table[i].usage));
+		}
+
+		(void) fprintf(fp, gettext("\nEach dataset is of the form: "
+		    "pool/[dataset/]*dataset[@name]\n"));
+	} else {
+		(void) fprintf(fp, gettext("usage:\n"));
+		(void) fprintf(fp, "%s", get_usage(current_command->usage));
+	}
+
+	if (current_command != NULL &&
+	    (strcmp(current_command->name, "set") == 0 ||
+	    strcmp(current_command->name, "get") == 0 ||
+	    strcmp(current_command->name, "inherit") == 0 ||
+	    strcmp(current_command->name, "list") == 0))
+		show_properties = B_TRUE;
+
+	if (show_properties) {
+		(void) fprintf(fp,
+		    gettext("\nThe following properties are supported:\n"));
+
+		(void) fprintf(fp, "\n\t%-14s %s  %s   %s\n\n",
+		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
+
+		/* Iterate over all properties */
+		(void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
+		    ZFS_TYPE_DATASET);
+
+		(void) fprintf(fp, "\t%-15s ", "userused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "groupused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "userquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "groupquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "written@<snap>");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+
+		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
+		    "with standard units such as K, M, G, etc.\n"));
+		(void) fprintf(fp, gettext("\nUser-defined properties can "
+		    "be specified by using a name containing a colon (:).\n"));
+		(void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
+		    "properties must be appended with\n"
+		    "a user or group specifier of one of these forms:\n"
+		    "    POSIX name      (eg: \"matt\")\n"
+		    "    POSIX id        (eg: \"126829\")\n"
+		    "    SMB name@domain (eg: \"matt@sun\")\n"
+		    "    SMB SID         (eg: \"S-1-234-567-89\")\n"));
+	} else {
+		(void) fprintf(fp,
+		    gettext("\nFor the property list, run: %s\n"),
+		    "zfs set|get");
+		(void) fprintf(fp,
+		    gettext("\nFor the delegated permission list, run: %s\n"),
+		    "zfs allow|unallow");
+	}
+
+	/*
+	 * See comments at end of main().
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+/*
+ * Take a property=value argument string and add it to the given nvlist.
+ * Modifies the argument inplace.
+ */
+static int
+parseprop(nvlist_t *props, char *propname)
+{
+	char *propval, *strval;
+
+	if ((propval = strchr(propname, '=')) == NULL) {
+		(void) fprintf(stderr, gettext("missing "
+		    "'=' for property=value argument\n"));
+		return (-1);
+	}
+	*propval = '\0';
+	propval++;
+	if (nvlist_lookup_string(props, propname, &strval) == 0) {
+		(void) fprintf(stderr, gettext("property '%s' "
+		    "specified multiple times\n"), propname);
+		return (-1);
+	}
+	if (nvlist_add_string(props, propname, propval) != 0)
+		nomem();
+	return (0);
+}
+
+static int
+parse_depth(char *opt, int *flags)
+{
+	char *tmp;
+	int depth;
+
+	depth = (int)strtol(opt, &tmp, 0);
+	if (*tmp) {
+		(void) fprintf(stderr,
+		    gettext("%s is not an integer\n"), optarg);
+		usage(B_FALSE);
+	}
+	if (depth < 0) {
+		(void) fprintf(stderr,
+		    gettext("Depth can not be negative.\n"));
+		usage(B_FALSE);
+	}
+	*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
+	return (depth);
+}
+
+#define	PROGRESS_DELAY 2		/* seconds */
+
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+	pt_begin = time(NULL) + PROGRESS_DELAY;
+	pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+	assert(pt_header == NULL);
+	pt_header = safe_strdup(header);
+	if (pt_shown) {
+		(void) printf("%s: ", header);
+		(void) fflush(stdout);
+	}
+}
+
+static void
+update_progress(char *update)
+{
+	if (!pt_shown && time(NULL) > pt_begin) {
+		int len = strlen(update);
+
+		(void) printf("%s: %s%*.*s", pt_header, update, len, len,
+		    pt_reverse);
+		(void) fflush(stdout);
+		pt_shown = B_TRUE;
+	} else if (pt_shown) {
+		int len = strlen(update);
+
+		(void) printf("%s%*.*s", update, len, len, pt_reverse);
+		(void) fflush(stdout);
+	}
+}
+
+static void
+finish_progress(char *done)
+{
+	if (pt_shown) {
+		(void) printf("%s\n", done);
+		(void) fflush(stdout);
+	}
+	free(pt_header);
+	pt_header = NULL;
+}
+
+static int
+zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type)
+{
+	zfs_handle_t *zhp = NULL;
+	int ret = 0;
+
+	zhp = zfs_open(hdl, dataset, type);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * Volumes may neither be mounted or shared.  Potentially in the
+	 * future filesystems detected on these volumes could be mounted.
+	 */
+	if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * Mount and/or share the new filesystem as appropriate.  We provide a
+	 * verbose error message to let the user know that their filesystem was
+	 * in fact created, even if we failed to mount or share it.
+	 *
+	 * If the user doesn't want the dataset automatically mounted, then
+	 * skip the mount/share step
+	 */
+	if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) {
+		if (zfs_mount(zhp, NULL, 0) != 0) {
+			(void) fprintf(stderr, gettext("filesystem "
+			    "successfully created, but not mounted\n"));
+			ret = 1;
+		} else if (zfs_share(zhp) != 0) {
+			(void) fprintf(stderr, gettext("filesystem "
+			    "successfully created, but not shared\n"));
+			ret = 1;
+		}
+	}
+
+	zfs_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
+ *
+ * Given an existing dataset, create a writable copy whose initial contents
+ * are the same as the source.  The newly created dataset maintains a
+ * dependency on the original; the original cannot be destroyed so long as
+ * the clone exists.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+static int
+zfs_do_clone(int argc, char **argv)
+{
+	zfs_handle_t *zhp = NULL;
+	boolean_t parents = B_FALSE;
+	nvlist_t *props;
+	int ret = 0;
+	int c;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, "o:p")) != -1) {
+		switch (c) {
+		case 'o':
+			if (parseprop(props, optarg) != 0)
+				return (1);
+			break;
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		goto usage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		goto usage;
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		goto usage;
+	}
+
+	/* open the source dataset */
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+
+	if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) {
+		/*
+		 * Now create the ancestors of the target dataset.  If the
+		 * target already exists and '-p' option was used we should not
+		 * complain.
+		 */
+		if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
+		    ZFS_TYPE_VOLUME))
+			return (0);
+		if (zfs_create_ancestors(g_zfs, argv[1]) != 0)
+			return (1);
+	}
+
+	/* pass to libzfs */
+	ret = zfs_clone(zhp, argv[1], props);
+
+	/* create the mountpoint if necessary */
+	if (ret == 0) {
+		if (log_history) {
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
+		}
+
+		ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
+	}
+
+	zfs_close(zhp);
+	nvlist_free(props);
+
+	return (!!ret);
+
+usage:
+	if (zhp)
+		zfs_close(zhp);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (-1);
+}
+
+/*
+ * zfs create [-p] [-o prop=value] ... fs
+ * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
+ *
+ * Create a new dataset.  This command can be used to create filesystems
+ * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
+ * For volumes, the user must specify a size to be used.
+ *
+ * The '-s' flag applies only to volumes, and indicates that we should not try
+ * to set the reservation for this volume.  By default we set a reservation
+ * equal to the size for any volume.  For pools with SPA_VERSION >=
+ * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+static int
+zfs_do_create(int argc, char **argv)
+{
+	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
+	uint64_t volsize = 0;
+	int c;
+	boolean_t noreserve = B_FALSE;
+	boolean_t bflag = B_FALSE;
+	boolean_t parents = B_FALSE;
+	int ret = 1;
+	nvlist_t *props;
+	uint64_t intval;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) {
+		switch (c) {
+		case 'V':
+			type = ZFS_TYPE_VOLUME;
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+				nomem();
+			volsize = intval;
+			break;
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case 'b':
+			bflag = B_TRUE;
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "block size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+			    intval) != 0)
+				nomem();
+			break;
+		case 'o':
+			if (parseprop(props, optarg))
+				goto error;
+			break;
+		case 's':
+			noreserve = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing size "
+			    "argument\n"));
+			goto badusage;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto badusage;
+		}
+	}
+
+	if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
+		(void) fprintf(stderr, gettext("'-s' and '-b' can only be "
+		    "used when creating a volume\n"));
+		goto badusage;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing %s argument\n"),
+		    zfs_type_to_name(type));
+		goto badusage;
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		goto badusage;
+	}
+
+	if (type == ZFS_TYPE_VOLUME && !noreserve) {
+		zpool_handle_t *zpool_handle;
+		uint64_t spa_version;
+		char *p;
+		zfs_prop_t resv_prop;
+		char *strval;
+
+		if ((p = strchr(argv[0], '/')))
+			*p = '\0';
+		zpool_handle = zpool_open(g_zfs, argv[0]);
+		if (p != NULL)
+			*p = '/';
+		if (zpool_handle == NULL)
+			goto error;
+		spa_version = zpool_get_prop_int(zpool_handle,
+		    ZPOOL_PROP_VERSION, NULL);
+		zpool_close(zpool_handle);
+		if (spa_version >= SPA_VERSION_REFRESERVATION)
+			resv_prop = ZFS_PROP_REFRESERVATION;
+		else
+			resv_prop = ZFS_PROP_RESERVATION;
+		volsize = zvol_volsize_to_reservation(volsize, props);
+
+		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
+		    &strval) != 0) {
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(resv_prop), volsize) != 0) {
+				nvlist_free(props);
+				nomem();
+			}
+		}
+	}
+
+	if (parents && zfs_name_valid(argv[0], type)) {
+		/*
+		 * Now create the ancestors of target dataset.  If the target
+		 * already exists and '-p' option was used we should not
+		 * complain.
+		 */
+		if (zfs_dataset_exists(g_zfs, argv[0], type)) {
+			ret = 0;
+			goto error;
+		}
+		if (zfs_create_ancestors(g_zfs, argv[0]) != 0)
+			goto error;
+	}
+
+	/* pass to libzfs */
+	if (zfs_create(g_zfs, argv[0], type, props) != 0)
+		goto error;
+
+	if (log_history) {
+		(void) zpool_log_history(g_zfs, history_str);
+		log_history = B_FALSE;
+	}
+
+	ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
+error:
+	nvlist_free(props);
+	return (ret);
+badusage:
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (2);
+}
+
+/*
+ * zfs destroy [-rRf] <fs, vol>
+ * zfs destroy [-rRd] <snap>
+ *
+ *	-r	Recursively destroy all children
+ *	-R	Recursively destroy all dependents, including clones
+ *	-f	Force unmounting of any dependents
+ *	-d	If we can't destroy now, mark for deferred destruction
+ *
+ * Destroys the given dataset.  By default, it will unmount any filesystems,
+ * and refuse to destroy a dataset that has any dependents.  A dependent can
+ * either be a child, or a clone of a child.
+ */
+typedef struct destroy_cbdata {
+	boolean_t	cb_first;
+	boolean_t	cb_force;
+	boolean_t	cb_recurse;
+	boolean_t	cb_error;
+	boolean_t	cb_doclones;
+	zfs_handle_t	*cb_target;
+	boolean_t	cb_defer_destroy;
+	boolean_t	cb_verbose;
+	boolean_t	cb_parsable;
+	boolean_t	cb_dryrun;
+	nvlist_t	*cb_nvl;
+	nvlist_t	*cb_batchedsnaps;
+
+	/* first snap in contiguous run */
+	char		*cb_firstsnap;
+	/* previous snap in contiguous run */
+	char		*cb_prevsnap;
+	int64_t		cb_snapused;
+	char		*cb_snapspec;
+	char		*cb_bookmark;
+} destroy_cbdata_t;
+
+/*
+ * Check for any dependents based on the '-r' or '-R' flags.
+ */
+static int
+destroy_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cbp = data;
+	const char *tname = zfs_get_name(cbp->cb_target);
+	const char *name = zfs_get_name(zhp);
+
+	if (strncmp(tname, name, strlen(tname)) == 0 &&
+	    (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
+		/*
+		 * This is a direct descendant, not a clone somewhere else in
+		 * the hierarchy.
+		 */
+		if (cbp->cb_recurse)
+			goto out;
+
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has children\n"),
+			    zfs_get_name(cbp->cb_target),
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-r' to destroy "
+			    "the following datasets:\n"));
+			cbp->cb_first = B_FALSE;
+			cbp->cb_error = B_TRUE;
+		}
+
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+	} else {
+		/*
+		 * This is a clone.  We only want to report this if the '-r'
+		 * wasn't specified, or the target is a snapshot.
+		 */
+		if (!cbp->cb_recurse &&
+		    zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
+			goto out;
+
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has dependent clones\n"),
+			    zfs_get_name(cbp->cb_target),
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-R' to destroy "
+			    "the following datasets:\n"));
+			cbp->cb_first = B_FALSE;
+			cbp->cb_error = B_TRUE;
+			cbp->cb_dryrun = B_TRUE;
+		}
+
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+	}
+
+out:
+	zfs_close(zhp);
+	return (0);
+}
+
+static int
+destroy_callback(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cb = data;
+	const char *name = zfs_get_name(zhp);
+
+	if (cb->cb_verbose) {
+		if (cb->cb_parsable) {
+			(void) printf("destroy\t%s\n", name);
+		} else if (cb->cb_dryrun) {
+			(void) printf(gettext("would destroy %s\n"),
+			    name);
+		} else {
+			(void) printf(gettext("will destroy %s\n"),
+			    name);
+		}
+	}
+
+	/*
+	 * Ignore pools (which we've already flagged as an error before getting
+	 * here).
+	 */
+	if (strchr(zfs_get_name(zhp), '/') == NULL &&
+	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+		zfs_close(zhp);
+		return (0);
+	}
+	if (cb->cb_dryrun) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * We batch up all contiguous snapshots (even of different
+	 * filesystems) and destroy them with one ioctl.  We can't
+	 * simply do all snap deletions and then all fs deletions,
+	 * because we must delete a clone before its origin.
+	 */
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+	} else {
+		int error = zfs_destroy_snaps_nvl(g_zfs,
+		    cb->cb_batchedsnaps, B_FALSE);
+		fnvlist_free(cb->cb_batchedsnaps);
+		cb->cb_batchedsnaps = fnvlist_alloc();
+
+		if (error != 0 ||
+		    zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+		    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
+			zfs_close(zhp);
+			return (-1);
+		}
+	}
+
+	zfs_close(zhp);
+	return (0);
+}
+
+static int
+destroy_print_cb(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	const char *name = zfs_get_name(zhp);
+	int err = 0;
+
+	if (nvlist_exists(cb->cb_nvl, name)) {
+		if (cb->cb_firstsnap == NULL)
+			cb->cb_firstsnap = strdup(name);
+		if (cb->cb_prevsnap != NULL)
+			free(cb->cb_prevsnap);
+		/* this snap continues the current range */
+		cb->cb_prevsnap = strdup(name);
+		if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
+			nomem();
+		if (cb->cb_verbose) {
+			if (cb->cb_parsable) {
+				(void) printf("destroy\t%s\n", name);
+			} else if (cb->cb_dryrun) {
+				(void) printf(gettext("would destroy %s\n"),
+				    name);
+			} else {
+				(void) printf(gettext("will destroy %s\n"),
+				    name);
+			}
+		}
+	} else if (cb->cb_firstsnap != NULL) {
+		/* end of this range */
+		uint64_t used = 0;
+		err = lzc_snaprange_space(cb->cb_firstsnap,
+		    cb->cb_prevsnap, &used);
+		cb->cb_snapused += used;
+		free(cb->cb_firstsnap);
+		cb->cb_firstsnap = NULL;
+		free(cb->cb_prevsnap);
+		cb->cb_prevsnap = NULL;
+	}
+	zfs_close(zhp);
+	return (err);
+}
+
+static int
+destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
+{
+	int err;
+	assert(cb->cb_firstsnap == NULL);
+	assert(cb->cb_prevsnap == NULL);
+	err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb);
+	if (cb->cb_firstsnap != NULL) {
+		uint64_t used = 0;
+		if (err == 0) {
+			err = lzc_snaprange_space(cb->cb_firstsnap,
+			    cb->cb_prevsnap, &used);
+		}
+		cb->cb_snapused += used;
+		free(cb->cb_firstsnap);
+		cb->cb_firstsnap = NULL;
+		free(cb->cb_prevsnap);
+		cb->cb_prevsnap = NULL;
+	}
+	return (err);
+}
+
+static int
+snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int err = 0;
+
+	/* Check for clones. */
+	if (!cb->cb_doclones && !cb->cb_defer_destroy) {
+		cb->cb_target = zhp;
+		cb->cb_first = B_TRUE;
+		err = zfs_iter_dependents(zhp, B_TRUE,
+		    destroy_check_dependent, cb);
+	}
+
+	if (err == 0) {
+		if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
+			nomem();
+	}
+	zfs_close(zhp);
+	return (err);
+}
+
+static int
+gather_snapshots(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int err = 0;
+
+	err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
+	if (err == ENOENT)
+		err = 0;
+	if (err != 0)
+		goto out;
+
+	if (cb->cb_verbose) {
+		err = destroy_print_snapshots(zhp, cb);
+		if (err != 0)
+			goto out;
+	}
+
+	if (cb->cb_recurse)
+		err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
+
+out:
+	zfs_close(zhp);
+	return (err);
+}
+
+static int
+destroy_clones(destroy_cbdata_t *cb)
+{
+	nvpair_t *pair;
+	for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(cb->cb_nvl, pair)) {
+		zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair),
+		    ZFS_TYPE_SNAPSHOT);
+		if (zhp != NULL) {
+			boolean_t defer = cb->cb_defer_destroy;
+			int err;
+
+			/*
+			 * We can't defer destroy non-snapshots, so set it to
+			 * false while destroying the clones.
+			 */
+			cb->cb_defer_destroy = B_FALSE;
+			err = zfs_iter_dependents(zhp, B_FALSE,
+			    destroy_callback, cb);
+			cb->cb_defer_destroy = defer;
+			zfs_close(zhp);
+			if (err != 0)
+				return (err);
+		}
+	}
+	return (0);
+}
+
+static int
+zfs_do_destroy(int argc, char **argv)
+{
+	destroy_cbdata_t cb = { 0 };
+	int rv = 0;
+	int err = 0;
+	int c;
+	zfs_handle_t *zhp = NULL;
+	char *at, *pound;
+	zfs_type_t type = ZFS_TYPE_DATASET;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
+		switch (c) {
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
+		case 'p':
+			cb.cb_verbose = B_TRUE;
+			cb.cb_parsable = B_TRUE;
+			break;
+		case 'n':
+			cb.cb_dryrun = B_TRUE;
+			break;
+		case 'd':
+			cb.cb_defer_destroy = B_TRUE;
+			type = ZFS_TYPE_SNAPSHOT;
+			break;
+		case 'f':
+			cb.cb_force = B_TRUE;
+			break;
+		case 'r':
+			cb.cb_recurse = B_TRUE;
+			break;
+		case 'R':
+			cb.cb_recurse = B_TRUE;
+			cb.cb_doclones = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	at = strchr(argv[0], '@');
+	pound = strchr(argv[0], '#');
+	if (at != NULL) {
+
+		/* Build the list of snaps to destroy in cb_nvl. */
+		cb.cb_nvl = fnvlist_alloc();
+
+		*at = '\0';
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL)
+			return (1);
+
+		cb.cb_snapspec = at + 1;
+		if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
+		    cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+
+		if (nvlist_empty(cb.cb_nvl)) {
+			(void) fprintf(stderr, gettext("could not find any "
+			    "snapshots to destroy; check snapshot names.\n"));
+			rv = 1;
+			goto out;
+		}
+
+		if (cb.cb_verbose) {
+			char buf[16];
+			zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
+			if (cb.cb_parsable) {
+				(void) printf("reclaim\t%llu\n",
+				    (u_longlong_t)cb.cb_snapused);
+			} else if (cb.cb_dryrun) {
+				(void) printf(gettext("would reclaim %s\n"),
+				    buf);
+			} else {
+				(void) printf(gettext("will reclaim %s\n"),
+				    buf);
+			}
+		}
+
+		if (!cb.cb_dryrun) {
+			if (cb.cb_doclones) {
+				cb.cb_batchedsnaps = fnvlist_alloc();
+				err = destroy_clones(&cb);
+				if (err == 0) {
+					err = zfs_destroy_snaps_nvl(g_zfs,
+					    cb.cb_batchedsnaps, B_FALSE);
+				}
+				if (err != 0) {
+					rv = 1;
+					goto out;
+				}
+			}
+			if (err == 0) {
+				err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
+				    cb.cb_defer_destroy);
+			}
+		}
+
+		if (err != 0)
+			rv = 1;
+	} else if (pound != NULL) {
+		int err;
+		nvlist_t *nvl;
+
+		if (cb.cb_dryrun) {
+			(void) fprintf(stderr,
+			    "dryrun is not supported with bookmark\n");
+			return (-1);
+		}
+
+		if (cb.cb_defer_destroy) {
+			(void) fprintf(stderr,
+			    "defer destroy is not supported with bookmark\n");
+			return (-1);
+		}
+
+		if (cb.cb_recurse) {
+			(void) fprintf(stderr,
+			    "recursive is not supported with bookmark\n");
+			return (-1);
+		}
+
+		if (!zfs_bookmark_exists(argv[0])) {
+			(void) fprintf(stderr, gettext("bookmark '%s' "
+			    "does not exist.\n"), argv[0]);
+			return (1);
+		}
+
+		nvl = fnvlist_alloc();
+		fnvlist_add_boolean(nvl, argv[0]);
+
+		err = lzc_destroy_bookmarks(nvl, NULL);
+		if (err != 0) {
+			(void) zfs_standard_error(g_zfs, err,
+			    "cannot destroy bookmark");
+		}
+
+		nvlist_free(cb.cb_nvl);
+
+		return (err);
+	} else {
+		/* Open the given dataset */
+		if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
+			return (1);
+
+		cb.cb_target = zhp;
+
+		/*
+		 * Perform an explicit check for pools before going any further.
+		 */
+		if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "operation does not apply to pools\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zfs destroy -r "
+			    "%s' to destroy all datasets in the pool\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+			    "to destroy the pool itself\n"), zfs_get_name(zhp));
+			rv = 1;
+			goto out;
+		}
+
+		/*
+		 * Check for any dependents and/or clones.
+		 */
+		cb.cb_first = B_TRUE;
+		if (!cb.cb_doclones &&
+		    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
+
+		if (cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+
+		cb.cb_batchedsnaps = fnvlist_alloc();
+		if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
+
+		/*
+		 * Do the real thing.  The callback will close the
+		 * handle regardless of whether it succeeds or not.
+		 */
+		err = destroy_callback(zhp, &cb);
+		zhp = NULL;
+		if (err == 0) {
+			err = zfs_destroy_snaps_nvl(g_zfs,
+			    cb.cb_batchedsnaps, cb.cb_defer_destroy);
+		}
+		if (err != 0)
+			rv = 1;
+	}
+
+out:
+	fnvlist_free(cb.cb_batchedsnaps);
+	fnvlist_free(cb.cb_nvl);
+	if (zhp != NULL)
+		zfs_close(zhp);
+	return (rv);
+}
+
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+	int i;
+	zfs_get_column_t col;
+
+	for (i = 0; i < ZFS_GET_NCOLS &&
+	    (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
+		if (col == GET_COL_RECVD)
+			return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
+ *	< all | property[,property]... > < fs | snap | vol > ...
+ *
+ *	-r	recurse over any child datasets
+ *	-H	scripted mode.  Headers are stripped, and fields are separated
+ *		by tabs instead of spaces.
+ *	-o	Set of fields to display.  One of "name,property,value,
+ *		received,source". Default is "name,property,value,source".
+ *		"all" is an alias for all five.
+ *	-s	Set of sources to allow.  One of
+ *		"local,default,inherited,received,temporary,none".  Default is
+ *		all six.
+ *	-p	Display values in parsable (literal) format.
+ *
+ *  Prints properties for the given datasets.  The user can control which
+ *  columns to display as well as which property types to allow.
+ */
+
+/*
+ * Invoked to display the properties for a single dataset.
+ */
+static int
+get_callback(zfs_handle_t *zhp, void *data)
+{
+	char buf[ZFS_MAXPROPLEN];
+	char rbuf[ZFS_MAXPROPLEN];
+	zprop_source_t sourcetype;
+	char source[ZFS_MAXNAMELEN];
+	zprop_get_cbdata_t *cbp = data;
+	nvlist_t *user_props = zfs_get_user_props(zhp);
+	zprop_list_t *pl = cbp->cb_proplist;
+	nvlist_t *propval;
+	char *strval;
+	char *sourceval;
+	boolean_t received = is_recvd_column(cbp);
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		char *recvdval = NULL;
+		/*
+		 * Skip the special fake placeholder.  This will also skip over
+		 * the name property when 'all' is specified.
+		 */
+		if (pl->pl_prop == ZFS_PROP_NAME &&
+		    pl == cbp->cb_proplist)
+			continue;
+
+		if (pl->pl_prop != ZPROP_INVAL) {
+			if (zfs_prop_get(zhp, pl->pl_prop, buf,
+			    sizeof (buf), &sourcetype, source,
+			    sizeof (source),
+			    cbp->cb_literal) != 0) {
+				if (pl->pl_all)
+					continue;
+				if (!zfs_prop_valid_for_type(pl->pl_prop,
+				    ZFS_TYPE_DATASET, B_FALSE)) {
+					(void) fprintf(stderr,
+					    gettext("No such property '%s'\n"),
+					    zfs_prop_to_name(pl->pl_prop));
+					continue;
+				}
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			if (received && (zfs_prop_get_recvd(zhp,
+			    zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
+			    cbp->cb_literal) == 0))
+				recvdval = rbuf;
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    zfs_prop_to_name(pl->pl_prop),
+			    buf, sourcetype, source, recvdval);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source, NULL);
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source, NULL);
+		} else {
+			if (nvlist_lookup_nvlist(user_props,
+			    pl->pl_user_prop, &propval) != 0) {
+				if (pl->pl_all)
+					continue;
+				sourcetype = ZPROP_SRC_NONE;
+				strval = "-";
+			} else {
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_VALUE, &strval) == 0);
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_SOURCE, &sourceval) == 0);
+
+				if (strcmp(sourceval,
+				    zfs_get_name(zhp)) == 0) {
+					sourcetype = ZPROP_SRC_LOCAL;
+				} else if (strcmp(sourceval,
+				    ZPROP_SOURCE_VAL_RECVD) == 0) {
+					sourcetype = ZPROP_SRC_RECEIVED;
+				} else {
+					sourcetype = ZPROP_SRC_INHERITED;
+					(void) strlcpy(source,
+					    sourceval, sizeof (source));
+				}
+			}
+
+			if (received && (zfs_prop_get_recvd(zhp,
+			    pl->pl_user_prop, rbuf, sizeof (rbuf),
+			    cbp->cb_literal) == 0))
+				recvdval = rbuf;
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, strval, sourcetype,
+			    source, recvdval);
+		}
+	}
+
+	return (0);
+}
+
+static int
+zfs_do_get(int argc, char **argv)
+{
+	zprop_get_cbdata_t cb = { 0 };
+	int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+	int types = ZFS_TYPE_DATASET;
+	char *value, *fields;
+	int ret = 0;
+	int limit = 0;
+	zprop_list_t fake_name = { 0 };
+
+	/*
+	 * Set up default columns and sources.
+	 */
+	cb.cb_sources = ZPROP_SRC_ALL;
+	cb.cb_columns[0] = GET_COL_NAME;
+	cb.cb_columns[1] = GET_COL_PROPERTY;
+	cb.cb_columns[2] = GET_COL_VALUE;
+	cb.cb_columns[3] = GET_COL_SOURCE;
+	cb.cb_type = ZFS_TYPE_DATASET;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
+		switch (c) {
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case 'o':
+			/*
+			 * Process the set of columns to display.  We zero out
+			 * the structure to give us a blank slate.
+			 */
+			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+			i = 0;
+			while (*optarg != '\0') {
+				static char *col_subopts[] =
+				    { "name", "property", "value", "received",
+				    "source", "all", NULL };
+
+				if (i == ZFS_GET_NCOLS) {
+					(void) fprintf(stderr, gettext("too "
+					    "many fields given to -o "
+					    "option\n"));
+					usage(B_FALSE);
+				}
+
+				switch (getsubopt(&optarg, col_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_columns[i++] = GET_COL_NAME;
+					break;
+				case 1:
+					cb.cb_columns[i++] = GET_COL_PROPERTY;
+					break;
+				case 2:
+					cb.cb_columns[i++] = GET_COL_VALUE;
+					break;
+				case 3:
+					cb.cb_columns[i++] = GET_COL_RECVD;
+					flags |= ZFS_ITER_RECVD_PROPS;
+					break;
+				case 4:
+					cb.cb_columns[i++] = GET_COL_SOURCE;
+					break;
+				case 5:
+					if (i > 0) {
+						(void) fprintf(stderr,
+						    gettext("\"all\" conflicts "
+						    "with specific fields "
+						    "given to -o option\n"));
+						usage(B_FALSE);
+					}
+					cb.cb_columns[0] = GET_COL_NAME;
+					cb.cb_columns[1] = GET_COL_PROPERTY;
+					cb.cb_columns[2] = GET_COL_VALUE;
+					cb.cb_columns[3] = GET_COL_RECVD;
+					cb.cb_columns[4] = GET_COL_SOURCE;
+					flags |= ZFS_ITER_RECVD_PROPS;
+					i = ZFS_GET_NCOLS;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid column name "
+					    "'%s'\n"), value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case 's':
+			cb.cb_sources = 0;
+			while (*optarg != '\0') {
+				static char *source_subopts[] = {
+					"local", "default", "inherited",
+					"received", "temporary", "none",
+					NULL };
+
+				switch (getsubopt(&optarg, source_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_sources |= ZPROP_SRC_LOCAL;
+					break;
+				case 1:
+					cb.cb_sources |= ZPROP_SRC_DEFAULT;
+					break;
+				case 2:
+					cb.cb_sources |= ZPROP_SRC_INHERITED;
+					break;
+				case 3:
+					cb.cb_sources |= ZPROP_SRC_RECEIVED;
+					break;
+				case 4:
+					cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+					break;
+				case 5:
+					cb.cb_sources |= ZPROP_SRC_NONE;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid source "
+					    "'%s'\n"), value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case 't':
+			types = 0;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 3:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 4:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	fields = argv[0];
+
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	argc--;
+	argv++;
+
+	/*
+	 * As part of zfs_expand_proplist(), we keep track of the maximum column
+	 * width for each property.  For the 'NAME' (and 'SOURCE') columns, we
+	 * need to know the maximum name length.  However, the user likely did
+	 * not specify 'name' as one of the properties to fetch, so we need to
+	 * make sure we always include at least this property for
+	 * print_get_headers() to work properly.
+	 */
+	if (cb.cb_proplist != NULL) {
+		fake_name.pl_prop = ZFS_PROP_NAME;
+		fake_name.pl_width = strlen(gettext("NAME"));
+		fake_name.pl_next = cb.cb_proplist;
+		cb.cb_proplist = &fake_name;
+	}
+
+	cb.cb_first = B_TRUE;
+
+	/* run for each object */
+	ret = zfs_for_each(argc, argv, flags, types, NULL,
+	    &cb.cb_proplist, limit, get_callback, &cb);
+
+	if (cb.cb_proplist == &fake_name)
+		zprop_free_list(fake_name.pl_next);
+	else
+		zprop_free_list(cb.cb_proplist);
+
+	return (ret);
+}
+
+/*
+ * inherit [-rS] <property> <fs|vol> ...
+ *
+ *	-r	Recurse over all children
+ *	-S	Revert to received value, if any
+ *
+ * For each dataset specified on the command line, inherit the given property
+ * from its parent.  Inheriting a property at the pool level will cause it to
+ * use the default value.  The '-r' flag will recurse over all children, and is
+ * useful for setting a property on a hierarchy-wide basis, regardless of any
+ * local modifications for each dataset.
+ */
+
+typedef struct inherit_cbdata {
+	const char *cb_propname;
+	boolean_t cb_received;
+} inherit_cbdata_t;
+
+static int
+inherit_recurse_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+	zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
+
+	/*
+	 * If we're doing it recursively, then ignore properties that
+	 * are not valid for this type of dataset.
+	 */
+	if (prop != ZPROP_INVAL &&
+	    !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE))
+		return (0);
+
+	return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+static int
+inherit_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+
+	return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+	int c;
+	zfs_prop_t prop;
+	inherit_cbdata_t cb = { 0 };
+	char *propname;
+	int ret = 0;
+	int flags = 0;
+	boolean_t received = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rS")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'S':
+			received = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+
+	propname = argv[0];
+	argc--;
+	argv++;
+
+	if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+		if (zfs_prop_readonly(prop)) {
+			(void) fprintf(stderr, gettext(
+			    "%s property is read-only\n"),
+			    propname);
+			return (1);
+		}
+		if (!zfs_prop_inheritable(prop) && !received) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be inherited\n"), propname);
+			if (prop == ZFS_PROP_QUOTA ||
+			    prop == ZFS_PROP_RESERVATION ||
+			    prop == ZFS_PROP_REFQUOTA ||
+			    prop == ZFS_PROP_REFRESERVATION) {
+				(void) fprintf(stderr, gettext("use 'zfs set "
+				    "%s=none' to clear\n"), propname);
+				(void) fprintf(stderr, gettext("use 'zfs "
+				    "inherit -S %s' to revert to received "
+				    "value\n"), propname);
+			}
+			return (1);
+		}
+		if (received && (prop == ZFS_PROP_VOLSIZE ||
+		    prop == ZFS_PROP_VERSION)) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be reverted to a received value\n"), propname);
+			return (1);
+		}
+	} else if (!zfs_prop_user(propname)) {
+		(void) fprintf(stderr, gettext("invalid property '%s'\n"),
+		    propname);
+		usage(B_FALSE);
+	}
+
+	cb.cb_propname = propname;
+	cb.cb_received = received;
+
+	if (flags & ZFS_ITER_RECURSE) {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_recurse_cb, &cb);
+	} else {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_cb, &cb);
+	}
+
+	return (ret);
+}
+
+typedef struct upgrade_cbdata {
+	uint64_t cb_numupgraded;
+	uint64_t cb_numsamegraded;
+	uint64_t cb_numfailed;
+	uint64_t cb_version;
+	boolean_t cb_newer;
+	boolean_t cb_foundone;
+	char cb_lastfs[ZFS_MAXNAMELEN];
+} upgrade_cbdata_t;
+
+static int
+same_pool(zfs_handle_t *zhp, const char *name)
+{
+	int len1 = strcspn(name, "/@");
+	const char *zhname = zfs_get_name(zhp);
+	int len2 = strcspn(zhname, "/@");
+
+	if (len1 != len2)
+		return (B_FALSE);
+	return (strncmp(name, zhname, len1) == 0);
+}
+
+static int
+upgrade_list_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+
+	/* list if it's old/new */
+	if ((!cb->cb_newer && version < ZPL_VERSION) ||
+	    (cb->cb_newer && version > ZPL_VERSION)) {
+		char *str;
+		if (cb->cb_newer) {
+			str = gettext("The following filesystems are "
+			    "formatted using a newer software version and\n"
+			    "cannot be accessed on the current system.\n\n");
+		} else {
+			str = gettext("The following filesystems are "
+			    "out of date, and can be upgraded.  After being\n"
+			    "upgraded, these filesystems (and any 'zfs send' "
+			    "streams generated from\n"
+			    "subsequent snapshots) will no longer be "
+			    "accessible by older software versions.\n\n");
+		}
+
+		if (!cb->cb_foundone) {
+			(void) puts(str);
+			(void) printf(gettext("VER  FILESYSTEM\n"));
+			(void) printf(gettext("---  ------------\n"));
+			cb->cb_foundone = B_TRUE;
+		}
+
+		(void) printf("%2u   %s\n", version, zfs_get_name(zhp));
+	}
+
+	return (0);
+}
+
+static int
+upgrade_set_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+	int needed_spa_version;
+	int spa_version;
+
+	if (zfs_spa_version(zhp, &spa_version) < 0)
+		return (-1);
+
+	needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+	if (needed_spa_version < 0)
+		return (-1);
+
+	if (spa_version < needed_spa_version) {
+		/* can't upgrade */
+		(void) printf(gettext("%s: can not be "
+		    "upgraded; the pool version needs to first "
+		    "be upgraded\nto version %d\n\n"),
+		    zfs_get_name(zhp), needed_spa_version);
+		cb->cb_numfailed++;
+		return (0);
+	}
+
+	/* upgrade */
+	if (version < cb->cb_version) {
+		char verstr[16];
+		(void) snprintf(verstr, sizeof (verstr),
+		    "%llu", (u_longlong_t)cb->cb_version);
+		if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
+			/*
+			 * If they did "zfs upgrade -a", then we could
+			 * be doing ioctls to different pools.  We need
+			 * to log this history once to each pool, and bypass
+			 * the normal history logging that happens in main().
+			 */
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
+		}
+		if (zfs_prop_set(zhp, "version", verstr) == 0)
+			cb->cb_numupgraded++;
+		else
+			cb->cb_numfailed++;
+		(void) strcpy(cb->cb_lastfs, zfs_get_name(zhp));
+	} else if (version > cb->cb_version) {
+		/* can't downgrade */
+		(void) printf(gettext("%s: can not be downgraded; "
+		    "it is already at version %u\n"),
+		    zfs_get_name(zhp), version);
+		cb->cb_numfailed++;
+	} else {
+		cb->cb_numsamegraded++;
+	}
+	return (0);
+}
+
+/*
+ * zfs upgrade
+ * zfs upgrade -v
+ * zfs upgrade [-r] [-V <version>] <-a | filesystem>
+ */
+static int
+zfs_do_upgrade(int argc, char **argv)
+{
+	boolean_t all = B_FALSE;
+	boolean_t showversions = B_FALSE;
+	int ret = 0;
+	upgrade_cbdata_t cb = { 0 };
+	signed char c;
+	int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rvV:a")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'v':
+			showversions = B_TRUE;
+			break;
+		case 'V':
+			if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
+			    optarg, &cb.cb_version) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid version %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'a':
+			all = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version))
+		usage(B_FALSE);
+	if (showversions && (flags & ZFS_ITER_RECURSE || all ||
+	    cb.cb_version || argc))
+		usage(B_FALSE);
+	if ((all || argc) && (showversions))
+		usage(B_FALSE);
+	if (all && argc)
+		usage(B_FALSE);
+
+	if (showversions) {
+		/* Show info on available versions. */
+		(void) printf(gettext("The following filesystem versions are "
+		    "supported:\n\n"));
+		(void) printf(gettext("VER  DESCRIPTION\n"));
+		(void) printf("---  -----------------------------------------"
+		    "---------------\n");
+		(void) printf(gettext(" 1   Initial ZFS filesystem version\n"));
+		(void) printf(gettext(" 2   Enhanced directory entries\n"));
+		(void) printf(gettext(" 3   Case insensitive and filesystem "
+		    "user identifier (FUID)\n"));
+		(void) printf(gettext(" 4   userquota, groupquota "
+		    "properties\n"));
+		(void) printf(gettext(" 5   System attributes\n"));
+		(void) printf(gettext("\nFor more information on a particular "
+		    "version, including supported releases,\n"));
+		(void) printf("see the ZFS Administration Guide.\n\n");
+		ret = 0;
+	} else if (argc || all) {
+		/* Upgrade filesystems */
+		if (cb.cb_version == 0)
+			cb.cb_version = ZPL_VERSION;
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_set_callback, &cb);
+		(void) printf(gettext("%llu filesystems upgraded\n"),
+		    (u_longlong_t)cb.cb_numupgraded);
+		if (cb.cb_numsamegraded) {
+			(void) printf(gettext("%llu filesystems already at "
+			    "this version\n"),
+			    (u_longlong_t)cb.cb_numsamegraded);
+		}
+		if (cb.cb_numfailed != 0)
+			ret = 1;
+	} else {
+		/* List old-version filesytems */
+		boolean_t found;
+		(void) printf(gettext("This system is currently running "
+		    "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
+
+		flags |= ZFS_ITER_RECURSE;
+		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+
+		found = cb.cb_foundone;
+		cb.cb_foundone = B_FALSE;
+		cb.cb_newer = B_TRUE;
+
+		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+
+		if (!cb.cb_foundone && !found) {
+			(void) printf(gettext("All filesystems are "
+			    "formatted with the current version.\n"));
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *               [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *                [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ *
+ *	-H      Scripted mode; elide headers and separate columns by tabs.
+ *	-i	Translate SID to POSIX ID.
+ *	-n	Print numeric ID instead of user/group name.
+ *	-o      Control which fields to display.
+ *	-p	Use exact (parsable) numeric output.
+ *	-s      Specify sort columns, descending order.
+ *	-S      Specify sort columns, ascending order.
+ *	-t      Control which object types to display.
+ *
+ *	Displays space consumed by, and quotas on, each user in the specified
+ *	filesystem or snapshot.
+ */
+
+/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
+enum us_field_types {
+	USFIELD_TYPE,
+	USFIELD_NAME,
+	USFIELD_USED,
+	USFIELD_QUOTA
+};
+static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
+static char *us_field_names[] = { "type", "name", "used", "quota" };
+#define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (char *))
+
+#define	USTYPE_PSX_GRP	(1 << 0)
+#define	USTYPE_PSX_USR	(1 << 1)
+#define	USTYPE_SMB_GRP	(1 << 2)
+#define	USTYPE_SMB_USR	(1 << 3)
+#define	USTYPE_ALL	\
+	(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)
+
+static int us_type_bits[] = {
+	USTYPE_PSX_GRP,
+	USTYPE_PSX_USR,
+	USTYPE_SMB_GRP,
+	USTYPE_SMB_USR,
+	USTYPE_ALL
+};
+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
+	"smbuser", "all" };
+
+typedef struct us_node {
+	nvlist_t	*usn_nvl;
+	uu_avl_node_t	usn_avlnode;
+	uu_list_node_t	usn_listnode;
+} us_node_t;
+
+typedef struct us_cbdata {
+	nvlist_t	**cb_nvlp;
+	uu_avl_pool_t	*cb_avl_pool;
+	uu_avl_t	*cb_avl;
+	boolean_t	cb_numname;
+	boolean_t	cb_nicenum;
+	boolean_t	cb_sid2posix;
+	zfs_userquota_prop_t cb_prop;
+	zfs_sort_column_t *cb_sortcol;
+	size_t		cb_width[USFIELD_LAST];
+} us_cbdata_t;
+
+static boolean_t us_populated = B_FALSE;
+
+typedef struct {
+	zfs_sort_column_t *si_sortcol;
+	boolean_t	si_numname;
+} us_sort_info_t;
+
+static int
+us_field_index(char *field)
+{
+	int i;
+
+	for (i = 0; i < USFIELD_LAST; i++) {
+		if (strcmp(field, us_field_names[i]) == 0)
+			return (i);
+	}
+
+	return (-1);
+}
+
+static int
+us_compare(const void *larg, const void *rarg, void *unused)
+{
+	const us_node_t *l = larg;
+	const us_node_t *r = rarg;
+	us_sort_info_t *si = (us_sort_info_t *)unused;
+	zfs_sort_column_t *sortcol = si->si_sortcol;
+	boolean_t numname = si->si_numname;
+	nvlist_t *lnvl = l->usn_nvl;
+	nvlist_t *rnvl = r->usn_nvl;
+	int rc = 0;
+	boolean_t lvb, rvb;
+
+	for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+		char *lvstr = "";
+		char *rvstr = "";
+		uint32_t lv32 = 0;
+		uint32_t rv32 = 0;
+		uint64_t lv64 = 0;
+		uint64_t rv64 = 0;
+		zfs_prop_t prop = sortcol->sc_prop;
+		const char *propname = NULL;
+		boolean_t reverse = sortcol->sc_reverse;
+
+		switch (prop) {
+		case ZFS_PROP_TYPE:
+			propname = "type";
+			(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+			(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+			if (rv32 != lv32)
+				rc = (rv32 < lv32) ? 1 : -1;
+			break;
+		case ZFS_PROP_NAME:
+			propname = "name";
+			if (numname) {
+				(void) nvlist_lookup_uint64(lnvl, propname,
+				    &lv64);
+				(void) nvlist_lookup_uint64(rnvl, propname,
+				    &rv64);
+				if (rv64 != lv64)
+					rc = (rv64 < lv64) ? 1 : -1;
+			} else {
+				(void) nvlist_lookup_string(lnvl, propname,
+				    &lvstr);
+				(void) nvlist_lookup_string(rnvl, propname,
+				    &rvstr);
+				rc = strcmp(lvstr, rvstr);
+			}
+			break;
+		case ZFS_PROP_USED:
+		case ZFS_PROP_QUOTA:
+			if (!us_populated)
+				break;
+			if (prop == ZFS_PROP_USED)
+				propname = "used";
+			else
+				propname = "quota";
+			(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+			(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+			if (rv64 != lv64)
+				rc = (rv64 < lv64) ? 1 : -1;
+			break;
+		default:
+			break;
+		}
+
+		if (rc != 0) {
+			if (rc < 0)
+				return (reverse ? 1 : -1);
+			else
+				return (reverse ? -1 : 1);
+		}
+	}
+
+	/*
+	 * If entries still seem to be the same, check if they are of the same
+	 * type (smbentity is added only if we are doing SID to POSIX ID
+	 * translation where we can have duplicate type/name combinations).
+	 */
+	if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
+	    nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
+	    lvb != rvb)
+		return (lvb < rvb ? -1 : 1);
+
+	return (0);
+}
+
+static inline const char *
+us_type2str(unsigned field_type)
+{
+	switch (field_type) {
+	case USTYPE_PSX_USR:
+		return ("POSIX User");
+	case USTYPE_PSX_GRP:
+		return ("POSIX Group");
+	case USTYPE_SMB_USR:
+		return ("SMB User");
+	case USTYPE_SMB_GRP:
+		return ("SMB Group");
+	default:
+		return ("Undefined");
+	}
+}
+
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+	us_cbdata_t *cb = (us_cbdata_t *)arg;
+	zfs_userquota_prop_t prop = cb->cb_prop;
+	char *name = NULL;
+	char *propname;
+	char sizebuf[32];
+	us_node_t *node;
+	uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
+	uu_avl_t *avl = cb->cb_avl;
+	uu_avl_index_t idx;
+	nvlist_t *props;
+	us_node_t *n;
+	zfs_sort_column_t *sortcol = cb->cb_sortcol;
+	unsigned type = 0;
+	const char *typestr;
+	size_t namelen;
+	size_t typelen;
+	size_t sizelen;
+	int typeidx, nameidx, sizeidx;
+	us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
+	boolean_t smbentity = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	node = safe_malloc(sizeof (us_node_t));
+	uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
+	node->usn_nvl = props;
+
+	if (domain != NULL && domain[0] != '\0') {
+#ifdef HAVE_IDMAP
+		/* SMB */
+		char sid[ZFS_MAXNAMELEN + 32];
+		uid_t id;
+		uint64_t classes;
+		int err;
+		directory_error_t e;
+
+		smbentity = B_TRUE;
+
+		(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+
+		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+			type = USTYPE_SMB_GRP;
+			err = sid_to_id(sid, B_FALSE, &id);
+		} else {
+			type = USTYPE_SMB_USR;
+			err = sid_to_id(sid, B_TRUE, &id);
+		}
+
+		if (err == 0) {
+			rid = id;
+			if (!cb->cb_sid2posix) {
+				e = directory_name_from_sid(NULL, sid, &name,
+				    &classes);
+				if (e != NULL)
+					directory_error_free(e);
+				if (name == NULL)
+					name = sid;
+			}
+		}
+#else
+		nvlist_free(props);
+		free(node);
+
+		return (-1);
+#endif /* HAVE_IDMAP */
+	}
+
+	if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
+		/* POSIX or -i */
+		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+			type = USTYPE_PSX_GRP;
+			if (!cb->cb_numname) {
+				struct group *g;
+
+				if ((g = getgrgid(rid)) != NULL)
+					name = g->gr_name;
+			}
+		} else {
+			type = USTYPE_PSX_USR;
+			if (!cb->cb_numname) {
+				struct passwd *p;
+
+				if ((p = getpwuid(rid)) != NULL)
+					name = p->pw_name;
+			}
+		}
+	}
+
+	/*
+	 * Make sure that the type/name combination is unique when doing
+	 * SID to POSIX ID translation (hence changing the type from SMB to
+	 * POSIX).
+	 */
+	if (cb->cb_sid2posix &&
+	    nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
+		nomem();
+
+	/* Calculate/update width of TYPE field */
+	typestr = us_type2str(type);
+	typelen = strlen(gettext(typestr));
+	typeidx = us_field_index("type");
+	if (typelen > cb->cb_width[typeidx])
+		cb->cb_width[typeidx] = typelen;
+	if (nvlist_add_uint32(props, "type", type) != 0)
+		nomem();
+
+	/* Calculate/update width of NAME field */
+	if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
+		if (nvlist_add_uint64(props, "name", rid) != 0)
+			nomem();
+		namelen = snprintf(NULL, 0, "%u", rid);
+	} else {
+		if (nvlist_add_string(props, "name", name) != 0)
+			nomem();
+		namelen = strlen(name);
+	}
+	nameidx = us_field_index("name");
+	if (namelen > cb->cb_width[nameidx])
+		cb->cb_width[nameidx] = namelen;
+
+	/*
+	 * Check if this type/name combination is in the list and update it;
+	 * otherwise add new node to the list.
+	 */
+	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
+		uu_avl_insert(avl, node, idx);
+	} else {
+		nvlist_free(props);
+		free(node);
+		node = n;
+		props = node->usn_nvl;
+	}
+
+	/* Calculate/update width of USED/QUOTA fields */
+	if (cb->cb_nicenum)
+		zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+	else
+		(void) snprintf(sizebuf, sizeof (sizebuf), "%llu",
+		    (u_longlong_t)space);
+	sizelen = strlen(sizebuf);
+	if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) {
+		propname = "used";
+		if (!nvlist_exists(props, "quota"))
+			(void) nvlist_add_uint64(props, "quota", 0);
+	} else {
+		propname = "quota";
+		if (!nvlist_exists(props, "used"))
+			(void) nvlist_add_uint64(props, "used", 0);
+	}
+	sizeidx = us_field_index(propname);
+	if (sizelen > cb->cb_width[sizeidx])
+		cb->cb_width[sizeidx] = sizelen;
+
+	if (nvlist_add_uint64(props, propname, space) != 0)
+		nomem();
+
+	return (0);
+}
+
+static void
+print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, us_node_t *node)
+{
+	nvlist_t *nvl = node->usn_nvl;
+	char valstr[ZFS_MAXNAMELEN];
+	boolean_t first = B_TRUE;
+	int cfield = 0;
+	int field;
+	uint32_t ustype;
+
+	/* Check type */
+	(void) nvlist_lookup_uint32(nvl, "type", &ustype);
+	if (!(ustype & types))
+		return;
+
+	while ((field = fields[cfield]) != USFIELD_LAST) {
+		nvpair_t *nvp = NULL;
+		data_type_t type;
+		uint32_t val32;
+		uint64_t val64;
+		char *strval = NULL;
+
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+			if (strcmp(nvpair_name(nvp),
+			    us_field_names[field]) == 0)
+				break;
+		}
+
+		type = nvpair_type(nvp);
+		switch (type) {
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(nvp, &val32);
+			break;
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &val64);
+			break;
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &strval);
+			break;
+		default:
+			(void) fprintf(stderr, "invalid data type\n");
+		}
+
+		switch (field) {
+		case USFIELD_TYPE:
+			strval = (char *)us_type2str(val32);
+			break;
+		case USFIELD_NAME:
+			if (type == DATA_TYPE_UINT64) {
+				(void) sprintf(valstr, "%llu",
+				    (u_longlong_t) val64);
+				strval = valstr;
+			}
+			break;
+		case USFIELD_USED:
+		case USFIELD_QUOTA:
+			if (type == DATA_TYPE_UINT64) {
+				if (parsable) {
+					(void) sprintf(valstr, "%llu",
+					    (u_longlong_t) val64);
+				} else {
+					zfs_nicenum(val64, valstr,
+					    sizeof (valstr));
+				}
+				if (field == USFIELD_QUOTA &&
+				    strcmp(valstr, "0") == 0)
+					strval = "none";
+				else
+					strval = valstr;
+			}
+			break;
+		}
+
+		if (!first) {
+			if (scripted)
+				(void) printf("\t");
+			else
+				(void) printf("  ");
+		}
+		if (scripted)
+			(void) printf("%s", strval);
+		else if (field == USFIELD_TYPE || field == USFIELD_NAME)
+			(void) printf("%-*s", (int) width[field], strval);
+		else
+			(void) printf("%*s", (int) width[field], strval);
+
+		first = B_FALSE;
+		cfield++;
+	}
+
+	(void) printf("\n");
+}
+
+static void
+print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, boolean_t rmnode, uu_avl_t *avl)
+{
+	us_node_t *node;
+	const char *col;
+	int cfield = 0;
+	int field;
+
+	if (!scripted) {
+		boolean_t first = B_TRUE;
+
+		while ((field = fields[cfield]) != USFIELD_LAST) {
+			col = gettext(us_field_hdr[field]);
+			if (field == USFIELD_TYPE || field == USFIELD_NAME) {
+				(void) printf(first ? "%-*s" : "  %-*s",
+				    (int) width[field], col);
+			} else {
+				(void) printf(first ? "%*s" : "  %*s",
+				    (int) width[field], col);
+			}
+			first = B_FALSE;
+			cfield++;
+		}
+		(void) printf("\n");
+	}
+
+	for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
+		print_us_node(scripted, parsable, fields, types, width, node);
+		if (rmnode)
+			nvlist_free(node->usn_nvl);
+	}
+}
+
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+	uu_avl_pool_t *avl_pool;
+	uu_avl_t *avl_tree;
+	uu_avl_walk_t *walk;
+	char *delim;
+	char deffields[] = "type,name,used,quota";
+	char *ofield = NULL;
+	char *tfield = NULL;
+	int cfield = 0;
+	int fields[256];
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t prtnum = B_FALSE;
+	boolean_t parsable = B_FALSE;
+	boolean_t sid2posix = B_FALSE;
+	int ret = 0;
+	int c;
+	zfs_sort_column_t *sortcol = NULL;
+	int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+	us_cbdata_t cb;
+	us_node_t *node;
+	us_node_t *rmnode;
+	uu_list_pool_t *listpool;
+	uu_list_t *list;
+	uu_avl_index_t idx = 0;
+	uu_list_index_t idx2 = 0;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	if (strcmp(argv[0], "groupspace") == 0)
+		/* Toggle default group types */
+		types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+
+	while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+		switch (c) {
+		case 'n':
+			prtnum = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 'o':
+			ofield = optarg;
+			break;
+		case 's':
+		case 'S':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    c == 's' ? B_FALSE : B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid field '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			tfield = optarg;
+			break;
+		case 'i':
+			sid2posix = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Use default output fields if not specified using -o */
+	if (ofield == NULL)
+		ofield = deffields;
+	do {
+		if ((delim = strchr(ofield, ',')) != NULL)
+			*delim = '\0';
+		if ((fields[cfield++] = us_field_index(ofield)) == -1) {
+			(void) fprintf(stderr, gettext("invalid type '%s' "
+			    "for -o option\n"), ofield);
+			return (-1);
+		}
+		if (delim != NULL)
+			ofield = delim + 1;
+	} while (delim != NULL);
+	fields[cfield] = USFIELD_LAST;
+
+	/* Override output types (-t option) */
+	if (tfield != NULL) {
+		types = 0;
+
+		do {
+			boolean_t found = B_FALSE;
+
+			if ((delim = strchr(tfield, ',')) != NULL)
+				*delim = '\0';
+			for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
+			    i++) {
+				if (strcmp(tfield, us_type_names[i]) == 0) {
+					found = B_TRUE;
+					types |= us_type_bits[i];
+					break;
+				}
+			}
+			if (!found) {
+				(void) fprintf(stderr, gettext("invalid type "
+				    "'%s' for -t option\n"), tfield);
+				return (-1);
+			}
+			if (delim != NULL)
+				tfield = delim + 1;
+		} while (delim != NULL);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
+
+	/* Always add default sorting columns */
+	(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
+	(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_numname = prtnum;
+	cb.cb_nicenum = !parsable;
+	cb.cb_avl_pool = avl_pool;
+	cb.cb_avl = avl_tree;
+	cb.cb_sid2posix = sid2posix;
+
+	for (i = 0; i < USFIELD_LAST; i++)
+		cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
+		    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
+		    ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
+		    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
+			continue;
+		cb.cb_prop = p;
+		if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0)
+			return (ret);
+	}
+
+	/* Sort the list */
+	if ((node = uu_avl_first(avl_tree)) == NULL)
+		return (0);
+
+	us_populated = B_TRUE;
+
+	listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
+	list = uu_list_create(listpool, NULL, UU_DEFAULT);
+	uu_list_node_init(node, &node->usn_listnode, listpool);
+
+	while (node != NULL) {
+		rmnode = node;
+		node = uu_avl_next(avl_tree, node);
+		uu_avl_remove(avl_tree, rmnode);
+		if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
+			uu_list_insert(list, rmnode, idx2);
+	}
+
+	for (node = uu_list_first(list); node != NULL;
+	    node = uu_list_next(list, node)) {
+		us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+
+		if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
+			uu_avl_insert(avl_tree, node, idx);
+	}
+
+	uu_list_destroy(list);
+	uu_list_pool_destroy(listpool);
+
+	/* Print and free node nvlist memory */
+	print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
+	    cb.cb_avl);
+
+	zfs_free_sort_columns(sortcol);
+
+	/* Clean up the AVL tree */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(avl_tree);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
+
+/*
+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property]
+ *      [-t type[,...]] [filesystem|volume|snapshot] ...
+ *
+ *	-H	Scripted mode; elide headers and separate columns by tabs
+ *	-p	Display values in parsable (literal) format.
+ *	-r	Recurse over all children
+ *	-d	Limit recursion by depth.
+ *	-o	Control which fields to display.
+ *	-s	Specify sort columns, descending order.
+ *	-S	Specify sort columns, ascending order.
+ *	-t	Control which object types to display.
+ *
+ * When given no arguments, list all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+	boolean_t	cb_first;
+	boolean_t	cb_literal;
+	boolean_t	cb_scripted;
+	zprop_list_t	*cb_proplist;
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	int i;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			(void) printf("  ");
+		} else {
+			first = B_FALSE;
+		}
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
+				headerbuf[i] = toupper(pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, header);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	boolean_t first = B_TRUE;
+	char property[ZFS_MAXPROPLEN];
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	nvlist_t *propval;
+	char *propstr;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			if (cb->cb_scripted)
+				(void) printf("\t");
+			else
+				(void) printf("  ");
+		} else {
+			first = B_FALSE;
+		}
+
+		if (pl->pl_prop == ZFS_PROP_NAME) {
+			(void) strlcpy(property, zfs_get_name(zhp),
+			    sizeof (property));
+			propstr = property;
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (pl->pl_prop != ZPROP_INVAL) {
+			if (zfs_prop_get(zhp, pl->pl_prop, property,
+			    sizeof (property), NULL, NULL, 0,
+			    cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
+		} else {
+			if (nvlist_lookup_nvlist(userprops,
+			    pl->pl_user_prop, &propval) != 0)
+				propstr = "-";
+			else
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_VALUE, &propstr) == 0);
+			right_justify = B_FALSE;
+		}
+
+		/*
+		 * If this is being called in scripted mode, or if this is the
+		 * last column and it is left-justified, don't include a width
+		 * format specifier.
+		 */
+		if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
+			(void) printf("%s", propstr);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, propstr);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, propstr);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *cbp = data;
+
+	if (cbp->cb_first) {
+		if (!cbp->cb_scripted)
+			print_header(cbp);
+		cbp->cb_first = B_FALSE;
+	}
+
+	print_dataset(zhp, cbp);
+
+	return (0);
+}
+
+static int
+zfs_do_list(int argc, char **argv)
+{
+	int c;
+	static char default_fields[] =
+	    "name,used,available,referenced,mountpoint";
+	int types = ZFS_TYPE_DATASET;
+	boolean_t types_specified = B_FALSE;
+	char *fields = NULL;
+	list_cbdata_t cb = { 0 };
+	char *value;
+	int limit = 0;
+	int ret = 0;
+	zfs_sort_column_t *sortcol = NULL;
+	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
+		switch (c) {
+		case 'o':
+			fields = optarg;
+			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			flags |= ZFS_ITER_LITERAL_PROPS;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 's':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_FALSE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'S':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			types = 0;
+			types_specified = B_TRUE;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "snap", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:
+				case 3:
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 4:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 5:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (fields == NULL)
+		fields = default_fields;
+
+	/*
+	 * If we are only going to list snapshot names and sort by name,
+	 * then we can use faster version.
+	 */
+	if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
+		flags |= ZFS_ITER_SIMPLE;
+
+	/*
+	 * If "-o space" and no types were specified, don't display snapshots.
+	 */
+	if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
+		types &= ~ZFS_TYPE_SNAPSHOT;
+
+	/*
+	 * If the user specifies '-o all', the zprop_get_list() doesn't
+	 * normally include the name of the dataset.  For 'zfs list', we always
+	 * want this property to be first.
+	 */
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	cb.cb_first = B_TRUE;
+
+	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
+	    limit, list_callback, &cb);
+
+	zprop_free_list(cb.cb_proplist);
+	zfs_free_sort_columns(sortcol);
+
+	if (ret == 0 && cb.cb_first && !cb.cb_scripted)
+		(void) fprintf(stderr, gettext("no datasets available\n"));
+
+	return (ret);
+}
+
+/*
+ * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-f] -p <fs | vol> <fs | vol>
+ * zfs rename -r <snap> <snap>
+ *
+ * Renames the given dataset to another of the same type.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int c;
+	int ret = 0;
+	boolean_t recurse = B_FALSE;
+	boolean_t parents = B_FALSE;
+	boolean_t force_unmount = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "prf")) != -1) {
+		switch (c) {
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case 'r':
+			recurse = B_TRUE;
+			break;
+		case 'f':
+			force_unmount = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && parents) {
+		(void) fprintf(stderr, gettext("-p and -r options are mutually "
+		    "exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && strchr(argv[0], '@') == 0) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	/* If we were asked and the name looks good, try to create ancestors. */
+	if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
+	    zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs promote <fs>
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing clone filesystem"
+		    " argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return (1);
+
+	ret = (zfs_promote(zhp) != 0);
+
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs rollback [-rRf] <snapshot>
+ *
+ *	-r	Delete any intervening snapshots before doing rollback
+ *	-R	Delete any snapshots and their clones
+ *	-f	ignored for backwards compatability
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset.  If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+typedef struct rollback_cbdata {
+	uint64_t	cb_create;
+	boolean_t	cb_first;
+	int		cb_doclones;
+	char		*cb_target;
+	int		cb_error;
+	boolean_t	cb_recurse;
+} rollback_cbdata_t;
+
+static int
+rollback_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+
+	if (cbp->cb_first && cbp->cb_recurse) {
+		(void) fprintf(stderr, gettext("cannot rollback to "
+		    "'%s': clones of previous snapshots exist\n"),
+		    cbp->cb_target);
+		(void) fprintf(stderr, gettext("use '-R' to "
+		    "force deletion of the following clones and "
+		    "dependents:\n"));
+		cbp->cb_first = 0;
+		cbp->cb_error = 1;
+	}
+
+	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+
+	zfs_close(zhp);
+	return (0);
+}
+
+
+/*
+ * Report any snapshots more recent than the one specified.  Used when '-r' is
+ * not specified.  We reuse this same callback for the snapshot dependents - if
+ * 'cb_dependent' is set, then this is a dependent and we should report it
+ * without checking the transaction group.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+
+	if (cbp->cb_doclones) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
+		if (cbp->cb_first && !cbp->cb_recurse) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "rollback to '%s': more recent snapshots "
+			    "or bookmarks exist\n"),
+			    cbp->cb_target);
+			(void) fprintf(stderr, gettext("use '-r' to "
+			    "force deletion of the following "
+			    "snapshots and bookmarks:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+
+		if (cbp->cb_recurse) {
+			if (zfs_iter_dependents(zhp, B_TRUE,
+			    rollback_check_dependent, cbp) != 0) {
+				zfs_close(zhp);
+				return (-1);
+			}
+		} else {
+			(void) fprintf(stderr, "%s\n",
+			    zfs_get_name(zhp));
+		}
+	}
+	zfs_close(zhp);
+	return (0);
+}
+
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	int ret = 0;
+	int c;
+	boolean_t force = B_FALSE;
+	rollback_cbdata_t cb = { 0 };
+	zfs_handle_t *zhp, *snap;
+	char parentname[ZFS_MAXNAMELEN];
+	char *delim;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rRf")) != -1) {
+		switch (c) {
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* open the snapshot */
+	if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+
+	/* open the parent dataset */
+	(void) strlcpy(parentname, argv[0], sizeof (parentname));
+	verify((delim = strrchr(parentname, '@')) != NULL);
+	*delim = '\0';
+	if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/*
+	 * Check for more recent snapshots and/or clones based on the presence
+	 * of '-r' and '-R'.
+	 */
+	cb.cb_target = argv[0];
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+	cb.cb_first = B_TRUE;
+	cb.cb_error = 0;
+	if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0)
+		goto out;
+	if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
+		goto out;
+
+	if ((ret = cb.cb_error) != 0)
+		goto out;
+
+	/*
+	 * Rollback parent to the given snapshot.
+	 */
+	ret = zfs_rollback(zhp, snap, force);
+
+out:
+	zfs_close(snap);
+	zfs_close(zhp);
+
+	if (ret == 0)
+		return (0);
+	else
+		return (1);
+}
+
+/*
+ * zfs set property=value ... { fs | snap | vol } ...
+ *
+ * Sets the given properties for all datasets specified on the command line.
+ */
+
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	nvlist_t *props = data;
+
+	if (zfs_prop_set_list(zhp, props) != 0) {
+		switch (libzfs_errno(g_zfs)) {
+		case EZFS_MOUNTFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to remount filesystem\n"));
+			break;
+		case EZFS_SHARENFSFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to reshare filesystem\n"));
+			break;
+		}
+		return (1);
+	}
+	return (0);
+}
+
+static int
+zfs_do_set(int argc, char **argv)
+{
+	nvlist_t *props = NULL;
+	int ds_start = -1; /* argv idx of first dataset arg */
+	int ret = 0;
+	int i;
+
+	/* check for options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing arguments\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		if (strchr(argv[1], '=') == NULL) {
+			(void) fprintf(stderr, gettext("missing property=value "
+			    "argument(s)\n"));
+		} else {
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "name(s)\n"));
+		}
+		usage(B_FALSE);
+	}
+
+	/* validate argument order:  prop=val args followed by dataset args */
+	for (i = 1; i < argc; i++) {
+		if (strchr(argv[i], '=') != NULL) {
+			if (ds_start > 0) {
+				/* out-of-order prop=val argument */
+				(void) fprintf(stderr, gettext("invalid "
+				    "argument order\n"));
+				usage(B_FALSE);
+			}
+		} else if (ds_start < 0) {
+			ds_start = i;
+		}
+	}
+	if (ds_start < 0) {
+		(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
+		usage(B_FALSE);
+	}
+
+	/* Populate a list of property settings */
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	for (i = 1; i < ds_start; i++) {
+		if ((ret = parseprop(props, argv[i])) != 0)
+			goto error;
+	}
+
+	ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
+
+error:
+	nvlist_free(props);
+	return (ret);
+}
+
+typedef struct snap_cbdata {
+	nvlist_t *sd_nvl;
+	boolean_t sd_recursive;
+	const char *sd_snapname;
+} snap_cbdata_t;
+
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+	snap_cbdata_t *sd = arg;
+	char *name;
+	int rv = 0;
+	int error;
+
+	if (sd->sd_recursive &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
+	if (error == -1)
+		nomem();
+	fnvlist_add_boolean(sd->sd_nvl, name);
+	free(name);
+
+	if (sd->sd_recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
+	zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
+ *
+ * Creates a snapshot with the given name.  While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+	int ret = 0;
+	signed char c;
+	nvlist_t *props;
+	snap_cbdata_t sd = { 0 };
+	boolean_t multiple_snaps = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, "ro:")) != -1) {
+		switch (c) {
+		case 'o':
+			if (parseprop(props, optarg))
+				return (1);
+			break;
+		case 'r':
+			sd.sd_recursive = B_TRUE;
+			multiple_snaps = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		goto usage;
+	}
+
+	if (argc > 1)
+		multiple_snaps = B_TRUE;
+	for (; argc > 0; argc--, argv++) {
+		char *atp;
+		zfs_handle_t *zhp;
+
+		atp = strchr(argv[0], '@');
+		if (atp == NULL)
+			goto usage;
+		*atp = '\0';
+		sd.sd_snapname = atp + 1;
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL)
+			goto usage;
+		if (zfs_snapshot_cb(zhp, &sd) != 0)
+			goto usage;
+	}
+
+	ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	if (ret != 0 && multiple_snaps)
+		(void) fprintf(stderr, gettext("no snapshots were created\n"));
+	return (ret != 0);
+
+usage:
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (-1);
+}
+
+/*
+ * Send a backup stream to stdout.
+ */
+static int
+zfs_do_send(int argc, char **argv)
+{
+	char *fromname = NULL;
+	char *toname = NULL;
+	char *cp;
+	zfs_handle_t *zhp;
+	sendflags_t flags = { 0 };
+	int c, err;
+	nvlist_t *dbgnv = NULL;
+	boolean_t extraverbose = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
+		switch (c) {
+		case 'i':
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			break;
+		case 'I':
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			flags.doall = B_TRUE;
+			break;
+		case 'R':
+			flags.replicate = B_TRUE;
+			break;
+		case 'p':
+			flags.props = B_TRUE;
+			break;
+		case 'P':
+			flags.parsable = B_TRUE;
+			flags.verbose = B_TRUE;
+			break;
+		case 'v':
+			if (flags.verbose)
+				extraverbose = B_TRUE;
+			flags.verbose = B_TRUE;
+			flags.progress = B_TRUE;
+			break;
+		case 'D':
+			flags.dedup = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
+		case 'e':
+			flags.embed_data = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (!flags.dryrun && isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Stream can not be written to a terminal.\n"
+		    "You must redirect standard output.\n"));
+		return (1);
+	}
+
+	/*
+	 * Special case sending a filesystem, or from a bookmark.
+	 */
+	if (strchr(argv[0], '@') == NULL ||
+	    (fromname && strchr(fromname, '#') != NULL)) {
+		char frombuf[ZFS_MAXNAMELEN];
+		enum lzc_send_flags lzc_flags = 0;
+
+		if (flags.replicate || flags.doall || flags.props ||
+		    flags.dedup || flags.dryrun || flags.verbose ||
+		    flags.progress) {
+			(void) fprintf(stderr,
+			    gettext("Error: "
+			    "Unsupported flag with filesystem or bookmark.\n"));
+			return (1);
+		}
+
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+
+		if (flags.largeblock)
+			lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+		if (flags.embed_data)
+			lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
+		if (fromname != NULL &&
+		    (fromname[0] == '#' || fromname[0] == '@')) {
+			/*
+			 * Incremental source name begins with # or @.
+			 * Default to same fs as target.
+			 */
+			(void) strncpy(frombuf, argv[0], sizeof (frombuf));
+			cp = strchr(frombuf, '@');
+			if (cp != NULL)
+				*cp = '\0';
+			(void) strlcat(frombuf, fromname, sizeof (frombuf));
+			fromname = frombuf;
+		}
+		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
+		zfs_close(zhp);
+		return (err != 0);
+	}
+
+	cp = strchr(argv[0], '@');
+	*cp = '\0';
+	toname = cp + 1;
+	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * If they specified the full path to the snapshot, chop off
+	 * everything except the short name of the snapshot, but special
+	 * case if they specify the origin.
+	 */
+	if (fromname && (cp = strchr(fromname, '@')) != NULL) {
+		char origin[ZFS_MAXNAMELEN];
+		zprop_source_t src;
+
+		(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+		    origin, sizeof (origin), &src, NULL, 0, B_FALSE);
+
+		if (strcmp(origin, fromname) == 0) {
+			fromname = NULL;
+			flags.fromorigin = B_TRUE;
+		} else {
+			*cp = '\0';
+			if (cp != fromname && strcmp(argv[0], fromname)) {
+				(void) fprintf(stderr,
+				    gettext("incremental source must be "
+				    "in same filesystem\n"));
+				usage(B_FALSE);
+			}
+			fromname = cp + 1;
+			if (strchr(fromname, '@') || strchr(fromname, '/')) {
+				(void) fprintf(stderr,
+				    gettext("invalid incremental source\n"));
+				usage(B_FALSE);
+			}
+		}
+	}
+
+	if (flags.replicate && fromname == NULL)
+		flags.doall = B_TRUE;
+
+	err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
+	    extraverbose ? &dbgnv : NULL);
+
+	if (extraverbose && dbgnv != NULL) {
+		/*
+		 * dump_nvlist prints to stdout, but that's been
+		 * redirected to a file.  Make it print to stderr
+		 * instead.
+		 */
+		(void) dup2(STDERR_FILENO, STDOUT_FILENO);
+		dump_nvlist(dbgnv, 0);
+		nvlist_free(dbgnv);
+	}
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+/*
+ * zfs receive [-vnFu] [-d | -e] <fs@snap>
+ *
+ * Restore a backup stream from stdin.
+ */
+static int
+zfs_do_receive(int argc, char **argv)
+{
+	int c, err;
+	recvflags_t flags = { 0 };
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":denuvF")) != -1) {
+		switch (c) {
+		case 'd':
+			flags.isprefix = B_TRUE;
+			break;
+		case 'e':
+			flags.isprefix = B_TRUE;
+			flags.istail = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'u':
+			flags.nomount = B_TRUE;
+			break;
+		case 'v':
+			flags.verbose = B_TRUE;
+			break;
+		case 'F':
+			flags.force = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be read "
+		    "from a terminal.\n"
+		    "You must redirect standard input.\n"));
+		return (1);
+	}
+
+	err = zfs_receive(g_zfs, argv[0], &flags, STDIN_FILENO, NULL);
+
+	return (err != 0);
+}
+
+/*
+ * allow/unallow stuff
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define	ZFS_DELEG_PERM_CREATE		"create"
+#define	ZFS_DELEG_PERM_DESTROY		"destroy"
+#define	ZFS_DELEG_PERM_SNAPSHOT		"snapshot"
+#define	ZFS_DELEG_PERM_ROLLBACK		"rollback"
+#define	ZFS_DELEG_PERM_CLONE		"clone"
+#define	ZFS_DELEG_PERM_PROMOTE		"promote"
+#define	ZFS_DELEG_PERM_RENAME		"rename"
+#define	ZFS_DELEG_PERM_MOUNT		"mount"
+#define	ZFS_DELEG_PERM_SHARE		"share"
+#define	ZFS_DELEG_PERM_SEND		"send"
+#define	ZFS_DELEG_PERM_RECEIVE		"receive"
+#define	ZFS_DELEG_PERM_ALLOW		"allow"
+#define	ZFS_DELEG_PERM_USERPROP		"userprop"
+#define	ZFS_DELEG_PERM_VSCAN		"vscan" /* ??? */
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED		"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
+#define	ZFS_DELEG_PERM_HOLD		"hold"
+#define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
+
+#define	ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+
+	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+	{ NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/* permission structure */
+typedef struct deleg_perm {
+	zfs_deleg_who_type_t	dp_who_type;
+	const char		*dp_name;
+	boolean_t		dp_local;
+	boolean_t		dp_descend;
+} deleg_perm_t;
+
+/* */
+typedef struct deleg_perm_node {
+	deleg_perm_t		dpn_perm;
+
+	uu_avl_node_t		dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/* permissions set */
+typedef struct who_perm {
+	zfs_deleg_who_type_t	who_type;
+	const char		*who_name;		/* id */
+	char			who_ug_name[256];	/* user/group name */
+	fs_perm_t		*who_fsperm;		/* uplink */
+
+	uu_avl_t		*who_deleg_perm_avl;	/* permissions */
+} who_perm_t;
+
+/* */
+typedef struct who_perm_node {
+	who_perm_t	who_perm;
+	uu_avl_node_t	who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/* fs permissions */
+struct fs_perm {
+	const char		*fsp_name;
+
+	uu_avl_t		*fsp_sc_avl;	/* sets,create */
+	uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */
+
+	fs_perm_set_t		*fsp_set;	/* uplink */
+};
+
+/* */
+typedef struct fs_perm_node {
+	fs_perm_t	fspn_fsperm;
+	uu_avl_t	*fspn_avl;
+
+	uu_list_node_t	fspn_list_node;
+} fs_perm_node_t;
+
+/* top level structure */
+struct fs_perm_set {
+	uu_list_pool_t	*fsps_list_pool;
+	uu_list_t	*fsps_list; /* list of fs_perms */
+
+	uu_avl_pool_t	*fsps_named_set_avl_pool;
+	uu_avl_pool_t	*fsps_who_perm_avl_pool;
+	uu_avl_pool_t	*fsps_deleg_perm_avl_pool;
+};
+
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+	case ZFS_DELEG_NOTE_GROUPUSED:
+	case ZFS_DELEG_NOTE_USERPROP:
+	case ZFS_DELEG_NOTE_USERQUOTA:
+	case ZFS_DELEG_NOTE_USERUSED:
+		/* other */
+		return (gettext("other"));
+	default:
+		return (gettext("subcommand"));
+	}
+}
+
+static int inline
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+	int res;
+	switch (who_type) {
+		case ZFS_DELEG_NAMED_SET_SETS:
+		case ZFS_DELEG_NAMED_SET:
+			res = 0;
+			break;
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_CREATE:
+			res = 1;
+			break;
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_USER:
+			res = 2;
+			break;
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_GROUP:
+			res = 3;
+			break;
+		case ZFS_DELEG_EVERYONE_SETS:
+		case ZFS_DELEG_EVERYONE:
+			res = 4;
+			break;
+		default:
+			res = -1;
+	}
+
+	return (res);
+}
+
+/* ARGSUSED */
+static int
+who_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const who_perm_node_t *l = larg;
+	const who_perm_node_t *r = rarg;
+	zfs_deleg_who_type_t ltype = l->who_perm.who_type;
+	zfs_deleg_who_type_t rtype = r->who_perm.who_type;
+	int lweight = who_type2weight(ltype);
+	int rweight = who_type2weight(rtype);
+	int res = lweight - rweight;
+	if (res == 0)
+		res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
+		    ZFS_MAX_DELEG_NAME-1);
+
+	if (res == 0)
+		return (0);
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
+}
+
+/* ARGSUSED */
+static int
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const deleg_perm_node_t *l = larg;
+	const deleg_perm_node_t *r = rarg;
+	int res =  strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
+	    ZFS_MAX_DELEG_NAME-1);
+
+	if (res == 0)
+		return (0);
+
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
+}
+
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
+{
+	bzero(fspset, sizeof (fs_perm_set_t));
+
+	if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+	    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+	    NULL, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+	    "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+	    "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+	    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
+	    deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
+	    == NULL)
+		nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+	while (node != NULL) {
+		fs_perm_node_t *next_node =
+		    uu_list_next(fspset->fsps_list, node);
+		fs_perm_t *fsperm = &node->fspn_fsperm;
+		fs_perm_fini(fsperm);
+		uu_list_remove(fspset->fsps_list, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+    const char *name)
+{
+	deleg_perm->dp_who_type = type;
+	deleg_perm->dp_name = name;
+}
+
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+    zfs_deleg_who_type_t type, const char *name)
+{
+	uu_avl_pool_t	*pool;
+	pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+	bzero(who_perm, sizeof (who_perm_t));
+
+	if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	who_perm->who_type = type;
+	who_perm->who_name = name;
+	who_perm->who_fsperm = fsperm;
+}
+
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+	deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
+
+	while (node != NULL) {
+		deleg_perm_node_t *next_node =
+		    uu_avl_next(who_perm->who_deleg_perm_avl, node);
+
+		uu_avl_remove(who_perm->who_deleg_perm_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(who_perm->who_deleg_perm_avl);
+}
+
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+	uu_avl_pool_t	*nset_pool = fspset->fsps_named_set_avl_pool;
+	uu_avl_pool_t	*who_pool = fspset->fsps_who_perm_avl_pool;
+
+	bzero(fsperm, sizeof (fs_perm_t));
+
+	if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	fsperm->fsp_set = fspset;
+	fsperm->fsp_name = fsname;
+}
+
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+	who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_sc_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	node = uu_avl_first(fsperm->fsp_uge_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_uge_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(fsperm->fsp_sc_avl);
+	uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+static void inline
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+    zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+	uu_avl_index_t idx = 0;
+
+	deleg_perm_node_t *found_node = NULL;
+	deleg_perm_t	*deleg_perm = &node->dpn_perm;
+
+	deleg_perm_init(deleg_perm, who_type, name);
+
+	if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+	    == NULL)
+		uu_avl_insert(avl, node, idx);
+	else {
+		node = found_node;
+		deleg_perm = &node->dpn_perm;
+	}
+
+
+	switch (locality) {
+	case ZFS_DELEG_LOCAL:
+		deleg_perm->dp_local = B_TRUE;
+		break;
+	case ZFS_DELEG_DESCENDENT:
+		deleg_perm->dp_descend = B_TRUE;
+		break;
+	case ZFS_DELEG_NA:
+		break;
+	default:
+		assert(B_FALSE); /* invalid locality */
+	}
+}
+
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+	uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+	zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *name = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+		deleg_perm_node_t *node =
+		    safe_malloc(sizeof (deleg_perm_node_t));
+
+		VERIFY(type == DATA_TYPE_BOOLEAN);
+
+		uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+		set_deleg_perm_node(avl, node, who_type, name, locality);
+	}
+
+	return (0);
+}
+
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = fsperm->fsp_set;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *name = nvpair_name(nvp);
+		uu_avl_t *avl = NULL;
+		uu_avl_pool_t *avl_pool = NULL;
+		zfs_deleg_who_type_t perm_type = name[0];
+		char perm_locality = name[1];
+		const char *perm_name = name + 3;
+		boolean_t is_set = B_TRUE;
+		who_perm_t *who_perm = NULL;
+
+		assert('$' == name[2]);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+			return (-1);
+
+		switch (perm_type) {
+		case ZFS_DELEG_CREATE:
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_NAMED_SET:
+		case ZFS_DELEG_NAMED_SET_SETS:
+			avl_pool = fspset->fsps_named_set_avl_pool;
+			avl = fsperm->fsp_sc_avl;
+			break;
+		case ZFS_DELEG_USER:
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_GROUP:
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_EVERYONE:
+		case ZFS_DELEG_EVERYONE_SETS:
+			avl_pool = fspset->fsps_who_perm_avl_pool;
+			avl = fsperm->fsp_uge_avl;
+			break;
+		default:
+			break;
+		}
+
+		if (is_set) {
+			who_perm_node_t *found_node = NULL;
+			who_perm_node_t *node = safe_malloc(
+			    sizeof (who_perm_node_t));
+			who_perm = &node->who_perm;
+			uu_avl_index_t idx = 0;
+
+			uu_avl_node_init(node, &node->who_avl_node, avl_pool);
+			who_perm_init(who_perm, fsperm, perm_type, perm_name);
+
+			if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+			    == NULL) {
+				if (avl == fsperm->fsp_uge_avl) {
+					uid_t rid = 0;
+					struct passwd *p = NULL;
+					struct group *g = NULL;
+					const char *nice_name = NULL;
+
+					switch (perm_type) {
+					case ZFS_DELEG_USER_SETS:
+					case ZFS_DELEG_USER:
+						rid = atoi(perm_name);
+						p = getpwuid(rid);
+						if (p)
+							nice_name = p->pw_name;
+						break;
+					case ZFS_DELEG_GROUP_SETS:
+					case ZFS_DELEG_GROUP:
+						rid = atoi(perm_name);
+						g = getgrgid(rid);
+						if (g)
+							nice_name = g->gr_name;
+						break;
+					default:
+						break;
+					}
+
+					if (nice_name != NULL)
+						(void) strlcpy(
+						    node->who_perm.who_ug_name,
+						    nice_name, 256);
+				}
+
+				uu_avl_insert(avl, node, idx);
+			} else {
+				node = found_node;
+				who_perm = &node->who_perm;
+			}
+		}
+
+		(void) parse_who_perm(who_perm, nvl2, perm_locality);
+	}
+
+	return (0);
+}
+
+static inline int
+parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	uu_avl_index_t idx = 0;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *fsname = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		fs_perm_t *fsperm = NULL;
+		fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
+		if (node == NULL)
+			nomem();
+
+		fsperm = &node->fspn_fsperm;
+
+		VERIFY(DATA_TYPE_NVLIST == type);
+
+		uu_list_node_init(node, &node->fspn_list_node,
+		    fspset->fsps_list_pool);
+
+		idx = uu_list_numnodes(fspset->fsps_list);
+		fs_perm_init(fsperm, fspset, fsname);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+			return (-1);
+
+		(void) parse_fs_perm(fsperm, nvl2);
+
+		uu_list_insert(fspset->fsps_list, node, idx);
+	}
+
+	return (0);
+}
+
+static inline const char *
+deleg_perm_comment(zfs_deleg_note_t note)
+{
+	const char *str = "";
+
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+	case ZFS_DELEG_NOTE_ALLOW:
+		str = gettext("Must also have the permission that is being"
+		    "\n\t\t\t\tallowed");
+		break;
+	case ZFS_DELEG_NOTE_CLONE:
+		str = gettext("Must also have the 'create' ability and 'mount'"
+		    "\n\t\t\t\tability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_CREATE:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DESTROY:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DIFF:
+		str = gettext("Allows lookup of paths within a dataset;"
+		    "\n\t\t\t\tgiven an object number. Ordinary users need this"
+		    "\n\t\t\t\tin order to use zfs diff");
+		break;
+	case ZFS_DELEG_NOTE_HOLD:
+		str = gettext("Allows adding a user hold to a snapshot");
+		break;
+	case ZFS_DELEG_NOTE_MOUNT:
+		str = gettext("Allows mount/umount of ZFS datasets");
+		break;
+	case ZFS_DELEG_NOTE_PROMOTE:
+		str = gettext("Must also have the 'mount'\n\t\t\t\tand"
+		    " 'promote' ability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_RECEIVE:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    " ability");
+		break;
+	case ZFS_DELEG_NOTE_RELEASE:
+		str = gettext("Allows releasing a user hold which\n\t\t\t\t"
+		    "might destroy the snapshot");
+		break;
+	case ZFS_DELEG_NOTE_RENAME:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    "\n\t\t\t\tability in the new parent");
+		break;
+	case ZFS_DELEG_NOTE_ROLLBACK:
+		str = gettext("");
+		break;
+	case ZFS_DELEG_NOTE_SEND:
+		str = gettext("");
+		break;
+	case ZFS_DELEG_NOTE_SHARE:
+		str = gettext("Allows sharing file systems over NFS or SMB"
+		    "\n\t\t\t\tprotocols");
+		break;
+	case ZFS_DELEG_NOTE_SNAPSHOT:
+		str = gettext("");
+		break;
+/*
+ *	case ZFS_DELEG_NOTE_VSCAN:
+ *		str = gettext("");
+ *		break;
+ */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+		str = gettext("Allows accessing any groupquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_GROUPUSED:
+		str = gettext("Allows reading any groupused@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERPROP:
+		str = gettext("Allows changing any user property");
+		break;
+	case ZFS_DELEG_NOTE_USERQUOTA:
+		str = gettext("Allows accessing any userquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERUSED:
+		str = gettext("Allows reading any userused@... property");
+		break;
+		/* other */
+	default:
+		str = "";
+	}
+
+	return (str);
+}
+
+struct allow_opts {
+	boolean_t local;
+	boolean_t descend;
+	boolean_t user;
+	boolean_t group;
+	boolean_t everyone;
+	boolean_t create;
+	boolean_t set;
+	boolean_t recursive; /* unallow only */
+	boolean_t prt_usage;
+
+	boolean_t prt_perms;
+	char *who;
+	char *perms;
+	const char *dataset;
+};
+
+static inline int
+prop_cmp(const void *a, const void *b)
+{
+	const char *str1 = *(const char **)a;
+	const char *str2 = *(const char **)b;
+	return (strcmp(str1, str2));
+}
+
+static void
+allow_usage(boolean_t un, boolean_t requested, const char *msg)
+{
+	const char *opt_desc[] = {
+		"-h", gettext("show this help message and exit"),
+		"-l", gettext("set permission locally"),
+		"-d", gettext("set permission for descents"),
+		"-u", gettext("set permission for user"),
+		"-g", gettext("set permission for group"),
+		"-e", gettext("set permission for everyone"),
+		"-c", gettext("set create time permission"),
+		"-s", gettext("define permission set"),
+		/* unallow only */
+		"-r", gettext("remove permissions recursively"),
+	};
+	size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
+	size_t allow_size = unallow_size - 2;
+	const char *props[ZFS_NUM_PROPS];
+	int i;
+	size_t count = 0;
+	FILE *fp = requested ? stdout : stderr;
+	zprop_desc_t *pdtbl = zfs_prop_get_table();
+	const char *fmt = gettext("%-16s %-14s\t%s\n");
+
+	(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
+	    HELP_ALLOW));
+	(void) fprintf(fp, gettext("Options:\n"));
+	for (i = 0; i < (un ? unallow_size : allow_size); i++) {
+		const char *opt = opt_desc[i++];
+		const char *optdsc = opt_desc[i];
+		(void) fprintf(fp, gettext("  %-10s  %s\n"), opt, optdsc);
+	}
+
+	(void) fprintf(fp, gettext("\nThe following permissions are "
+	    "supported:\n\n"));
+	(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
+	    gettext("NOTES"));
+	for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
+		const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
+		zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
+		const char *perm_type = deleg_perm_type(perm_note);
+		const char *perm_comment = deleg_perm_comment(perm_note);
+		(void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
+	}
+
+	for (i = 0; i < ZFS_NUM_PROPS; i++) {
+		zprop_desc_t *pd = &pdtbl[i];
+		if (pd->pd_visible != B_TRUE)
+			continue;
+
+		if (pd->pd_attr == PROP_READONLY)
+			continue;
+
+		props[count++] = pd->pd_name;
+	}
+	props[count] = NULL;
+
+	qsort(props, count, sizeof (char *), prop_cmp);
+
+	for (i = 0; i < count; i++)
+		(void) fprintf(fp, fmt, props[i], gettext("property"), "");
+
+	if (msg != NULL)
+		(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
+
+	exit(requested ? 0 : 2);
+}
+
+static inline const char *
+munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
+    char **permsp)
+{
+	if (un && argc == expected_argc - 1)
+		*permsp = NULL;
+	else if (argc == expected_argc)
+		*permsp = argv[argc - 2];
+	else
+		allow_usage(un, B_FALSE,
+		    gettext("wrong number of parameters\n"));
+
+	return (argv[argc - 1]);
+}
+
+static void
+parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
+{
+	int uge_sum = opts->user + opts->group + opts->everyone;
+	int csuge_sum = opts->create + opts->set + uge_sum;
+	int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
+	int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
+
+	if (uge_sum > 1)
+		allow_usage(un, B_FALSE,
+		    gettext("-u, -g, and -e are mutually exclusive\n"));
+
+	if (opts->prt_usage) {
+		if (argc == 0 && all_sum == 0)
+			allow_usage(un, B_TRUE, NULL);
+		else
+			usage(B_FALSE);
+	}
+
+	if (opts->set) {
+		if (csuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -s\n"));
+
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		if (argv[0][0] != '@')
+			allow_usage(un, B_FALSE,
+			    gettext("invalid set name: missing '@' prefix\n"));
+		opts->who = argv[0];
+	} else if (opts->create) {
+		if (ldcsuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -c\n"));
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (opts->everyone) {
+		if (csuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -e\n"));
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
+	    == 0) {
+		opts->everyone = B_TRUE;
+		argc--;
+		argv++;
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (argc == 1 && !un) {
+		opts->prt_perms = B_TRUE;
+		opts->dataset = argv[argc-1];
+	} else {
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		opts->who = argv[0];
+	}
+
+	if (!opts->local && !opts->descend) {
+		opts->local = B_TRUE;
+		opts->descend = B_TRUE;
+	}
+}
+
+static void
+store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
+    const char *who, char *perms, nvlist_t *top_nvl)
+{
+	int i;
+	char ld[2] = { '\0', '\0' };
+	char who_buf[ZFS_MAXNAMELEN+32];
+	char base_type = ZFS_DELEG_WHO_UNKNOWN;
+	char set_type = ZFS_DELEG_WHO_UNKNOWN;
+	nvlist_t *base_nvl = NULL;
+	nvlist_t *set_nvl = NULL;
+	nvlist_t *nvl;
+
+	if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) !=  0)
+		nomem();
+
+	switch (type) {
+	case ZFS_DELEG_NAMED_SET_SETS:
+	case ZFS_DELEG_NAMED_SET:
+		set_type = ZFS_DELEG_NAMED_SET_SETS;
+		base_type = ZFS_DELEG_NAMED_SET;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_CREATE_SETS:
+	case ZFS_DELEG_CREATE:
+		set_type = ZFS_DELEG_CREATE_SETS;
+		base_type = ZFS_DELEG_CREATE;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_USER_SETS:
+	case ZFS_DELEG_USER:
+		set_type = ZFS_DELEG_USER_SETS;
+		base_type = ZFS_DELEG_USER;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_GROUP_SETS:
+	case ZFS_DELEG_GROUP:
+		set_type = ZFS_DELEG_GROUP_SETS;
+		base_type = ZFS_DELEG_GROUP;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_EVERYONE_SETS:
+	case ZFS_DELEG_EVERYONE:
+		set_type = ZFS_DELEG_EVERYONE_SETS;
+		base_type = ZFS_DELEG_EVERYONE;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+	default:
+		break;
+	}
+
+	if (perms != NULL) {
+		char *curr = perms;
+		char *end = curr + strlen(perms);
+
+		while (curr < end) {
+			char *delim = strchr(curr, ',');
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			if (curr[0] == '@')
+				nvl = set_nvl;
+			else
+				nvl = base_nvl;
+
+			(void) nvlist_add_boolean(nvl, curr);
+			if (delim != end)
+				*delim = ',';
+			curr = delim + 1;
+		}
+
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			if (!nvlist_empty(base_nvl)) {
+				if (who != NULL)
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$%s",
+					    base_type, locality, who);
+				else
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$",
+					    base_type, locality);
+
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    base_nvl);
+			}
+
+
+			if (!nvlist_empty(set_nvl)) {
+				if (who != NULL)
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$%s",
+					    set_type, locality, who);
+				else
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$",
+					    set_type, locality);
+
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    set_nvl);
+			}
+		}
+	} else {
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			if (who != NULL)
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$%s", base_type, locality, who);
+			else
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$", base_type, locality);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+
+			if (who != NULL)
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$%s", set_type, locality, who);
+			else
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$", set_type, locality);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+		}
+	}
+}
+
+static int
+construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
+{
+	if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	if (opts->set) {
+		store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
+		    opts->descend, opts->who, opts->perms, *nvlp);
+	} else if (opts->create) {
+		store_allow_perm(ZFS_DELEG_CREATE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else if (opts->everyone) {
+		store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else {
+		char *curr = opts->who;
+		char *end = curr + strlen(curr);
+
+		while (curr < end) {
+			const char *who;
+			zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
+			char *endch;
+			char *delim = strchr(curr, ',');
+			char errbuf[256];
+			char id[64];
+			struct passwd *p = NULL;
+			struct group *g = NULL;
+
+			uid_t rid;
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			rid = (uid_t)strtol(curr, &endch, 0);
+			if (opts->user) {
+				who_type = ZFS_DELEG_USER;
+				if (*endch != '\0')
+					p = getpwnam(curr);
+				else
+					p = getpwuid(rid);
+
+				if (p != NULL)
+					rid = p->pw_uid;
+				else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid user %s"), curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			} else if (opts->group) {
+				who_type = ZFS_DELEG_GROUP;
+				if (*endch != '\0')
+					g = getgrnam(curr);
+				else
+					g = getgrgid(rid);
+
+				if (g != NULL)
+					rid = g->gr_gid;
+				else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid group %s"),  curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			} else {
+				if (*endch != '\0') {
+					p = getpwnam(curr);
+				} else {
+					p = getpwuid(rid);
+				}
+
+				if (p == NULL) {
+					if (*endch != '\0') {
+						g = getgrnam(curr);
+					} else {
+						g = getgrgid(rid);
+					}
+				}
+
+				if (p != NULL) {
+					who_type = ZFS_DELEG_USER;
+					rid = p->pw_uid;
+				} else if (g != NULL) {
+					who_type = ZFS_DELEG_GROUP;
+					rid = g->gr_gid;
+				} else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid user/group %s"), curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			}
+
+			(void) sprintf(id, "%u", rid);
+			who = id;
+
+			store_allow_perm(who_type, opts->local,
+			    opts->descend, who, opts->perms, *nvlp);
+			curr = delim + 1;
+		}
+	}
+
+	return (0);
+}
+
+static void
+print_set_creat_perms(uu_avl_t *who_avl)
+{
+	const char *sc_title[] = {
+		gettext("Permission sets:\n"),
+		gettext("Create time permissions:\n"),
+		NULL
+	};
+	const char **title_ptr = sc_title;
+	who_perm_node_t *who_node = NULL;
+	int prev_weight = -1;
+
+	for (who_node = uu_avl_first(who_avl); who_node != NULL;
+	    who_node = uu_avl_next(who_avl, who_node)) {
+		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+		const char *who_name = who_node->who_perm.who_name;
+		int weight = who_type2weight(who_type);
+		boolean_t first = B_TRUE;
+		deleg_perm_node_t *deleg_node;
+
+		if (prev_weight != weight) {
+			(void) printf("%s", *title_ptr++);
+			prev_weight = weight;
+		}
+
+		if (who_name == NULL || strnlen(who_name, 1) == 0)
+			(void) printf("\t");
+		else
+			(void) printf("\t%s ", who_name);
+
+		for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
+		    deleg_node = uu_avl_next(avl, deleg_node)) {
+			if (first) {
+				(void) printf("%s",
+				    deleg_node->dpn_perm.dp_name);
+				first = B_FALSE;
+			} else
+				(void) printf(",%s",
+				    deleg_node->dpn_perm.dp_name);
+		}
+
+		(void) printf("\n");
+	}
+}
+
+static void inline
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
+    const char *title)
+{
+	who_perm_node_t *who_node = NULL;
+	boolean_t prt_title = B_TRUE;
+	uu_avl_walk_t *walk;
+
+	if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((who_node = uu_avl_walk_next(walk)) != NULL) {
+		const char *who_name = who_node->who_perm.who_name;
+		const char *nice_who_name = who_node->who_perm.who_ug_name;
+		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+		char delim = ' ';
+		deleg_perm_node_t *deleg_node;
+		boolean_t prt_who = B_TRUE;
+
+		for (deleg_node = uu_avl_first(avl);
+		    deleg_node != NULL;
+		    deleg_node = uu_avl_next(avl, deleg_node)) {
+			if (local != deleg_node->dpn_perm.dp_local ||
+			    descend != deleg_node->dpn_perm.dp_descend)
+				continue;
+
+			if (prt_who) {
+				const char *who = NULL;
+				if (prt_title) {
+					prt_title = B_FALSE;
+					(void) printf("%s", title);
+				}
+
+				switch (who_type) {
+				case ZFS_DELEG_USER_SETS:
+				case ZFS_DELEG_USER:
+					who = gettext("user");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_GROUP_SETS:
+				case ZFS_DELEG_GROUP:
+					who = gettext("group");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_EVERYONE_SETS:
+				case ZFS_DELEG_EVERYONE:
+					who = gettext("everyone");
+					who_name = NULL;
+				default:
+					break;
+				}
+
+				prt_who = B_FALSE;
+				if (who_name == NULL)
+					(void) printf("\t%s", who);
+				else
+					(void) printf("\t%s %s", who, who_name);
+			}
+
+			(void) printf("%c%s", delim,
+			    deleg_node->dpn_perm.dp_name);
+			delim = ',';
+		}
+
+		if (!prt_who)
+			(void) printf("\n");
+	}
+
+	uu_avl_walk_end(walk);
+}
+
+static void
+print_fs_perms(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = NULL;
+	char buf[ZFS_MAXNAMELEN+32];
+	const char *dsname = buf;
+
+	for (node = uu_list_first(fspset->fsps_list); node != NULL;
+	    node = uu_list_next(fspset->fsps_list, node)) {
+		uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
+		uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
+		int left = 0;
+
+		(void) snprintf(buf, ZFS_MAXNAMELEN+32,
+		    gettext("---- Permissions on %s "),
+		    node->fspn_fsperm.fsp_name);
+		(void) printf("%s", dsname);
+		left = 70 - strlen(buf);
+		while (left-- > 0)
+			(void) printf("-");
+		(void) printf("\n");
+
+		print_set_creat_perms(sc_avl);
+		print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
+		    gettext("Local permissions:\n"));
+		print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
+		    gettext("Descendent permissions:\n"));
+		print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
+		    gettext("Local+Descendent permissions:\n"));
+	}
+}
+
+static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
+
+struct deleg_perms {
+	boolean_t un;
+	nvlist_t *nvl;
+};
+
+static int
+set_deleg_perms(zfs_handle_t *zhp, void *data)
+{
+	struct deleg_perms *perms = (struct deleg_perms *)data;
+	zfs_type_t zfs_type = zfs_get_type(zhp);
+
+	if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
+		return (0);
+
+	return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
+}
+
+static int
+zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
+{
+	zfs_handle_t *zhp;
+	nvlist_t *perm_nvl = NULL;
+	nvlist_t *update_perm_nvl = NULL;
+	int error = 1;
+	int c;
+	struct allow_opts opts = { 0 };
+
+	const char *optstr = un ? "ldugecsrh" : "ldugecsh";
+
+	/* check opts */
+	while ((c = getopt(argc, argv, optstr)) != -1) {
+		switch (c) {
+		case 'l':
+			opts.local = B_TRUE;
+			break;
+		case 'd':
+			opts.descend = B_TRUE;
+			break;
+		case 'u':
+			opts.user = B_TRUE;
+			break;
+		case 'g':
+			opts.group = B_TRUE;
+			break;
+		case 'e':
+			opts.everyone = B_TRUE;
+			break;
+		case 's':
+			opts.set = B_TRUE;
+			break;
+		case 'c':
+			opts.create = B_TRUE;
+			break;
+		case 'r':
+			opts.recursive = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case 'h':
+			opts.prt_usage = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	parse_allow_args(argc, argv, un, &opts);
+
+	/* try to open the dataset */
+	if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) == NULL) {
+		(void) fprintf(stderr, "Failed to open dataset: %s\n",
+		    opts.dataset);
+		return (-1);
+	}
+
+	if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
+		goto cleanup2;
+
+	fs_perm_set_init(&fs_perm_set);
+	if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
+		(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
+		goto cleanup1;
+	}
+
+	if (opts.prt_perms)
+		print_fs_perms(&fs_perm_set);
+	else {
+		(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
+		if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
+			goto cleanup0;
+
+		if (un && opts.recursive) {
+			struct deleg_perms data = { un, update_perm_nvl };
+			if (zfs_iter_filesystems(zhp, set_deleg_perms,
+			    &data) != 0)
+				goto cleanup0;
+		}
+	}
+
+	error = 0;
+
+cleanup0:
+	nvlist_free(perm_nvl);
+	if (update_perm_nvl != NULL)
+		nvlist_free(update_perm_nvl);
+cleanup1:
+	fs_perm_set_fini(&fs_perm_set);
+cleanup2:
+	zfs_close(zhp);
+
+	return (error);
+}
+
+static int
+zfs_do_allow(int argc, char **argv)
+{
+	return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
+}
+
+static int
+zfs_do_unallow(int argc, char **argv)
+{
+	return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
+}
+
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+	int errors = 0;
+	int i;
+	const char *tag;
+	boolean_t recursive = B_FALSE;
+	const char *opts = holding ? "rt" : "r";
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 2)
+		usage(B_FALSE);
+
+	tag = argv[0];
+	--argc;
+	++argv;
+
+	if (holding && tag[0] == '.') {
+		/* tags starting with '.' are reserved for libzfs */
+		(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+		usage(B_FALSE);
+	}
+
+	for (i = 0; i < argc; ++i) {
+		zfs_handle_t *zhp;
+		char parent[ZFS_MAXNAMELEN];
+		const char *delim;
+		char *path = argv[i];
+
+		delim = strchr(path, '@');
+		if (delim == NULL) {
+			(void) fprintf(stderr,
+			    gettext("'%s' is not a snapshot\n"), path);
+			++errors;
+			continue;
+		}
+		(void) strncpy(parent, path, delim - path);
+		parent[delim - path] = '\0';
+
+		zhp = zfs_open(g_zfs, parent,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			++errors;
+			continue;
+		}
+		if (holding) {
+			if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
+				++errors;
+		} else {
+			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+				++errors;
+		}
+		zfs_close(zhp);
+	}
+
+	return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] [-t] <tag> <snap> ...
+ *
+ *	-r	Recursively hold
+ *
+ * Apply a user-hold with the given tag to the list of snapshots.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+	return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ *	-r	Recursively release
+ *
+ * Release a user-hold with the given tag from the list of snapshots.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+	return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
+}
+
+typedef struct holds_cbdata {
+	boolean_t	cb_recursive;
+	const char	*cb_snapname;
+	nvlist_t	**cb_nvlp;
+	size_t		cb_max_namelen;
+	size_t		cb_max_taglen;
+} holds_cbdata_t;
+
+#define	STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
+#define	DATETIME_BUF_LEN (32)
+/*
+ *
+ */
+static void
+print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl)
+{
+	int i;
+	nvpair_t *nvp = NULL;
+	char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
+	const char *col;
+
+	if (!scripted) {
+		for (i = 0; i < 3; i++) {
+			col = gettext(hdr_cols[i]);
+			if (i < 2)
+				(void) printf("%-*s  ", i ? tagwidth : nwidth,
+				    col);
+			else
+				(void) printf("%s\n", col);
+		}
+	}
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		char *zname = nvpair_name(nvp);
+		nvlist_t *nvl2;
+		nvpair_t *nvp2 = NULL;
+		(void) nvpair_value_nvlist(nvp, &nvl2);
+		while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
+			char tsbuf[DATETIME_BUF_LEN];
+			char *tagname = nvpair_name(nvp2);
+			uint64_t val = 0;
+			time_t time;
+			struct tm t;
+			char sep = scripted ? '\t' : ' ';
+			int sepnum = scripted ? 1 : 2;
+
+			(void) nvpair_value_uint64(nvp2, &val);
+			time = (time_t)val;
+			(void) localtime_r(&time, &t);
+			(void) strftime(tsbuf, DATETIME_BUF_LEN,
+			    gettext(STRFTIME_FMT_STR), &t);
+
+			(void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname,
+			    sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf);
+		}
+	}
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+holds_callback(zfs_handle_t *zhp, void *data)
+{
+	holds_cbdata_t *cbp = data;
+	nvlist_t *top_nvl = *cbp->cb_nvlp;
+	nvlist_t *nvl = NULL;
+	nvpair_t *nvp = NULL;
+	const char *zname = zfs_get_name(zhp);
+	size_t znamelen = strnlen(zname, ZFS_MAXNAMELEN);
+
+	if (cbp->cb_recursive) {
+		const char *snapname;
+		char *delim  = strchr(zname, '@');
+		if (delim == NULL)
+			return (0);
+
+		snapname = delim + 1;
+		if (strcmp(cbp->cb_snapname, snapname))
+			return (0);
+	}
+
+	if (zfs_get_holds(zhp, &nvl) != 0)
+		return (-1);
+
+	if (znamelen > cbp->cb_max_namelen)
+		cbp->cb_max_namelen  = znamelen;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *tag = nvpair_name(nvp);
+		size_t taglen = strnlen(tag, MAXNAMELEN);
+		if (taglen > cbp->cb_max_taglen)
+			cbp->cb_max_taglen  = taglen;
+	}
+
+	return (nvlist_add_nvlist(top_nvl, zname, nvl));
+}
+
+/*
+ * zfs holds [-r] <snap> ...
+ *
+ *	-r	Recursively hold
+ */
+static int
+zfs_do_holds(int argc, char **argv)
+{
+	int errors = 0;
+	int c;
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t recursive = B_FALSE;
+	const char *opts = "rH";
+	nvlist_t *nvl;
+
+	int types = ZFS_TYPE_SNAPSHOT;
+	holds_cbdata_t cb = { 0 };
+
+	int limit = 0;
+	int ret = 0;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (recursive) {
+		types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+		flags |= ZFS_ITER_RECURSE;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1)
+		usage(B_FALSE);
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	for (i = 0; i < argc; ++i) {
+		char *snapshot = argv[i];
+		const char *delim;
+		const char *snapname;
+
+		delim = strchr(snapshot, '@');
+		if (delim == NULL) {
+			(void) fprintf(stderr,
+			    gettext("'%s' is not a snapshot\n"), snapshot);
+			++errors;
+			continue;
+		}
+		snapname = delim + 1;
+		if (recursive)
+			snapshot[delim - snapshot] = '\0';
+
+		cb.cb_recursive = recursive;
+		cb.cb_snapname = snapname;
+		cb.cb_nvlp = &nvl;
+
+		/*
+		 *  1. collect holds data, set format options
+		 */
+		ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
+		    holds_callback, &cb);
+		if (ret != 0)
+			++errors;
+	}
+
+	/*
+	 *  2. print holds data
+	 */
+	print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl);
+
+	if (nvlist_empty(nvl))
+		(void) fprintf(stderr, gettext("no datasets available\n"));
+
+	nvlist_free(nvl);
+
+	return (0 != errors);
+}
+
+#define	CHECK_SPINNER 30
+#define	SPINNER_TIME 3		/* seconds */
+#define	MOUNT_TIME 5		/* seconds */
+
+static int
+get_one_dataset(zfs_handle_t *zhp, void *data)
+{
+	static char *spin[] = { "-", "\\", "|", "/" };
+	static int spinval = 0;
+	static int spincheck = 0;
+	static time_t last_spin_time = (time_t)0;
+	get_all_cb_t *cbp = data;
+	zfs_type_t type = zfs_get_type(zhp);
+
+	if (cbp->cb_verbose) {
+		if (--spincheck < 0) {
+			time_t now = time(NULL);
+			if (last_spin_time + SPINNER_TIME < now) {
+				update_progress(spin[spinval++ % 4]);
+				last_spin_time = now;
+			}
+			spincheck = CHECK_SPINNER;
+		}
+	}
+
+	/*
+	 * Iterate over any nested datasets.
+	 */
+	if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	/*
+	 * Skip any datasets whose type does not match.
+	 */
+	if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+	libzfs_add_handle(cbp, zhp);
+	assert(cbp->cb_used <= cbp->cb_alloc);
+
+	return (0);
+}
+
+static void
+get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
+{
+	get_all_cb_t cb = { 0 };
+	cb.cb_verbose = verbose;
+	cb.cb_getone = get_one_dataset;
+
+	if (verbose)
+		set_progress_header(gettext("Reading ZFS config"));
+	(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
+
+	*dslist = cb.cb_handles;
+	*count = cb.cb_used;
+
+	if (verbose)
+		finish_progress(gettext("done."));
+}
+
+/*
+ * Generic callback for sharing or mounting filesystems.  Because the code is so
+ * similar, we have a common function with an extra parameter to determine which
+ * mode we are using.
+ */
+#define	OP_SHARE	0x1
+#define	OP_MOUNT	0x2
+
+/*
+ * Share or mount a dataset.
+ */
+static int
+share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
+    boolean_t explicit, const char *options)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char shareopts[ZFS_MAXPROPLEN];
+	char smbshareopts[ZFS_MAXPROPLEN];
+	const char *cmdname = op == OP_SHARE ? "share" : "mount";
+	struct mnttab mnt;
+	uint64_t zoned, canmount;
+	boolean_t shared_nfs, shared_smb;
+
+	assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
+
+	/*
+	 * Check to make sure we can mount/share this dataset.  If we
+	 * are in the global zone and the filesystem is exported to a
+	 * local zone, or if we are in a local zone and the
+	 * filesystem is not exported, then it is an error.
+	 */
+	zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+	if (zoned && getzoneid() == GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "dataset is exported to a local zone\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+
+	} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "permission denied\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * Ignore any filesystems which don't apply to us. This
+	 * includes those with a legacy mountpoint, or those with
+	 * legacy share options.
+	 */
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+	    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+	    strcmp(smbshareopts, "off") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot share '%s': "
+		    "legacy share\n"), zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use share(1M) to "
+		    "share this filesystem, or set "
+		    "sharenfs property on\n"));
+		return (1);
+	}
+
+	/*
+	 * We cannot share or mount legacy filesystems. If the
+	 * shareopts is non-legacy but the mountpoint is legacy, we
+	 * treat it as a legacy share.
+	 */
+	if (strcmp(mountpoint, "legacy") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use %s(1M) to "
+		    "%s this filesystem\n"), cmdname, cmdname);
+		return (1);
+	}
+
+	if (strcmp(mountpoint, "none") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': no "
+		    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * canmount	explicit	outcome
+	 * on		no		pass through
+	 * on		yes		pass through
+	 * off		no		return 0
+	 * off		yes		display error, return 1
+	 * noauto	no		return 0
+	 * noauto	yes		pass through
+	 */
+	canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+	if (canmount == ZFS_CANMOUNT_OFF) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "'canmount' property is set to 'off'\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+		return (0);
+	}
+
+	/*
+	 * At this point, we have verified that the mountpoint and/or
+	 * shareopts are appropriate for auto management. If the
+	 * filesystem is already mounted or shared, return (failing
+	 * for explicit requests); otherwise mount or share the
+	 * filesystem.
+	 */
+	switch (op) {
+	case OP_SHARE:
+
+		shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+		shared_smb = zfs_is_shared_smb(zhp, NULL);
+
+		if ((shared_nfs && shared_smb) ||
+		    ((shared_nfs && strcmp(shareopts, "on") == 0) &&
+		    (strcmp(smbshareopts, "off") == 0)) ||
+		    ((shared_smb && strcmp(smbshareopts, "on") == 0) &&
+		    (strcmp(shareopts, "off") == 0))) {
+			if (!explicit)
+				return (0);
+
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': filesystem already shared\n"),
+			    zfs_get_name(zhp));
+			return (1);
+		}
+
+		if (!zfs_is_mounted(zhp, NULL) &&
+		    zfs_mount(zhp, NULL, 0) != 0)
+			return (1);
+
+		if (protocol == NULL) {
+			if (zfs_shareall(zhp) != 0)
+				return (1);
+		} else if (strcmp(protocol, "nfs") == 0) {
+			if (zfs_share_nfs(zhp))
+				return (1);
+		} else if (strcmp(protocol, "smb") == 0) {
+			if (zfs_share_smb(zhp))
+				return (1);
+		} else {
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': invalid share type '%s' "
+			    "specified\n"),
+			    zfs_get_name(zhp), protocol);
+			return (1);
+		}
+
+		break;
+
+	case OP_MOUNT:
+		if (options == NULL)
+			mnt.mnt_mntopts = "";
+		else
+			mnt.mnt_mntopts = (char *)options;
+
+		if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+		    zfs_is_mounted(zhp, NULL)) {
+			if (!explicit)
+				return (0);
+
+			(void) fprintf(stderr, gettext("cannot mount "
+			    "'%s': filesystem already mounted\n"),
+			    zfs_get_name(zhp));
+			return (1);
+		}
+
+		if (zfs_mount(zhp, options, flags) != 0)
+			return (1);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Reports progress in the form "(current/total)".  Not thread-safe.
+ */
+static void
+report_mount_progress(int current, int total)
+{
+	static time_t last_progress_time = 0;
+	time_t now = time(NULL);
+	char info[32];
+
+	/* report 1..n instead of 0..n-1 */
+	++current;
+
+	/* display header if we're here for the first time */
+	if (current == 1) {
+		set_progress_header(gettext("Mounting ZFS filesystems"));
+	} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
+		/* too soon to report again */
+		return;
+	}
+
+	last_progress_time = now;
+
+	(void) sprintf(info, "(%d/%d)", current, total);
+
+	if (current == total)
+		finish_progress(info);
+	else
+		update_progress(info);
+}
+
+static void
+append_options(char *mntopts, char *newopts)
+{
+	int len = strlen(mntopts);
+
+	/* original length plus new string to append plus 1 for the comma */
+	if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
+		(void) fprintf(stderr, gettext("the opts argument for "
+		    "'%s' option is too long (more than %d chars)\n"),
+		    "-o", MNT_LINE_MAX);
+		usage(B_FALSE);
+	}
+
+	if (*mntopts)
+		mntopts[len++] = ',';
+
+	(void) strcpy(&mntopts[len], newopts);
+}
+
+static int
+share_mount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	boolean_t verbose = B_FALSE;
+	int c, ret = 0;
+	char *options = NULL;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
+	    != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case 'o':
+			if (*optarg == '\0') {
+				(void) fprintf(stderr, gettext("empty mount "
+				    "options (-o) specified\n"));
+				usage(B_FALSE);
+			}
+
+			if (options == NULL)
+				options = safe_malloc(MNT_LINE_MAX + 1);
+
+			/* option validation is done later */
+			append_options(options, optarg);
+			break;
+		case 'O':
+			flags |= MS_OVERLAY;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (do_all) {
+		zfs_handle_t **dslist = NULL;
+		size_t i, count = 0;
+		char *protocol = NULL;
+
+		if (op == OP_SHARE && argc > 0) {
+			if (strcmp(argv[0], "nfs") != 0 &&
+			    strcmp(argv[0], "smb") != 0) {
+				(void) fprintf(stderr, gettext("share type "
+				    "must be 'nfs' or 'smb'\n"));
+				usage(B_FALSE);
+			}
+			protocol = argv[0];
+			argc--;
+			argv++;
+		}
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		start_progress_timer();
+		get_all_datasets(&dslist, &count, verbose);
+
+		if (count == 0)
+			return (0);
+
+		qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
+
+		for (i = 0; i < count; i++) {
+			if (verbose)
+				report_mount_progress(i, count);
+
+			if (share_mount_one(dslist[i], op, flags, protocol,
+			    B_FALSE, options) != 0)
+				ret = 1;
+			zfs_close(dslist[i]);
+		}
+
+		free(dslist);
+	} else if (argc == 0) {
+		struct mnttab entry;
+
+		if ((op == OP_SHARE) || (options != NULL)) {
+			(void) fprintf(stderr, gettext("missing filesystem "
+			    "argument (specify -a for all)\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * When mount is given no arguments, go through /etc/mtab and
+		 * display any active ZFS mounts.  We hide any snapshots, since
+		 * they are controlled automatically.
+		 */
+
+		/* Reopen MNTTAB to prevent reading stale data from open file */
+		if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+			return (ENOENT);
+
+		while (getmntent(mnttab_file, &entry) == 0) {
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
+			    strchr(entry.mnt_special, '@') != NULL)
+				continue;
+
+			(void) printf("%-30s  %s\n", entry.mnt_special,
+			    entry.mnt_mountp);
+		}
+
+	} else {
+		zfs_handle_t *zhp;
+
+		if (argc > 1) {
+			(void) fprintf(stderr,
+			    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL) {
+			ret = 1;
+		} else {
+			ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
+			    options);
+			zfs_close(zhp);
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs mount -a [nfs]
+ * zfs mount filesystem
+ *
+ * Mount all filesystems, or mount the given filesystem.
+ */
+static int
+zfs_do_mount(int argc, char **argv)
+{
+	return (share_mount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs share -a [nfs | smb]
+ * zfs share filesystem
+ *
+ * Share all filesystems, or share the given filesystem.
+ */
+static int
+zfs_do_share(int argc, char **argv)
+{
+	return (share_mount(OP_SHARE, argc, argv));
+}
+
+typedef struct unshare_unmount_node {
+	zfs_handle_t	*un_zhp;
+	char		*un_mountp;
+	uu_avl_node_t	un_avlnode;
+} unshare_unmount_node_t;
+
+/* ARGSUSED */
+static int
+unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
+{
+	const unshare_unmount_node_t *l = larg;
+	const unshare_unmount_node_t *r = rarg;
+
+	return (strcmp(l->un_mountp, r->un_mountp));
+}
+
+/*
+ * Convenience routine used by zfs_do_umount() and manual_unmount().  Given an
+ * absolute path, find the entry /etc/mtab, verify that its a ZFS filesystem,
+ * and unmount it appropriately.
+ */
+static int
+unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
+{
+	zfs_handle_t *zhp;
+	int ret = 0;
+	struct stat64 statbuf;
+	struct extmnttab entry;
+	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
+	ino_t path_inode;
+
+	/*
+	 * Search for the path in /etc/mtab.  Rather than looking for the
+	 * specific path, which can be fooled by non-standard paths (i.e. ".."
+	 * or "//"), we stat() the path and search for the corresponding
+	 * (major,minor) device pair.
+	 */
+	if (stat64(path, &statbuf) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
+		    cmdname, path, strerror(errno));
+		return (1);
+	}
+	path_inode = statbuf.st_ino;
+
+	/*
+	 * Search for the given (major,minor) pair in the mount table.
+	 */
+
+	/* Reopen MNTTAB to prevent reading stale data from open file */
+	if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+		return (ENOENT);
+
+	while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
+		if (entry.mnt_major == major(statbuf.st_dev) &&
+		    entry.mnt_minor == minor(statbuf.st_dev))
+			break;
+	}
+	if (ret != 0) {
+		if (op == OP_SHARE) {
+			(void) fprintf(stderr, gettext("cannot %s '%s': not "
+			    "currently mounted\n"), cmdname, path);
+			return (1);
+		}
+		(void) fprintf(stderr, gettext("warning: %s not in mtab\n"),
+		    path);
+		if ((ret = umount2(path, flags)) != 0)
+			(void) fprintf(stderr, gettext("%s: %s\n"), path,
+			    strerror(errno));
+		return (ret != 0);
+	}
+
+	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
+		    "filesystem\n"), cmdname, path);
+		return (1);
+	}
+
+	if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+	    ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	ret = 1;
+	if (stat64(entry.mnt_mountp, &statbuf) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
+		    cmdname, path, strerror(errno));
+		goto out;
+	} else if (statbuf.st_ino != path_inode) {
+		(void) fprintf(stderr, gettext("cannot "
+		    "%s '%s': not a mountpoint\n"), cmdname, path);
+		goto out;
+	}
+
+	if (op == OP_SHARE) {
+		char nfs_mnt_prop[ZFS_MAXPROPLEN];
+		char smbshare_prop[ZFS_MAXPROPLEN];
+
+		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
+		    sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
+		verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
+		    sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (strcmp(nfs_mnt_prop, "off") == 0 &&
+		    strcmp(smbshare_prop, "off") == 0) {
+			(void) fprintf(stderr, gettext("cannot unshare "
+			    "'%s': legacy share\n"), path);
+			(void) fprintf(stderr, gettext("use exportfs(8) "
+			    "or smbcontrol(1) to unshare this filesystem\n"));
+		} else if (!zfs_is_shared(zhp)) {
+			(void) fprintf(stderr, gettext("cannot unshare '%s': "
+			    "not currently shared\n"), path);
+		} else {
+			ret = zfs_unshareall_bypath(zhp, path);
+		}
+	} else {
+		char mtpt_prop[ZFS_MAXPROPLEN];
+
+		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
+		    sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (is_manual) {
+			ret = zfs_unmount(zhp, NULL, flags);
+		} else if (strcmp(mtpt_prop, "legacy") == 0) {
+			(void) fprintf(stderr, gettext("cannot unmount "
+			    "'%s': legacy mountpoint\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use umount(8) "
+			    "to unmount this filesystem\n"));
+		} else {
+			ret = zfs_unmountall(zhp, flags);
+		}
+	}
+
+out:
+	zfs_close(zhp);
+
+	return (ret != 0);
+}
+
+/*
+ * Generic callback for unsharing or unmounting a filesystem.
+ */
+static int
+unshare_unmount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	int flags = 0;
+	int ret = 0;
+	int c;
+	zfs_handle_t *zhp;
+	char nfs_mnt_prop[ZFS_MAXPROPLEN];
+	char sharesmb[ZFS_MAXPROPLEN];
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_SHARE ? "a" : "af")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'f':
+			flags = MS_FORCE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (do_all) {
+		/*
+		 * We could make use of zfs_for_each() to walk all datasets in
+		 * the system, but this would be very inefficient, especially
+		 * since we would have to linearly search /etc/mtab for each
+		 * one.  Instead, do one pass through /etc/mtab looking for
+		 * zfs entries and call zfs_unmount() for each one.
+		 *
+		 * Things get a little tricky if the administrator has created
+		 * mountpoints beneath other ZFS filesystems.  In this case, we
+		 * have to unmount the deepest filesystems first.  To accomplish
+		 * this, we place all the mountpoints in an AVL tree sorted by
+		 * the special type (dataset name), and walk the result in
+		 * reverse to make sure to get any snapshots first.
+		 */
+		struct mnttab entry;
+		uu_avl_pool_t *pool;
+		uu_avl_t *tree = NULL;
+		unshare_unmount_node_t *node;
+		uu_avl_index_t idx;
+		uu_avl_walk_t *walk;
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if (((pool = uu_avl_pool_create("unmount_pool",
+		    sizeof (unshare_unmount_node_t),
+		    offsetof(unshare_unmount_node_t, un_avlnode),
+		    unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+		    ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+			nomem();
+
+		/* Reopen MNTTAB to prevent reading stale data from open file */
+		if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+			return (ENOENT);
+
+		while (getmntent(mnttab_file, &entry) == 0) {
+
+			/* ignore non-ZFS entries */
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
+				continue;
+
+			/* ignore snapshots */
+			if (strchr(entry.mnt_special, '@') != NULL)
+				continue;
+
+			if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+			    ZFS_TYPE_FILESYSTEM)) == NULL) {
+				ret = 1;
+				continue;
+			}
+
+			switch (op) {
+			case OP_SHARE:
+				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "off") != 0)
+					break;
+				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "off") == 0)
+					continue;
+				break;
+			case OP_MOUNT:
+				/* Ignore legacy mounts */
+				verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "legacy") == 0)
+					continue;
+				/* Ignore canmount=noauto mounts */
+				if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
+				    ZFS_CANMOUNT_NOAUTO)
+					continue;
+			default:
+				break;
+			}
+
+			node = safe_malloc(sizeof (unshare_unmount_node_t));
+			node->un_zhp = zhp;
+			node->un_mountp = safe_strdup(entry.mnt_mountp);
+
+			uu_avl_node_init(node, &node->un_avlnode, pool);
+
+			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
+				uu_avl_insert(tree, node, idx);
+			} else {
+				zfs_close(node->un_zhp);
+				free(node->un_mountp);
+				free(node);
+			}
+		}
+
+		/*
+		 * Walk the AVL tree in reverse, unmounting each filesystem and
+		 * removing it from the AVL tree in the process.
+		 */
+		if ((walk = uu_avl_walk_start(tree,
+		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+			nomem();
+
+		while ((node = uu_avl_walk_next(walk)) != NULL) {
+			uu_avl_remove(tree, node);
+
+			switch (op) {
+			case OP_SHARE:
+				if (zfs_unshareall_bypath(node->un_zhp,
+				    node->un_mountp) != 0)
+					ret = 1;
+				break;
+
+			case OP_MOUNT:
+				if (zfs_unmount(node->un_zhp,
+				    node->un_mountp, flags) != 0)
+					ret = 1;
+				break;
+			}
+
+			zfs_close(node->un_zhp);
+			free(node->un_mountp);
+			free(node);
+		}
+
+		uu_avl_walk_end(walk);
+		uu_avl_destroy(tree);
+		uu_avl_pool_destroy(pool);
+
+	} else {
+		if (argc != 1) {
+			if (argc == 0)
+				(void) fprintf(stderr,
+				    gettext("missing filesystem argument\n"));
+			else
+				(void) fprintf(stderr,
+				    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * We have an argument, but it may be a full path or a ZFS
+		 * filesystem.  Pass full paths off to unmount_path() (shared by
+		 * manual_unmount), otherwise open the filesystem and pass to
+		 * zfs_unmount().
+		 */
+		if (argv[0][0] == '/')
+			return (unshare_unmount_path(op, argv[0],
+			    flags, B_FALSE));
+
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL)
+			return (1);
+
+		verify(zfs_prop_get(zhp, op == OP_SHARE ?
+		    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+		    nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+		    NULL, 0, B_FALSE) == 0);
+
+		switch (op) {
+		case OP_SHARE:
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+			    nfs_mnt_prop,
+			    sizeof (nfs_mnt_prop),
+			    NULL, NULL, 0, B_FALSE) == 0);
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+			    sharesmb, sizeof (sharesmb), NULL, NULL,
+			    0, B_FALSE) == 0);
+
+			if (strcmp(nfs_mnt_prop, "off") == 0 &&
+			    strcmp(sharesmb, "off") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': legacy share\n"),
+				    zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use "
+				    "unshare(1M) to unshare this "
+				    "filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_shared(zhp)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': not currently "
+				    "shared\n"), zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unshareall(zhp) != 0) {
+				ret = 1;
+			}
+			break;
+
+		case OP_MOUNT:
+			if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unmount '%s': legacy "
+				    "mountpoint\n"), zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use "
+				    "umount(1M) to unmount this "
+				    "filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_mounted(zhp, NULL)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unmount '%s': not currently "
+				    "mounted\n"),
+				    zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unmountall(zhp, flags) != 0) {
+				ret = 1;
+			}
+			break;
+		}
+
+		zfs_close(zhp);
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs unmount -a
+ * zfs unmount filesystem
+ *
+ * Unmount all filesystems, or a specific ZFS filesystem.
+ */
+static int
+zfs_do_unmount(int argc, char **argv)
+{
+	return (unshare_unmount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs unshare -a
+ * zfs unshare filesystem
+ *
+ * Unshare all filesystems, or a specific ZFS filesystem.
+ */
+static int
+zfs_do_unshare(int argc, char **argv)
+{
+	return (unshare_unmount(OP_SHARE, argc, argv));
+}
+
+static int
+find_command_idx(char *command, int *idx)
+{
+	int i;
+
+	for (i = 0; i < NCOMMAND; i++) {
+		if (command_table[i].name == NULL)
+			continue;
+
+		if (strcmp(command, command_table[i].name) == 0) {
+			*idx = i;
+			return (0);
+		}
+	}
+	return (1);
+}
+
+static int
+zfs_do_diff(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int flags = 0;
+	char *tosnap = NULL;
+	char *fromsnap = NULL;
+	char *atp, *copy;
+	int err = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "FHt")) != -1) {
+		switch (c) {
+		case 'F':
+			flags |= ZFS_DIFF_CLASSIFY;
+			break;
+		case 'H':
+			flags |= ZFS_DIFF_PARSEABLE;
+			break;
+		case 't':
+			flags |= ZFS_DIFF_TIMESTAMP;
+			break;
+		default:
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr,
+		gettext("must provide at least one snapshot name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	fromsnap = argv[0];
+	tosnap = (argc == 2) ? argv[1] : NULL;
+
+	copy = NULL;
+	if (*fromsnap != '@')
+		copy = strdup(fromsnap);
+	else if (tosnap)
+		copy = strdup(tosnap);
+	if (copy == NULL)
+		usage(B_FALSE);
+
+	if ((atp = strchr(copy, '@')))
+		*atp = '\0';
+
+	if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	free(copy);
+
+	/*
+	 * Ignore SIGPIPE so that the library can give us
+	 * information on any failure
+	 */
+	(void) sigignore(SIGPIPE);
+
+	err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+/*
+ * zfs bookmark <fs@snap> <fs#bmark>
+ *
+ * Creates a bookmark with the given name from the given snapshot.
+ */
+static int
+zfs_do_bookmark(int argc, char **argv)
+{
+	char snapname[ZFS_MAXNAMELEN];
+	zfs_handle_t *zhp;
+	nvlist_t *nvl;
+	int ret = 0;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		goto usage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing bookmark argument\n"));
+		goto usage;
+	}
+
+	if (strchr(argv[1], '#') == NULL) {
+		(void) fprintf(stderr,
+		    gettext("invalid bookmark name '%s' -- "
+		    "must contain a '#'\n"), argv[1]);
+		goto usage;
+	}
+
+	if (argv[0][0] == '@') {
+		/*
+		 * Snapshot name begins with @.
+		 * Default to same fs as bookmark.
+		 */
+		(void) strncpy(snapname, argv[1], sizeof (snapname));
+		*strchr(snapname, '#') = '\0';
+		(void) strlcat(snapname, argv[0], sizeof (snapname));
+	} else {
+		(void) strncpy(snapname, argv[0], sizeof (snapname));
+	}
+	zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
+	if (zhp == NULL)
+		goto usage;
+	zfs_close(zhp);
+
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, argv[1], snapname);
+	ret = lzc_bookmark(nvl, NULL);
+	fnvlist_free(nvl);
+
+	if (ret != 0) {
+		const char *err_msg;
+		char errbuf[1024];
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot create bookmark '%s'"), argv[1]);
+
+		switch (ret) {
+		case EXDEV:
+			err_msg = "bookmark is in a different pool";
+			break;
+		case EEXIST:
+			err_msg = "bookmark exists";
+			break;
+		case EINVAL:
+			err_msg = "invalid argument";
+			break;
+		case ENOTSUP:
+			err_msg = "bookmark feature not enabled";
+			break;
+		case ENOSPC:
+			err_msg = "out of space";
+			break;
+		default:
+			err_msg = "unknown error";
+			break;
+		}
+		(void) fprintf(stderr, "%s: %s\n", errbuf,
+		    dgettext(TEXT_DOMAIN, err_msg));
+	}
+
+	return (ret != 0);
+
+usage:
+	usage(B_FALSE);
+	return (-1);
+}
+
+int
+main(int argc, char **argv)
+{
+	int ret = 0;
+	int i = 0;
+	char *cmdname;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	dprintf_setup(&argc, argv);
+
+	opterr = 0;
+
+	/*
+	 * Make sure the user has specified some command.
+	 */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing command\n"));
+		usage(B_FALSE);
+	}
+
+	cmdname = argv[1];
+
+	/*
+	 * The 'umount' command is an alias for 'unmount'
+	 */
+	if (strcmp(cmdname, "umount") == 0)
+		cmdname = "unmount";
+
+	/*
+	 * The 'recv' command is an alias for 'receive'
+	 */
+	if (strcmp(cmdname, "recv") == 0)
+		cmdname = "receive";
+
+	/*
+	 * The 'snap' command is an alias for 'snapshot'
+	 */
+	if (strcmp(cmdname, "snap") == 0)
+		cmdname = "snapshot";
+
+	/*
+	 * Special case '-?'
+	 */
+	if ((strcmp(cmdname, "-?") == 0) ||
+	    (strcmp(cmdname, "--help") == 0))
+		usage(B_TRUE);
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s", libzfs_error_init(errno));
+		return (1);
+	}
+
+	mnttab_file = g_zfs->libzfs_mnttab;
+
+	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	/*
+	 * Run the appropriate command.
+	 */
+	libzfs_mnttab_cache(g_zfs, B_TRUE);
+	if (find_command_idx(cmdname, &i) == 0) {
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc - 1, argv + 1);
+	} else if (strchr(cmdname, '=') != NULL) {
+		verify(find_command_idx("set", &i) == 0);
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc, argv);
+	} else {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(B_FALSE);
+		ret = 1;
+	}
+
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 1dc9eff77017..d28146a372ca 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -3728,7 +3728,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
-	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
+	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
+	    ZIO_PRIORITY_SYNC_READ);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
@@ -5930,8 +5931,10 @@ ztest_run(ztest_shared_t *zs)
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
-	for (object = 1; object < 50; object++)
-		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+	for (object = 1; object < 50; object++) {
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
 
 	/* Verify that at least one commit cb was called in a timely fashion */
 	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
diff --git a/cmd/ztest/ztest.c.orig b/cmd/ztest/ztest.c.orig
new file mode 100644
index 000000000000..1dc9eff77017
--- /dev/null
+++ b/cmd/ztest/ztest.c.orig
@@ -0,0 +1,6560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+/*
+ * The objective of this program is to provide a DMU/ZAP/SPA stress test
+ * that runs entirely in userland, is easy to use, and easy to extend.
+ *
+ * The overall design of the ztest program is as follows:
+ *
+ * (1) For each major functional area (e.g. adding vdevs to a pool,
+ *     creating and destroying datasets, reading and writing objects, etc)
+ *     we have a simple routine to test that functionality.  These
+ *     individual routines do not have to do anything "stressful".
+ *
+ * (2) We turn these simple functionality tests into a stress test by
+ *     running them all in parallel, with as many threads as desired,
+ *     and spread across as many datasets, objects, and vdevs as desired.
+ *
+ * (3) While all this is happening, we inject faults into the pool to
+ *     verify that self-healing data really works.
+ *
+ * (4) Every time we open a dataset, we change its checksum and compression
+ *     functions.  Thus even individual objects vary from block to block
+ *     in which checksum they use and whether they're compressed.
+ *
+ * (5) To verify that we never lose on-disk consistency after a crash,
+ *     we run the entire test in a child of the main process.
+ *     At random times, the child self-immolates with a SIGKILL.
+ *     This is the software equivalent of pulling the power cord.
+ *     The parent then runs the test again, using the existing
+ *     storage pool, as many times as desired. If backwards compatibility
+ *     testing is enabled ztest will sometimes run the "older" version
+ *     of ztest after a SIGKILL.
+ *
+ * (6) To verify that we don't have future leaks or temporal incursions,
+ *     many of the functional tests record the transaction group number
+ *     as part of their data.  When reading old data, they verify that
+ *     the transaction group number is less than the current, open txg.
+ *     If you add a new test, please do this if applicable.
+ *
+ * (7) Threads are created with a reduced stack size, for sanity checking.
+ *     Therefore, it's important not to allocate huge buffers on the stack.
+ *
+ * When run with no arguments, ztest runs for about five minutes and
+ * produces no output if successful.  To get a little bit of information,
+ * specify -V.  To get more information, specify -VV, and so on.
+ *
+ * To turn this into an overnight stress test, use -T to specify run time.
+ *
+ * You can ask more more vdevs [-v], datasets [-d], or threads [-t]
+ * to increase the pool capacity, fanout, and overall stress level.
+ *
+ * Use the -k option to set the desired frequency of kills.
+ *
+ * When ztest invokes itself it passes all relevant information through a
+ * temporary file which is mmap-ed in the child process. This allows shared
+ * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
+ * stored at offset 0 of this file and contains information on the size and
+ * number of shared structures in the file. The information stored in this file
+ * must remain backwards compatible with older versions of ztest so that
+ * ztest can invoke them during backwards compatibility testing (-B).
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dmu_objset.h>
+#include <sys/poll.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
+#include <sys/refcount.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
+#include <stdio.h>
+#include <stdio_ext.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <umem.h>
+#include <ctype.h>
+#include <math.h>
+#include <sys/fs/zfs.h>
+#include <libnvpair.h>
+#ifdef __GNUC__
+#include <execinfo.h> /* for backtrace() */
+#endif
+
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
+
+typedef struct ztest_shared_hdr {
+	uint64_t	zh_hdr_size;
+	uint64_t	zh_opts_size;
+	uint64_t	zh_size;
+	uint64_t	zh_stats_size;
+	uint64_t	zh_stats_count;
+	uint64_t	zh_ds_size;
+	uint64_t	zh_ds_count;
+} ztest_shared_hdr_t;
+
+static ztest_shared_hdr_t *ztest_shared_hdr;
+
+typedef struct ztest_shared_opts {
+	char zo_pool[MAXNAMELEN];
+	char zo_dir[MAXNAMELEN];
+	char zo_alt_ztest[MAXNAMELEN];
+	char zo_alt_libpath[MAXNAMELEN];
+	uint64_t zo_vdevs;
+	uint64_t zo_vdevtime;
+	size_t zo_vdev_size;
+	int zo_ashift;
+	int zo_mirrors;
+	int zo_raidz;
+	int zo_raidz_parity;
+	int zo_datasets;
+	int zo_threads;
+	uint64_t zo_passtime;
+	uint64_t zo_killrate;
+	int zo_verbose;
+	int zo_init;
+	uint64_t zo_time;
+	uint64_t zo_maxloops;
+	uint64_t zo_metaslab_gang_bang;
+} ztest_shared_opts_t;
+
+static const ztest_shared_opts_t ztest_opts_defaults = {
+	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
+	.zo_dir = { '/', 't', 'm', 'p', '\0' },
+	.zo_alt_ztest = { '\0' },
+	.zo_alt_libpath = { '\0' },
+	.zo_vdevs = 5,
+	.zo_ashift = SPA_MINBLOCKSHIFT,
+	.zo_mirrors = 2,
+	.zo_raidz = 4,
+	.zo_raidz_parity = 1,
+	.zo_vdev_size = SPA_MINDEVSIZE,
+	.zo_datasets = 7,
+	.zo_threads = 23,
+	.zo_passtime = 60,		/* 60 seconds */
+	.zo_killrate = 70,		/* 70% kill rate */
+	.zo_verbose = 0,
+	.zo_init = 1,
+	.zo_time = 300,			/* 5 minutes */
+	.zo_maxloops = 50,		/* max loops during spa_freeze() */
+	.zo_metaslab_gang_bang = 32 << 10
+};
+
+extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
+extern int metaslab_preload_limit;
+
+static ztest_shared_opts_t *ztest_shared_opts;
+static ztest_shared_opts_t ztest_opts;
+
+typedef struct ztest_shared_ds {
+	uint64_t	zd_seq;
+} ztest_shared_ds_t;
+
+static ztest_shared_ds_t *ztest_shared_ds;
+#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
+
+#define	BT_MAGIC	0x123456789abcdefULL
+#define	MAXFAULTS() \
+	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
+
+enum ztest_io_type {
+	ZTEST_IO_WRITE_TAG,
+	ZTEST_IO_WRITE_PATTERN,
+	ZTEST_IO_WRITE_ZEROES,
+	ZTEST_IO_TRUNCATE,
+	ZTEST_IO_SETATTR,
+	ZTEST_IO_REWRITE,
+	ZTEST_IO_TYPES
+};
+
+typedef struct ztest_block_tag {
+	uint64_t	bt_magic;
+	uint64_t	bt_objset;
+	uint64_t	bt_object;
+	uint64_t	bt_offset;
+	uint64_t	bt_gen;
+	uint64_t	bt_txg;
+	uint64_t	bt_crtxg;
+} ztest_block_tag_t;
+
+typedef struct bufwad {
+	uint64_t	bw_index;
+	uint64_t	bw_txg;
+	uint64_t	bw_data;
+} bufwad_t;
+
+/*
+ * XXX -- fix zfs range locks to be generic so we can use them here.
+ */
+typedef enum {
+	RL_READER,
+	RL_WRITER,
+	RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+	void		*rll_writer;
+	int		rll_readers;
+	kmutex_t	rll_lock;
+	kcondvar_t	rll_cv;
+} rll_t;
+
+typedef struct rl {
+	uint64_t	rl_object;
+	uint64_t	rl_offset;
+	uint64_t	rl_size;
+	rll_t		*rl_lock;
+} rl_t;
+
+#define	ZTEST_RANGE_LOCKS	64
+#define	ZTEST_OBJECT_LOCKS	64
+
+/*
+ * Object descriptor.  Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+	uint64_t	od_dir;
+	uint64_t	od_object;
+	dmu_object_type_t od_type;
+	dmu_object_type_t od_crtype;
+	uint64_t	od_blocksize;
+	uint64_t	od_crblocksize;
+	uint64_t	od_gen;
+	uint64_t	od_crgen;
+	char		od_name[MAXNAMELEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+	ztest_shared_ds_t *zd_shared;
+	objset_t	*zd_os;
+	rwlock_t	zd_zilog_lock;
+	zilog_t		*zd_zilog;
+	ztest_od_t	*zd_od;		/* debugging aid */
+	char		zd_name[MAXNAMELEN];
+	kmutex_t	zd_dirobj_lock;
+	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
+	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+	ztest_func_t	*zi_func;	/* test function */
+	uint64_t	zi_iters;	/* iterations per execution */
+	uint64_t	*zi_interval;	/* execute every <interval> seconds */
+	const char	*zi_funcname;	/* name of test function */
+} ztest_info_t;
+
+typedef struct ztest_shared_callstate {
+	uint64_t	zc_count;	/* per-pass count */
+	uint64_t	zc_time;	/* per-pass time */
+	uint64_t	zc_next;	/* next time to call this function */
+} ztest_shared_callstate_t;
+
+static ztest_shared_callstate_t *ztest_shared_callstate;
+#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
+
+ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_write_parallel;
+ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_commit_callbacks;
+ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
+ztest_func_t ztest_dmu_read_write_zcopy;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
+ztest_func_t ztest_fzap;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_spa_prop_get_set;
+ztest_func_t ztest_spa_create_destroy;
+ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_ddt_repair;
+ztest_func_t ztest_dmu_snapshot_hold;
+ztest_func_t ztest_spa_rename;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_LUN_growth;
+ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_vdev_aux_add_remove;
+ztest_func_t ztest_split_pool;
+ztest_func_t ztest_reguid;
+ztest_func_t ztest_spa_upgrade;
+
+uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */
+
+#define	ZTI_INIT(func, iters, interval) \
+	{   .zi_func = (func), \
+	    .zi_iters = (iters), \
+	    .zi_interval = (interval), \
+	    .zi_funcname = # func }
+
+ztest_info_t ztest_info[] = {
+	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
+	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
+	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
+	ZTI_INIT(ztest_zap, 30, &zopt_always),
+	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
+	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
+	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
+	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
+	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
+	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
+	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
+#if 0
+	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
+#endif
+	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_ddt_repair, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
+	ZTI_INIT(ztest_spa_rename, 1, &zopt_rarely),
+	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
+	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
+	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
+	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
+	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
+	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
+};
+
+#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
+
+/*
+ * The following struct is used to hold a list of uncalled commit callbacks.
+ * The callbacks are ordered by txg number.
+ */
+typedef struct ztest_cb_list {
+	kmutex_t	zcl_callbacks_lock;
+	list_t		zcl_callbacks;
+} ztest_cb_list_t;
+
+/*
+ * Stuff we need to share writably between parent and child.
+ */
+typedef struct ztest_shared {
+	boolean_t	zs_do_init;
+	hrtime_t	zs_proc_start;
+	hrtime_t	zs_proc_stop;
+	hrtime_t	zs_thread_start;
+	hrtime_t	zs_thread_stop;
+	hrtime_t	zs_thread_kill;
+	uint64_t	zs_enospc_count;
+	uint64_t	zs_vdev_next_leaf;
+	uint64_t	zs_vdev_aux;
+	uint64_t	zs_alloc;
+	uint64_t	zs_space;
+	uint64_t	zs_splits;
+	uint64_t	zs_mirrors;
+	uint64_t	zs_metaslab_sz;
+	uint64_t	zs_metaslab_df_alloc_threshold;
+	uint64_t	zs_guid;
+} ztest_shared_t;
+
+#define	ID_PARALLEL	-1ULL
+
+static char ztest_dev_template[] = "%s/%s.%llua";
+static char ztest_aux_template[] = "%s/%s.%s.%llu";
+ztest_shared_t *ztest_shared;
+
+static spa_t *ztest_spa = NULL;
+static ztest_ds_t *ztest_ds;
+
+static kmutex_t ztest_vdev_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
+static rwlock_t ztest_name_lock;
+
+static boolean_t ztest_dump_core = B_TRUE;
+static boolean_t ztest_exiting;
+
+/* Global commit callback list */
+static ztest_cb_list_t zcl;
+/* Commit cb delay */
+static uint64_t zc_min_txg_delay = UINT64_MAX;
+static int zc_cb_counter = 0;
+
+/*
+ * Minimum number of commit callbacks that need to be registered for us to check
+ * whether the minimum txg delay is acceptable.
+ */
+#define	ZTEST_COMMIT_CB_MIN_REG	100
+
+/*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)
+
+extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
+
+enum ztest_object {
+	ZTEST_META_DNODE = 0,
+	ZTEST_DIROBJ,
+	ZTEST_OBJECTS
+};
+
+static void usage(boolean_t) __NORETURN;
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+#define	BACKTRACE_SZ	100
+
+static void sig_handler(int signo)
+{
+	struct sigaction action;
+#ifdef __GNUC__ /* backtrace() is a GNU extension */
+	int nptrs;
+	void *buffer[BACKTRACE_SZ];
+
+	nptrs = backtrace(buffer, BACKTRACE_SZ);
+	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
+#endif
+
+	/*
+	 * Restore default action and re-raise signal so SIGSEGV and
+	 * SIGABRT can trigger a core dump.
+	 */
+	action.sa_handler = SIG_DFL;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	(void) sigaction(signo, &action, NULL);
+	raise(signo);
+}
+
+#define	FATAL_MSG_SZ	1024
+
+char *fatal_msg;
+
+static void
+fatal(int do_perror, char *message, ...)
+{
+	va_list args;
+	int save_errno = errno;
+	char *buf;
+
+	(void) fflush(stdout);
+	buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
+
+	va_start(args, message);
+	(void) sprintf(buf, "ztest: ");
+	/* LINTED */
+	(void) vsprintf(buf + strlen(buf), message, args);
+	va_end(args);
+	if (do_perror) {
+		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+		    ": %s", strerror(save_errno));
+	}
+	(void) fprintf(stderr, "%s\n", buf);
+	fatal_msg = buf;			/* to ease debugging */
+	if (ztest_dump_core)
+		abort();
+	exit(3);
+}
+
+static int
+str2shift(const char *buf)
+{
+	const char *ends = "BKMGTPEZ";
+	int i;
+
+	if (buf[0] == '\0')
+		return (0);
+	for (i = 0; i < strlen(ends); i++) {
+		if (toupper(buf[0]) == ends[i])
+			break;
+	}
+	if (i == strlen(ends)) {
+		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
+		    buf);
+		usage(B_FALSE);
+	}
+	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
+		return (10*i);
+	}
+	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
+	usage(B_FALSE);
+	/* NOTREACHED */
+}
+
+static uint64_t
+nicenumtoull(const char *buf)
+{
+	char *end;
+	uint64_t val;
+
+	val = strtoull(buf, &end, 0);
+	if (end == buf) {
+		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
+		usage(B_FALSE);
+	} else if (end[0] == '.') {
+		double fval = strtod(buf, &end);
+		fval *= pow(2, str2shift(end));
+		if (fval > UINT64_MAX) {
+			(void) fprintf(stderr, "ztest: value too large: %s\n",
+			    buf);
+			usage(B_FALSE);
+		}
+		val = (uint64_t)fval;
+	} else {
+		int shift = str2shift(end);
+		if (shift >= 64 || (val << shift) >> shift != val) {
+			(void) fprintf(stderr, "ztest: value too large: %s\n",
+			    buf);
+			usage(B_FALSE);
+		}
+		val <<= shift;
+	}
+	return (val);
+}
+
+static void
+usage(boolean_t requested)
+{
+	const ztest_shared_opts_t *zo = &ztest_opts_defaults;
+
+	char nice_vdev_size[10];
+	char nice_gang_bang[10];
+	FILE *fp = requested ? stdout : stderr;
+
+	nicenum(zo->zo_vdev_size, nice_vdev_size);
+	nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang);
+
+	(void) fprintf(fp, "Usage: %s\n"
+	    "\t[-v vdevs (default: %llu)]\n"
+	    "\t[-s size_of_each_vdev (default: %s)]\n"
+	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
+	    "\t[-m mirror_copies (default: %d)]\n"
+	    "\t[-r raidz_disks (default: %d)]\n"
+	    "\t[-R raidz_parity (default: %d)]\n"
+	    "\t[-d datasets (default: %d)]\n"
+	    "\t[-t threads (default: %d)]\n"
+	    "\t[-g gang_block_threshold (default: %s)]\n"
+	    "\t[-i init_count (default: %d)] initialize pool i times\n"
+	    "\t[-k kill_percentage (default: %llu%%)]\n"
+	    "\t[-p pool_name (default: %s)]\n"
+	    "\t[-f dir (default: %s)] file directory for vdev files\n"
+	    "\t[-V] verbose (use multiple times for ever more blather)\n"
+	    "\t[-E] use existing pool instead of creating new one\n"
+	    "\t[-T time (default: %llu sec)] total run time\n"
+	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+	    "\t[-P passtime (default: %llu sec)] time per pass\n"
+	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
+	    "\t[-h] (print help)\n"
+	    "",
+	    zo->zo_pool,
+	    (u_longlong_t)zo->zo_vdevs,			/* -v */
+	    nice_vdev_size,				/* -s */
+	    zo->zo_ashift,				/* -a */
+	    zo->zo_mirrors,				/* -m */
+	    zo->zo_raidz,				/* -r */
+	    zo->zo_raidz_parity,			/* -R */
+	    zo->zo_datasets,				/* -d */
+	    zo->zo_threads,				/* -t */
+	    nice_gang_bang,				/* -g */
+	    zo->zo_init,				/* -i */
+	    (u_longlong_t)zo->zo_killrate,		/* -k */
+	    zo->zo_pool,				/* -p */
+	    zo->zo_dir,					/* -f */
+	    (u_longlong_t)zo->zo_time,			/* -T */
+	    (u_longlong_t)zo->zo_maxloops,		/* -F */
+	    (u_longlong_t)zo->zo_passtime);
+	exit(requested ? 0 : 1);
+}
+
+static void
+process_options(int argc, char **argv)
+{
+	char *path;
+	ztest_shared_opts_t *zo = &ztest_opts;
+
+	int opt;
+	uint64_t value;
+	char altdir[MAXNAMELEN] = { 0 };
+
+	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
+
+	while ((opt = getopt(argc, argv,
+	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) {
+		value = 0;
+		switch (opt) {
+		case 'v':
+		case 's':
+		case 'a':
+		case 'm':
+		case 'r':
+		case 'R':
+		case 'd':
+		case 't':
+		case 'g':
+		case 'i':
+		case 'k':
+		case 'T':
+		case 'P':
+		case 'F':
+			value = nicenumtoull(optarg);
+		}
+		switch (opt) {
+		case 'v':
+			zo->zo_vdevs = value;
+			break;
+		case 's':
+			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
+			break;
+		case 'a':
+			zo->zo_ashift = value;
+			break;
+		case 'm':
+			zo->zo_mirrors = value;
+			break;
+		case 'r':
+			zo->zo_raidz = MAX(1, value);
+			break;
+		case 'R':
+			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
+			break;
+		case 'd':
+			zo->zo_datasets = MAX(1, value);
+			break;
+		case 't':
+			zo->zo_threads = MAX(1, value);
+			break;
+		case 'g':
+			zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
+			    value);
+			break;
+		case 'i':
+			zo->zo_init = value;
+			break;
+		case 'k':
+			zo->zo_killrate = value;
+			break;
+		case 'p':
+			(void) strlcpy(zo->zo_pool, optarg,
+			    sizeof (zo->zo_pool));
+			break;
+		case 'f':
+			path = realpath(optarg, NULL);
+			if (path == NULL) {
+				(void) fprintf(stderr, "error: %s: %s\n",
+				    optarg, strerror(errno));
+				usage(B_FALSE);
+			} else {
+				(void) strlcpy(zo->zo_dir, path,
+				    sizeof (zo->zo_dir));
+			}
+			break;
+		case 'V':
+			zo->zo_verbose++;
+			break;
+		case 'E':
+			zo->zo_init = 0;
+			break;
+		case 'T':
+			zo->zo_time = value;
+			break;
+		case 'P':
+			zo->zo_passtime = MAX(1, value);
+			break;
+		case 'F':
+			zo->zo_maxloops = MAX(1, value);
+			break;
+		case 'B':
+			(void) strlcpy(altdir, optarg, sizeof (altdir));
+			break;
+		case 'h':
+			usage(B_TRUE);
+			break;
+		case '?':
+		default:
+			usage(B_FALSE);
+			break;
+		}
+	}
+
+	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
+
+	zo->zo_vdevtime =
+	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
+	    UINT64_MAX >> 2);
+
+	if (strlen(altdir) > 0) {
+		char *cmd;
+		char *realaltdir;
+		char *bin;
+		char *ztest;
+		char *isa;
+		int isalen;
+
+		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+		VERIFY(NULL != realpath(getexecname(), cmd));
+		if (0 != access(altdir, F_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest path: %s",
+			    altdir);
+		}
+		VERIFY(NULL != realpath(altdir, realaltdir));
+
+		/*
+		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
+		 * We want to extract <isa> to determine if we should use
+		 * 32 or 64 bit binaries.
+		 */
+		bin = strstr(cmd, "/usr/bin/");
+		ztest = strstr(bin, "/ztest");
+		isa = bin + 9;
+		isalen = ztest - isa;
+		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
+		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
+		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
+		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);
+
+		if (0 != access(zo->zo_alt_ztest, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest: %s",
+			    zo->zo_alt_ztest);
+		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate lib directory %s",
+			    zo->zo_alt_libpath);
+		}
+
+		umem_free(cmd, MAXPATHLEN);
+		umem_free(realaltdir, MAXPATHLEN);
+	}
+}
+
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
+
+	/*
+	 * Before we kill off ztest, make sure that the config is updated.
+	 * See comment above spa_config_sync().
+	 */
+	mutex_enter(&spa_namespace_lock);
+	spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
+	mutex_exit(&spa_namespace_lock);
+
+	(void) kill(getpid(), SIGKILL);
+}
+
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t r;
+
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
+	if (range == 0)
+		return (0);
+
+	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
+		fatal(1, "short read from /dev/urandom");
+
+	return (r % range);
+}
+
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+	ztest_shared->zs_enospc_count++;
+}
+
+static uint64_t
+ztest_get_ashift(void)
+{
+	if (ztest_opts.zo_ashift == 0)
+		return (SPA_MINBLOCKSHIFT + ztest_random(5));
+	return (ztest_opts.zo_ashift);
+}
+
+static nvlist_t *
+make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
+{
+	char *pathbuf;
+	uint64_t vdev;
+	nvlist_t *file;
+
+	pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	if (ashift == 0)
+		ashift = ztest_get_ashift();
+
+	if (path == NULL) {
+		path = pathbuf;
+
+		if (aux != NULL) {
+			vdev = ztest_shared->zs_vdev_aux;
+			(void) snprintf(path, MAXPATHLEN,
+			    ztest_aux_template, ztest_opts.zo_dir,
+			    pool == NULL ? ztest_opts.zo_pool : pool,
+			    aux, vdev);
+		} else {
+			vdev = ztest_shared->zs_vdev_next_leaf++;
+			(void) snprintf(path, MAXPATHLEN,
+			    ztest_dev_template, ztest_opts.zo_dir,
+			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
+		}
+	}
+
+	if (size != 0) {
+		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
+		if (fd == -1)
+			fatal(1, "can't open %s", path);
+		if (ftruncate(fd, size) != 0)
+			fatal(1, "can't ftruncate %s", path);
+		(void) close(fd);
+	}
+
+	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
+	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+	umem_free(pathbuf, MAXPATHLEN);
+
+	return (file);
+}
+
+static nvlist_t *
+make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r)
+{
+	nvlist_t *raidz, **child;
+	int c;
+
+	if (r < 2)
+		return (make_vdev_file(path, aux, pool, size, ashift));
+	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+	for (c = 0; c < r; c++)
+		child[c] = make_vdev_file(path, aux, pool, size, ashift);
+
+	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_RAIDZ) == 0);
+	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
+	    ztest_opts.zo_raidz_parity) == 0);
+	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
+	    child, r) == 0);
+
+	for (c = 0; c < r; c++)
+		nvlist_free(child[c]);
+
+	umem_free(child, r * sizeof (nvlist_t *));
+
+	return (raidz);
+}
+
+static nvlist_t *
+make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r, int m)
+{
+	nvlist_t *mirror, **child;
+	int c;
+
+	if (m < 1)
+		return (make_vdev_raidz(path, aux, pool, size, ashift, r));
+
+	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+	for (c = 0; c < m; c++)
+		child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
+
+	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_MIRROR) == 0);
+	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
+	    child, m) == 0);
+
+	for (c = 0; c < m; c++)
+		nvlist_free(child[c]);
+
+	umem_free(child, m * sizeof (nvlist_t *));
+
+	return (mirror);
+}
+
+static nvlist_t *
+make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
+    int log, int r, int m, int t)
+{
+	nvlist_t *root, **child;
+	int c;
+
+	ASSERT(t > 0);
+
+	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+	for (c = 0; c < t; c++) {
+		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
+		    r, m);
+		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    log) == 0);
+	}
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
+	    child, t) == 0);
+
+	for (c = 0; c < t; c++)
+		nvlist_free(child[c]);
+
+	umem_free(child, t * sizeof (nvlist_t *));
+
+	return (root);
+}
+
+/*
+ * Find a random spa version. Returns back a random spa version in the
+ * range [initial_version, SPA_VERSION_FEATURES].
+ */
+static uint64_t
+ztest_random_spa_version(uint64_t initial_version)
+{
+	uint64_t version = initial_version;
+
+	if (version <= SPA_VERSION_BEFORE_FEATURES) {
+		version = version +
+		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
+	}
+
+	if (version > SPA_VERSION_BEFORE_FEATURES)
+		version = SPA_VERSION_FEATURES;
+
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+	return (version);
+}
+
+static int
+ztest_random_blocksize(void)
+{
+	/*
+	 * Choose a block size >= the ashift.
+	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+	 */
+	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+		maxbs = 20;
+	uint64_t block_shift =
+	    ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
+	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
+}
+
+static int
+ztest_random_ibshift(void)
+{
+	return (DN_MIN_INDBLKSHIFT +
+	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
+}
+
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
+{
+	uint64_t top;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	do {
+		top = ztest_random(rvd->vdev_children);
+		tvd = rvd->vdev_child[top];
+	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
+
+	return (top);
+}
+
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+	uint64_t value;
+
+	do {
+		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
+	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
+
+	return (value);
+}
+
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+    boolean_t inherit)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	const char *valname;
+	char *setpoint;
+	uint64_t curval;
+	int error;
+
+	error = dsl_prop_set_int(osname, propname,
+	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT0(error);
+
+	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
+
+	if (ztest_opts.zo_verbose >= 6) {
+		int err;
+
+		err = zfs_prop_index_to_string(prop, curval, &valname);
+		if (err)
+			(void) printf("%s %s = %llu at '%s'\n",
+			    osname, propname, (unsigned long long)curval,
+				setpoint);
+		else
+			(void) printf("%s %s = %s at '%s'\n",
+			    osname, propname, valname, setpoint);
+	}
+	umem_free(setpoint, MAXPATHLEN);
+
+	return (error);
+}
+
+static int
+ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
+{
+	spa_t *spa = ztest_spa;
+	nvlist_t *props = NULL;
+	int error;
+
+	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+	error = spa_prop_set(spa, props);
+
+	nvlist_free(props);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT0(error);
+
+	return (error);
+}
+
+static void
+ztest_rll_init(rll_t *rll)
+{
+	rll->rll_writer = NULL;
+	rll->rll_readers = 0;
+	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
+}
+
+static void
+ztest_rll_destroy(rll_t *rll)
+{
+	ASSERT(rll->rll_writer == NULL);
+	ASSERT(rll->rll_readers == 0);
+	mutex_destroy(&rll->rll_lock);
+	cv_destroy(&rll->rll_cv);
+}
+
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+	mutex_enter(&rll->rll_lock);
+
+	if (type == RL_READER) {
+		while (rll->rll_writer != NULL)
+			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_readers++;
+	} else {
+		while (rll->rll_writer != NULL || rll->rll_readers)
+			(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_writer = curthread;
+	}
+
+	mutex_exit(&rll->rll_lock);
+}
+
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+	mutex_enter(&rll->rll_lock);
+
+	if (rll->rll_writer) {
+		ASSERT(rll->rll_readers == 0);
+		rll->rll_writer = NULL;
+	} else {
+		ASSERT(rll->rll_readers != 0);
+		ASSERT(rll->rll_writer == NULL);
+		rll->rll_readers--;
+	}
+
+	if (rll->rll_writer == NULL && rll->rll_readers == 0)
+		cv_broadcast(&rll->rll_cv);
+
+	mutex_exit(&rll->rll_lock);
+}
+
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+	ztest_rll_lock(rll, type);
+}
+
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+	ztest_rll_unlock(rll);
+}
+
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+    uint64_t size, rl_type_t type)
+{
+	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+	rl_t *rl;
+
+	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+	rl->rl_object = object;
+	rl->rl_offset = offset;
+	rl->rl_size = size;
+	rl->rl_lock = rll;
+
+	ztest_rll_lock(rll, type);
+
+	return (rl);
+}
+
+static void
+ztest_range_unlock(rl_t *rl)
+{
+	rll_t *rll = rl->rl_lock;
+
+	ztest_rll_unlock(rll);
+
+	umem_free(rl, sizeof (*rl));
+}
+
+static void
+ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
+{
+	zd->zd_os = os;
+	zd->zd_zilog = dmu_objset_zil(os);
+	zd->zd_shared = szd;
+	dmu_objset_name(os, zd->zd_name);
+	int l;
+
+	if (zd->zd_shared != NULL)
+		zd->zd_shared->zd_seq = 0;
+
+	VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
+	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+		ztest_rll_init(&zd->zd_object_lock[l]);
+
+	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+		ztest_rll_init(&zd->zd_range_lock[l]);
+}
+
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+	int l;
+
+	mutex_destroy(&zd->zd_dirobj_lock);
+	(void) rwlock_destroy(&zd->zd_zilog_lock);
+
+	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+		ztest_rll_destroy(&zd->zd_object_lock[l]);
+
+	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+		ztest_rll_destroy(&zd->zd_range_lock[l]);
+}
+
+#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+{
+	uint64_t txg;
+	int error;
+
+	/*
+	 * Attempt to assign tx to some transaction group.
+	 */
+	error = dmu_tx_assign(tx, txg_how);
+	if (error) {
+		if (error == ERESTART) {
+			ASSERT(txg_how == TXG_NOWAIT);
+			dmu_tx_wait(tx);
+		} else {
+			ASSERT3U(error, ==, ENOSPC);
+			ztest_record_enospc(tag);
+		}
+		dmu_tx_abort(tx);
+		return (0);
+	}
+	txg = dmu_tx_get_txg(tx);
+	ASSERT(txg != 0);
+	return (txg);
+}
+
+static void
+ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *ip = buf;
+	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+
+	while (ip < ip_end)
+		*ip++ = value;
+}
+
+#ifndef NDEBUG
+static boolean_t
+ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *ip = buf;
+	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+	uint64_t diff = 0;
+
+	while (ip < ip_end)
+		diff |= (value - *ip++);
+
+	return (diff == 0);
+}
+#endif
+
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+	bt->bt_magic = BT_MAGIC;
+	bt->bt_objset = dmu_objset_id(os);
+	bt->bt_object = object;
+	bt->bt_offset = offset;
+	bt->bt_gen = gen;
+	bt->bt_txg = txg;
+	bt->bt_crtxg = crtxg;
+}
+
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+	ASSERT3U(bt->bt_object, ==, object);
+	ASSERT3U(bt->bt_offset, ==, offset);
+	ASSERT3U(bt->bt_gen, <=, gen);
+	ASSERT3U(bt->bt_txg, <=, txg);
+	ASSERT3U(bt->bt_crtxg, ==, crtxg);
+}
+
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+	dmu_object_info_t doi;
+	ztest_block_tag_t *bt;
+
+	dmu_object_info_from_db(db, &doi);
+	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+	return (bt);
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define	lrz_type	lr_mode
+#define	lrz_blocksize	lr_uid
+#define	lrz_ibshift	lr_gid
+#define	lrz_bonustype	lr_rdev
+#define	lrz_bonuslen	lr_crtime[1]
+
+static void
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t namesize = strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) + namesize - sizeof (lr_t));
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t namesize = strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) + namesize - sizeof (lr_t));
+
+	itx->itx_oid = object;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+	itx_t *itx;
+	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	if (lr->lr_length > ZIL_MAX_LOG_DATA)
+		write_state = WR_INDIRECT;
+
+	itx = zil_itx_create(TX_WRITE,
+	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+	if (write_state == WR_COPIED &&
+	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+		zil_itx_destroy(itx);
+		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+		write_state = WR_NEED_COPY;
+	}
+	itx->itx_private = zd;
+	itx->itx_wr_state = write_state;
+	itx->itx_sync = (ztest_random(8) == 0);
+	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	itx->itx_sync = B_FALSE;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	itx->itx_sync = B_FALSE;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * ZIL replay ops
+ */
+static int
+ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	ztest_block_tag_t *bbt;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	int error = 0;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	} else {
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+	}
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0)
+		return (ENOSPC);
+
+	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = zap_create(os,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		} else {
+			error = zap_create_claim(os, lr->lr_foid,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		}
+	} else {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = dmu_object_alloc(os,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		} else {
+			error = dmu_object_claim(os, lr->lr_foid,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    lr->lrz_bonuslen, tx);
+		}
+	}
+
+	if (error) {
+		ASSERT3U(error, ==, EEXIST);
+		ASSERT(zd->zd_zilog->zl_replay);
+		dmu_tx_commit(tx);
+		return (error);
+	}
+
+	ASSERT(lr->lr_foid != 0);
+
+	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+	bbt = ztest_bt_bonus(db);
+	dmu_buf_will_dirty(db, tx);
+	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+	dmu_buf_rele(db, FTAG);
+
+	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+	    &lr->lr_foid, tx));
+
+	(void) ztest_log_create(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+static int
+ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	uint64_t object, txg;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
+	VERIFY3U(0, ==,
+	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+	ASSERT(object != 0);
+
+	ztest_object_lock(zd, object, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_object_unlock(zd, object);
+		return (ENOSPC);
+	}
+
+	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+		VERIFY3U(0, ==, zap_destroy(os, object, tx));
+	} else {
+		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+	}
+
+	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+	(void) ztest_log_remove(zd, tx, lr, object);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, object);
+
+	return (0);
+}
+
+static int
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	void *data = lr + 1;			/* data follows lr */
+	uint64_t offset, length;
+	ztest_block_tag_t *bt = data;
+	ztest_block_tag_t *bbt;
+	uint64_t gen, txg, lrtxg, crtxg;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	arc_buf_t *abuf = NULL;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
+
+	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+		byteswap_uint64_array(bt, sizeof (*bt));
+
+	if (bt->bt_magic != BT_MAGIC)
+		bt = NULL;
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	dmu_object_info_from_db(db, &doi);
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	gen = bbt->bt_gen;
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+	    P2PHASE(offset, length) == 0)
+		abuf = dmu_request_arcbuf(db, length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		if (abuf != NULL)
+			dmu_return_arcbuf(abuf);
+		dmu_buf_rele(db, FTAG);
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	if (bt != NULL) {
+		/*
+		 * Usually, verify the old data before writing new data --
+		 * but not always, because we also want to verify correct
+		 * behavior when the data was not recently read into cache.
+		 */
+		ASSERT(offset % doi.doi_data_block_size == 0);
+		if (ztest_random(4) != 0) {
+			int prefetch = ztest_random(2) ?
+			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+			ztest_block_tag_t rbt;
+
+			VERIFY(dmu_read(os, lr->lr_foid, offset,
+			    sizeof (rbt), &rbt, prefetch) == 0);
+			if (rbt.bt_magic == BT_MAGIC) {
+				ztest_bt_verify(&rbt, os, lr->lr_foid,
+				    offset, gen, txg, crtxg);
+			}
+		}
+
+		/*
+		 * Writes can appear to be newer than the bonus buffer because
+		 * the ztest_get_data() callback does a dmu_read() of the
+		 * open-context data, which may be different than the data
+		 * as it was when the write was generated.
+		 */
+		if (zd->zd_zilog->zl_replay) {
+			ztest_bt_verify(bt, os, lr->lr_foid, offset,
+			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+			    bt->bt_crtxg);
+		}
+
+		/*
+		 * Set the bt's gen/txg to the bonus buffer's gen/txg
+		 * so that all of the usual ASSERTs will work.
+		 */
+		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+	}
+
+	if (abuf == NULL) {
+		dmu_write(os, lr->lr_foid, offset, length, data, tx);
+	} else {
+		bcopy(data, abuf->b_data, length);
+		dmu_assign_arcbuf(db, offset, abuf, tx);
+	}
+
+	(void) ztest_log_write(zd, tx, lr);
+
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+	    RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+	    lr->lr_length, tx) == 0);
+
+	(void) ztest_log_truncate(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	ztest_block_tag_t *bbt;
+	uint64_t txg, lrtxg, crtxg;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	if (zd->zd_zilog->zl_replay) {
+		ASSERT(lr->lr_size != 0);
+		ASSERT(lr->lr_mode != 0);
+		ASSERT(lrtxg != 0);
+	} else {
+		/*
+		 * Randomly change the size and increment the generation.
+		 */
+		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+		    sizeof (*bbt);
+		lr->lr_mode = bbt->bt_gen + 1;
+		ASSERT(lrtxg == 0);
+	}
+
+	/*
+	 * Verify that the current bonus buffer is not newer than our txg.
+	 */
+	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+	    MAX(txg, lrtxg), crtxg);
+
+	dmu_buf_will_dirty(db, tx);
+
+	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+	ASSERT3U(lr->lr_size, <=, db->db_size);
+	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
+	bbt = ztest_bt_bonus(db);
+
+	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+	dmu_buf_rele(db, FTAG);
+
+	(void) ztest_log_setattr(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+zil_replay_func_t ztest_replay_vector[TX_MAX_TYPE] = {
+	NULL,				/* 0 no such transaction type */
+	(zil_replay_func_t)ztest_replay_create,		/* TX_CREATE */
+	NULL,						/* TX_MKDIR */
+	NULL,						/* TX_MKXATTR */
+	NULL,						/* TX_SYMLINK */
+	(zil_replay_func_t)ztest_replay_remove,		/* TX_REMOVE */
+	NULL,						/* TX_RMDIR */
+	NULL,						/* TX_LINK */
+	NULL,						/* TX_RENAME */
+	(zil_replay_func_t)ztest_replay_write,		/* TX_WRITE */
+	(zil_replay_func_t)ztest_replay_truncate,	/* TX_TRUNCATE */
+	(zil_replay_func_t)ztest_replay_setattr,	/* TX_SETATTR */
+	NULL,						/* TX_ACL */
+	NULL,						/* TX_CREATE_ACL */
+	NULL,						/* TX_CREATE_ATTR */
+	NULL,						/* TX_CREATE_ACL_ATTR */
+	NULL,						/* TX_MKDIR_ACL */
+	NULL,						/* TX_MKDIR_ATTR */
+	NULL,						/* TX_MKDIR_ACL_ATTR */
+	NULL,						/* TX_WRITE2 */
+};
+
+/*
+ * ZIL get_data callbacks
+ */
+
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+	ztest_ds_t *zd = zgd->zgd_private;
+	uint64_t object = zgd->zgd_rl->rl_object;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	ztest_range_unlock(zgd->zgd_rl);
+	ztest_object_unlock(zd, object);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+	umem_free(zgd, sizeof (*zgd));
+}
+
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+	ztest_ds_t *zd = arg;
+	objset_t *os = zd->zd_os;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	blkptr_t *bp = &lr->lr_blkptr;
+	uint64_t txg = lr->lr_common.lrc_txg;
+	uint64_t crtxg;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ztest_object_lock(zd, object, RL_READER);
+	error = dmu_bonus_hold(os, object, FTAG, &db);
+	if (error) {
+		ztest_object_unlock(zd, object);
+		return (error);
+	}
+
+	crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+	if (crtxg == 0 || crtxg > txg) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, object);
+		return (ENOENT);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	dmu_buf_rele(db, FTAG);
+	db = NULL;
+
+	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+	zgd->zgd_zilog = zd->zd_zilog;
+	zgd->zgd_private = zd;
+
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+		ASSERT(error == 0);
+	} else {
+		size = doi.doi_data_block_size;
+		if (ISP2(size)) {
+			offset = P2ALIGN(offset, size);
+		} else {
+			ASSERT(offset < size);
+			offset = 0;
+		}
+
+		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+		    RL_READER);
+
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    ztest_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	ztest_get_done(zgd, error);
+
+	return (error);
+}
+
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+	char *lr;
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+	if (name)
+		bcopy(name, lr + lrsize, namesize);
+
+	return (lr);
+}
+
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+	size_t namesize = name ? strlen(name) + 1 : 0;
+
+	umem_free(lr, lrsize + namesize);
+}
+
+/*
+ * Lookup a bunch of objects.  Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+	int i;
+
+	ASSERT(mutex_held(&zd->zd_dirobj_lock));
+
+	for (i = 0; i < count; i++, od++) {
+		od->od_object = 0;
+		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+		    sizeof (uint64_t), 1, &od->od_object);
+		if (error) {
+			ASSERT(error == ENOENT);
+			ASSERT(od->od_object == 0);
+			missing++;
+		} else {
+			dmu_buf_t *db;
+			ztest_block_tag_t *bbt;
+			dmu_object_info_t doi;
+
+			ASSERT(od->od_object != 0);
+			ASSERT(missing == 0);	/* there should be no gaps */
+
+			ztest_object_lock(zd, od->od_object, RL_READER);
+			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+			    od->od_object, FTAG, &db));
+			dmu_object_info_from_db(db, &doi);
+			bbt = ztest_bt_bonus(db);
+			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+			od->od_type = doi.doi_type;
+			od->od_blocksize = doi.doi_data_block_size;
+			od->od_gen = bbt->bt_gen;
+			dmu_buf_rele(db, FTAG);
+			ztest_object_unlock(zd, od->od_object);
+		}
+	}
+
+	return (missing);
+}
+
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int i;
+
+	ASSERT(mutex_held(&zd->zd_dirobj_lock));
+
+	for (i = 0; i < count; i++, od++) {
+		if (missing) {
+			od->od_object = 0;
+			missing++;
+			continue;
+		}
+
+		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
+		lr->lrz_type = od->od_crtype;
+		lr->lrz_blocksize = od->od_crblocksize;
+		lr->lrz_ibshift = ztest_random_ibshift();
+		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+		lr->lrz_bonuslen = dmu_bonus_max();
+		lr->lr_gen = od->od_crgen;
+		lr->lr_crtime[0] = time(NULL);
+
+		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+			ASSERT(missing == 0);
+			od->od_object = 0;
+			missing++;
+		} else {
+			od->od_object = lr->lr_foid;
+			od->od_type = od->od_crtype;
+			od->od_blocksize = od->od_crblocksize;
+			od->od_gen = od->od_crgen;
+			ASSERT(od->od_object != 0);
+		}
+
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+	int i;
+
+	ASSERT(mutex_held(&zd->zd_dirobj_lock));
+
+	od += count - 1;
+
+	for (i = count - 1; i >= 0; i--, od--) {
+		if (missing) {
+			missing++;
+			continue;
+		}
+
+		/*
+		 * No object was found.
+		 */
+		if (od->od_object == 0)
+			continue;
+
+		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+
+		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+			ASSERT3U(error, ==, ENOSPC);
+			missing++;
+		} else {
+			od->od_object = 0;
+		}
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+    void *data)
+{
+	lr_write_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+	lr->lr_blkoff = 0;
+	BP_ZERO(&lr->lr_blkptr);
+
+	bcopy(data, lr + 1, size);
+
+	error = ztest_replay_write(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+	return (error);
+}
+
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	lr_truncate_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+
+	error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+	lr_setattr_t *lr;
+	int error;
+
+	lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+	lr->lr_foid = object;
+	lr->lr_size = 0;
+	lr->lr_mode = 0;
+
+	error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	ztest_object_lock(zd, object, RL_READER);
+	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, object, offset, size);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+	if (txg != 0) {
+		dmu_prealloc(os, object, offset, size, tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dmu_objset_pool(os), txg);
+	} else {
+		(void) dmu_free_long_range(os, object, offset, size);
+	}
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, object);
+}
+
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+	int err;
+	ztest_block_tag_t wbt;
+	dmu_object_info_t doi;
+	enum ztest_io_type io_type;
+	uint64_t blocksize;
+	void *data;
+
+	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+	blocksize = doi.doi_data_block_size;
+	data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+	/*
+	 * Pick an i/o type at random, biased toward writing block tags.
+	 */
+	io_type = ztest_random(ZTEST_IO_TYPES);
+	if (ztest_random(2) == 0)
+		io_type = ZTEST_IO_WRITE_TAG;
+
+	(void) rw_rdlock(&zd->zd_zilog_lock);
+
+	switch (io_type) {
+
+	case ZTEST_IO_WRITE_TAG:
+		ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+		break;
+
+	case ZTEST_IO_WRITE_PATTERN:
+		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
+		if (ztest_random(2) == 0) {
+			/*
+			 * Induce fletcher2 collisions to ensure that
+			 * zio_ddt_collision() detects and resolves them
+			 * when using fletcher2-verify for deduplication.
+			 */
+			((uint64_t *)data)[0] ^= 1ULL << 63;
+			((uint64_t *)data)[4] ^= 1ULL << 63;
+		}
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_WRITE_ZEROES:
+		bzero(data, blocksize);
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_TRUNCATE:
+		(void) ztest_truncate(zd, object, offset, blocksize);
+		break;
+
+	case ZTEST_IO_SETATTR:
+		(void) ztest_setattr(zd, object);
+		break;
+	default:
+		break;
+
+	case ZTEST_IO_REWRITE:
+		(void) rw_rdlock(&ztest_name_lock);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_COMPRESSION,
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		(void) rw_unlock(&ztest_name_lock);
+
+		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
+		    DMU_READ_NO_PREFETCH));
+
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+	}
+
+	(void) rw_unlock(&zd->zd_zilog_lock);
+
+	umem_free(data, blocksize);
+}
+
+/*
+ * Initialize an object description template.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+{
+	od->od_dir = ZTEST_DIROBJ;
+	od->od_object = 0;
+
+	od->od_crtype = type;
+	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+	od->od_crgen = gen;
+
+	od->od_type = DMU_OT_NONE;
+	od->od_blocksize = 0;
+	od->od_gen = 0;
+
+	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+	    tag, (longlong_t)id, (u_longlong_t)index);
+}
+
+/*
+ * Lookup or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones.  Otherwise,
+ * use the existing objects.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+	int count = size / sizeof (*od);
+	int rv = 0;
+
+	mutex_enter(&zd->zd_dirobj_lock);
+	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+	    (ztest_remove(zd, od, count) != 0 ||
+	    ztest_create(zd, od, count) != 0))
+		rv = -1;
+	zd->zd_od = od;
+	mutex_exit(&zd->zd_dirobj_lock);
+
+	return (rv);
+}
+
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+	zilog_t *zilog = zd->zd_zilog;
+
+	(void) rw_rdlock(&zd->zd_zilog_lock);
+
+	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
+
+	/*
+	 * Remember the committed values in zd, which is in parent/child
+	 * shared memory.  If we die, the next iteration of ztest_run()
+	 * will verify that the log really does contain this record.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zd->zd_shared != NULL);
+	ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
+	zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
+	mutex_exit(&zilog->zl_lock);
+
+	(void) rw_unlock(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation.  We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+
+	/*
+	 * We grab the zd_dirobj_lock to ensure that no other thread is
+	 * updating the zil (i.e. adding in-memory log records) and the
+	 * zd_zilog_lock to block any I/O.
+	 */
+	mutex_enter(&zd->zd_dirobj_lock);
+	(void) rw_wrlock(&zd->zd_zilog_lock);
+
+	/* zfs_sb_teardown() */
+	zil_close(zd->zd_zilog);
+
+	/* zfsvfs_setup() */
+	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+	zil_replay(os, zd, ztest_replay_vector);
+
+	(void) rw_unlock(&zd->zd_zilog_lock);
+	mutex_exit(&zd->zd_dirobj_lock);
+}
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+/* ARGSUSED */
+void
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_opts_t *zo = &ztest_opts;
+	spa_t *spa;
+	nvlist_t *nvroot;
+
+	/*
+	 * Attempt to create using a bad file.
+	 */
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_file", nvroot, NULL, NULL));
+	nvlist_free(nvroot);
+
+	/*
+	 * Attempt to create using a bad mirror.
+	 */
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
+	nvlist_free(nvroot);
+
+	/*
+	 * Attempt to create an existing pool.  It shouldn't matter
+	 * what's in the nvroot; we should fail with EEXIST.
+	 */
+	(void) rw_rdlock(&ztest_name_lock);
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
+	VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
+	nvlist_free(nvroot);
+	VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
+	VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
+	spa_close(spa, FTAG);
+
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa;
+	uint64_t initial_version = SPA_VERSION_INITIAL;
+	uint64_t version, newversion;
+	nvlist_t *nvroot, *props;
+	char *name;
+
+	mutex_enter(&ztest_vdev_lock);
+	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(name);
+
+	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
+	    0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+
+	/*
+	 * If we're configuring a RAIDZ device then make sure that the
+	 * the initial version is capable of supporting that feature.
+	 */
+	switch (ztest_opts.zo_raidz_parity) {
+	case 0:
+	case 1:
+		initial_version = SPA_VERSION_INITIAL;
+		break;
+	case 2:
+		initial_version = SPA_VERSION_RAIDZ2;
+		break;
+	case 3:
+		initial_version = SPA_VERSION_RAIDZ3;
+		break;
+	}
+
+	/*
+	 * Create a pool with a spa version that can be upgraded. Pick
+	 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
+	 */
+	do {
+		version = ztest_random_spa_version(initial_version);
+	} while (version > SPA_VERSION_BEFORE_FEATURES);
+
+	props = fnvlist_alloc();
+	fnvlist_add_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
+	VERIFY3S(spa_create(name, nvroot, props, NULL), ==, 0);
+	fnvlist_free(nvroot);
+	fnvlist_free(props);
+
+	VERIFY3S(spa_open(name, &spa, FTAG), ==, 0);
+	VERIFY3U(spa_version(spa), ==, version);
+	newversion = ztest_random_spa_version(version + 1);
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("upgrading spa version from %llu to %llu\n",
+		    (u_longlong_t)version, (u_longlong_t)newversion);
+	}
+
+	spa_upgrade(spa, newversion);
+	VERIFY3U(spa_version(spa), >, version);
+	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
+	spa_close(spa, FTAG);
+
+	strfree(name);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	vdev_t *mvd;
+	int c;
+
+	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+		return (vd);
+
+	for (c = 0; c < vd->vdev_children; c++)
+		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+		    NULL)
+			return (mvd);
+
+	return (NULL);
+}
+
+/*
+ * Find the first available hole which can be used as a top-level.
+ */
+int
+find_vdev_hole(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	int c;
+
+	ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
+
+	for (c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *cvd = rvd->vdev_child[c];
+
+		if (cvd->vdev_ishole)
+			break;
+	}
+	return (c);
+}
+
+/*
+ * Verify that vdev_add() works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	uint64_t leaves;
+	uint64_t guid;
+	nvlist_t *nvroot;
+	int error;
+
+	mutex_enter(&ztest_vdev_lock);
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
+
+	/*
+	 * If we have slogs then remove them 1/4 of the time.
+	 */
+	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+		/*
+		 * Grab the guid from the head of the log class rotor.
+		 */
+		guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
+
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+
+		/*
+		 * We have to grab the zs_name_lock as writer to
+		 * prevent a race between removing a slog (dmu_objset_find)
+		 * and destroying a dataset. Removing the slog will
+		 * grab a reference on the dataset which may cause
+		 * dsl_destroy_head() to fail with EBUSY thus
+		 * leaving the dataset in an inconsistent state.
+		 */
+		rw_wrlock(&ztest_name_lock);
+		error = spa_vdev_remove(spa, guid, B_FALSE);
+		rw_unlock(&ztest_name_lock);
+
+		if (error && error != EEXIST)
+			fatal(0, "spa_vdev_remove() = %d", error);
+	} else {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+
+		/*
+		 * Make 1/4 of the devices be log devices.
+		 */
+		nvroot = make_vdev_root(NULL, NULL, NULL,
+		    ztest_opts.zo_vdev_size, 0,
+		    ztest_random(4) == 0, ztest_opts.zo_raidz,
+		    zs->zs_mirrors, 1);
+
+		error = spa_vdev_add(spa, nvroot);
+		nvlist_free(nvroot);
+
+		if (error == ENOSPC)
+			ztest_record_enospc("spa_vdev_add");
+		else if (error != 0)
+			fatal(0, "spa_vdev_add() = %d", error);
+	}
+
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	spa_aux_vdev_t *sav;
+	char *aux;
+	char *path;
+	uint64_t guid = 0;
+	int error;
+
+	path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	if (ztest_random(2) == 0) {
+		sav = &spa->spa_spares;
+		aux = ZPOOL_CONFIG_SPARES;
+	} else {
+		sav = &spa->spa_l2cache;
+		aux = ZPOOL_CONFIG_L2CACHE;
+	}
+
+	mutex_enter(&ztest_vdev_lock);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	if (sav->sav_count != 0 && ztest_random(4) == 0) {
+		/*
+		 * Pick a random device to remove.
+		 */
+		guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
+	} else {
+		/*
+		 * Find an unused device we can add.
+		 */
+		zs->zs_vdev_aux = 0;
+		for (;;) {
+			int c;
+			(void) snprintf(path, MAXPATHLEN, ztest_aux_template,
+			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
+			    zs->zs_vdev_aux);
+			for (c = 0; c < sav->sav_count; c++)
+				if (strcmp(sav->sav_vdevs[c]->vdev_path,
+				    path) == 0)
+					break;
+			if (c == sav->sav_count &&
+			    vdev_lookup_by_path(rvd, path) == NULL)
+				break;
+			zs->zs_vdev_aux++;
+		}
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	if (guid == 0) {
+		/*
+		 * Add a new device.
+		 */
+		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
+		    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
+		error = spa_vdev_add(spa, nvroot);
+		if (error != 0)
+			fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
+		nvlist_free(nvroot);
+	} else {
+		/*
+		 * Remove an existing device.  Sometimes, dirty its
+		 * vdev state first to make sure we handle removal
+		 * of devices that have pending state changes.
+		 */
+		if (ztest_random(2) == 0)
+			(void) vdev_online(spa, guid, 0, NULL);
+
+		error = spa_vdev_remove(spa, guid, B_FALSE);
+		if (error != 0 && error != EBUSY)
+			fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+	}
+
+	mutex_exit(&ztest_vdev_lock);
+
+	umem_free(path, MAXPATHLEN);
+}
+
+/*
+ * split a pool if it has mirror tlvdevs
+ */
+/* ARGSUSED */
+void
+ztest_split_pool(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	nvlist_t *tree, **child, *config, *split, **schild;
+	uint_t c, children, schildren = 0, lastlogid = 0;
+	int error = 0;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	/* ensure we have a useable config; mirrors of raidz aren't supported */
+	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/* clean up the old pool, if any */
+	(void) spa_destroy("splitp");
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* generate a config from the existing config */
+	mutex_enter(&spa->spa_props_lock);
+	VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &tree) == 0);
+	mutex_exit(&spa->spa_props_lock);
+
+	VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0);
+
+	schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
+	for (c = 0; c < children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		nvlist_t **mchild;
+		uint_t mchildren;
+
+		if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
+			VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
+			    0) == 0);
+			VERIFY(nvlist_add_string(schild[schildren],
+			    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
+			VERIFY(nvlist_add_uint64(schild[schildren],
+			    ZPOOL_CONFIG_IS_HOLE, 1) == 0);
+			if (lastlogid == 0)
+				lastlogid = schildren;
+			++schildren;
+			continue;
+		}
+		lastlogid = 0;
+		VERIFY(nvlist_lookup_nvlist_array(child[c],
+		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+		VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
+	}
+
+	/* OK, create a config that can be used to split */
+	VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
+	    lastlogid != 0 ? lastlogid : schildren) == 0);
+
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
+
+	for (c = 0; c < schildren; c++)
+		nvlist_free(schild[c]);
+	free(schild);
+	nvlist_free(split);
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	(void) rw_wrlock(&ztest_name_lock);
+	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
+	(void) rw_unlock(&ztest_name_lock);
+
+	nvlist_free(config);
+
+	if (error == 0) {
+		(void) printf("successful split - results:\n");
+		mutex_enter(&spa_namespace_lock);
+		show_pool_stats(spa);
+		show_pool_stats(spa_lookup("splitp"));
+		mutex_exit(&spa_namespace_lock);
+		++zs->zs_splits;
+		--zs->zs_mirrors;
+	}
+	mutex_exit(&ztest_vdev_lock);
+
+}
+
+/*
+ * Verify that we can attach and detach devices.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	spa_aux_vdev_t *sav = &spa->spa_spares;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *oldvd, *newvd, *pvd;
+	nvlist_t *root;
+	uint64_t leaves;
+	uint64_t leaf, top;
+	uint64_t ashift = ztest_get_ashift();
+	uint64_t oldguid, pguid;
+	uint64_t oldsize, newsize;
+	char *oldpath, *newpath;
+	int replacing;
+	int oldvd_has_siblings = B_FALSE;
+	int newvd_is_spare = B_FALSE;
+	int oldvd_is_log;
+	int error, expected_error;
+
+	oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	mutex_enter(&ztest_vdev_lock);
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/*
+	 * Decide whether to do an attach or a replace.
+	 */
+	replacing = ztest_random(2);
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	/*
+	 * Pick a random leaf within it.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Locate this vdev.
+	 */
+	oldvd = rvd->vdev_child[top];
+	if (zs->zs_mirrors >= 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
+		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
+	}
+	if (ztest_opts.zo_raidz > 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
+		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
+	}
+
+	/*
+	 * If we're already doing an attach or replace, oldvd may be a
+	 * mirror vdev -- in which case, pick a random child.
+	 */
+	while (oldvd->vdev_children != 0) {
+		oldvd_has_siblings = B_TRUE;
+		ASSERT(oldvd->vdev_children >= 2);
+		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
+	}
+
+	oldguid = oldvd->vdev_guid;
+	oldsize = vdev_get_min_asize(oldvd);
+	oldvd_is_log = oldvd->vdev_top->vdev_islog;
+	(void) strcpy(oldpath, oldvd->vdev_path);
+	pvd = oldvd->vdev_parent;
+	pguid = pvd->vdev_guid;
+
+	/*
+	 * If oldvd has siblings, then half of the time, detach it.
+	 */
+	if (oldvd_has_siblings && ztest_random(2) == 0) {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+		if (error != 0 && error != ENODEV && error != EBUSY &&
+		    error != ENOTSUP)
+			fatal(0, "detach (%s) returned %d", oldpath, error);
+		goto out;
+	}
+
+	/*
+	 * For the new vdev, choose with equal probability between the two
+	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
+	 */
+	if (sav->sav_count != 0 && ztest_random(3) == 0) {
+		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		newvd_is_spare = B_TRUE;
+		(void) strcpy(newpath, newvd->vdev_path);
+	} else {
+		(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
+		if (ztest_random(2) == 0)
+			newpath[strlen(newpath) - 1] = 'b';
+		newvd = vdev_lookup_by_path(rvd, newpath);
+	}
+
+	if (newvd) {
+		newsize = vdev_get_min_asize(newvd);
+	} else {
+		/*
+		 * Make newsize a little bigger or smaller than oldsize.
+		 * If it's smaller, the attach should fail.
+		 * If it's larger, and we're doing a replace,
+		 * we should get dynamic LUN growth when we're done.
+		 */
+		newsize = 10 * oldsize / (9 + ztest_random(3));
+	}
+
+	/*
+	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+	 * unless it's a replace; in that case any non-replacing parent is OK.
+	 *
+	 * If newvd is already part of the pool, it should fail with EBUSY.
+	 *
+	 * If newvd is too small, it should fail with EOVERFLOW.
+	 */
+	if (pvd->vdev_ops != &vdev_mirror_ops &&
+	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
+	    pvd->vdev_ops == &vdev_replacing_ops ||
+	    pvd->vdev_ops == &vdev_spare_ops))
+		expected_error = ENOTSUP;
+	else if (newvd_is_spare && (!replacing || oldvd_is_log))
+		expected_error = ENOTSUP;
+	else if (newvd == oldvd)
+		expected_error = replacing ? 0 : EBUSY;
+	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
+		expected_error = EBUSY;
+	else if (newsize < oldsize)
+		expected_error = EOVERFLOW;
+	else if (ashift > oldvd->vdev_top->vdev_ashift)
+		expected_error = EDOM;
+	else
+		expected_error = 0;
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	/*
+	 * Build the nvlist describing newpath.
+	 */
+	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
+	    ashift, 0, 0, 0, 1);
+
+	error = spa_vdev_attach(spa, oldguid, root, replacing);
+
+	nvlist_free(root);
+
+	/*
+	 * If our parent was the replacing vdev, but the replace completed,
+	 * then instead of failing with ENOTSUP we may either succeed,
+	 * fail with ENODEV, or fail with EOVERFLOW.
+	 */
+	if (expected_error == ENOTSUP &&
+	    (error == 0 || error == ENODEV || error == EOVERFLOW))
+		expected_error = error;
+
+	/*
+	 * If someone grew the LUN, the replacement may be too small.
+	 */
+	if (error == EOVERFLOW || error == EBUSY)
+		expected_error = error;
+
+	/* XXX workaround 6690467 */
+	if (error != expected_error && expected_error != EBUSY) {
+		fatal(0, "attach (%s %llu, %s %llu, %d) "
+		    "returned %d, expected %d",
+		    oldpath, oldsize, newpath,
+		    newsize, replacing, error, expected_error);
+	}
+out:
+	mutex_exit(&ztest_vdev_lock);
+
+	umem_free(oldpath, MAXPATHLEN);
+	umem_free(newpath, MAXPATHLEN);
+}
+
+/*
+ * Callback function which expands the physical size of the vdev.
+ */
+vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+	ASSERTV(spa_t *spa = vd->vdev_spa);
+	size_t *newsize = arg;
+	size_t fsize;
+	int fd;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+		return (vd);
+
+	fsize = lseek(fd, 0, SEEK_END);
+	VERIFY(ftruncate(fd, *newsize) == 0);
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("%s grew from %lu to %lu bytes\n",
+		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+	}
+	(void) close(fd);
+	return (NULL);
+}
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
+ */
+/* ARGSUSED */
+vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	uint64_t guid = vd->vdev_guid;
+	uint64_t generation = spa->spa_config_generation + 1;
+	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+	int error;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	/* Calling vdev_online will initialize the new metaslabs */
+	spa_config_exit(spa, SCL_STATE, spa);
+	error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/*
+	 * If vdev_online returned an error or the underlying vdev_open
+	 * failed then we abort the expand. The only way to know that
+	 * vdev_open fails is by checking the returned newstate.
+	 */
+	if (error || newstate != VDEV_STATE_HEALTHY) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Unable to expand vdev, state %llu, "
+			    "error %d\n", (u_longlong_t)newstate, error);
+		}
+		return (vd);
+	}
+	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+	/*
+	 * Since we dropped the lock we need to ensure that we're
+	 * still talking to the original vdev. It's possible this
+	 * vdev may have been detached/replaced while we were
+	 * trying to online it.
+	 */
+	if (generation != spa->spa_config_generation) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("vdev configuration has changed, "
+			    "guid %llu, state %llu, expected gen %llu, "
+			    "got gen %llu\n",
+			    (u_longlong_t)guid,
+			    (u_longlong_t)tvd->vdev_state,
+			    (u_longlong_t)generation,
+			    (u_longlong_t)spa->spa_config_generation);
+		}
+		return (vd);
+	}
+	return (NULL);
+}
+
+/*
+ * Traverse the vdev tree calling the supplied function.
+ * We continue to walk the tree until we either have walked all
+ * children or we receive a non-NULL return from the callback.
+ * If a NULL callback is passed, then we just return back the first
+ * leaf vdev we encounter.
+ */
+vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+	uint_t c;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		if (func == NULL)
+			return (vd);
+		else
+			return (func(vd, arg));
+	}
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+		if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
+			return (cvd);
+	}
+	return (NULL);
+}
+
+/*
+ * Verify that dynamic LUN growth works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	vdev_t *vd, *tvd;
+	metaslab_class_t *mc;
+	metaslab_group_t *mg;
+	size_t psize, newsize;
+	uint64_t top;
+	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+	mutex_enter(&ztest_vdev_lock);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	mg = tvd->vdev_mg;
+	mc = mg->mg_class;
+	old_ms_count = tvd->vdev_ms_count;
+	old_class_space = metaslab_class_get_space(mc);
+
+	/*
+	 * Determine the size of the first leaf vdev associated with
+	 * our top-level device.
+	 */
+	vd = vdev_walk_tree(tvd, NULL, NULL);
+	ASSERT3P(vd, !=, NULL);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	psize = vd->vdev_psize;
+
+	/*
+	 * We only try to expand the vdev if it's healthy, less than 4x its
+	 * original size, and it has a valid psize.
+	 */
+	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+	ASSERT(psize > 0);
+	newsize = psize + psize / 8;
+	ASSERT3U(newsize, >, psize);
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("Expanding LUN %s from %lu to %lu\n",
+		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+	}
+
+	/*
+	 * Growing the vdev is a two step process:
+	 *	1). expand the physical size (i.e. relabel)
+	 *	2). online the vdev to create the new metaslabs
+	 */
+	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Could not expand LUN because "
+			    "the vdev configuration changed.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+
+	/*
+	 * Expanding the LUN will update the config asynchronously,
+	 * thus we must wait for the async thread to complete any
+	 * pending tasks before proceeding.
+	 */
+	for (;;) {
+		boolean_t done;
+		mutex_enter(&spa->spa_async_lock);
+		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+		mutex_exit(&spa->spa_async_lock);
+		if (done)
+			break;
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		(void) poll(NULL, 0, 100);
+	}
+
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	new_ms_count = tvd->vdev_ms_count;
+	new_class_space = metaslab_class_get_space(mc);
+
+	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Could not verify LUN expansion due to "
+			    "intervening vdev offline or remove.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/*
+	 * Make sure we were able to grow the vdev.
+	 */
+	if (new_ms_count <= old_ms_count)
+		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+		    old_ms_count, new_ms_count);
+
+	/*
+	 * Make sure we were able to grow the pool.
+	 */
+	if (new_class_space <= old_class_space)
+		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+		    old_class_space, new_class_space);
+
+	if (ztest_opts.zo_verbose >= 5) {
+		char oldnumbuf[6], newnumbuf[6];
+
+		nicenum(old_class_space, oldnumbuf);
+		nicenum(new_class_space, newnumbuf);
+		(void) printf("%s grew from %s to %s\n",
+		    spa->spa_name, oldnumbuf, newnumbuf);
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	/*
+	 * Create the objects common to all ztest datasets.
+	 */
+	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+static int
+ztest_dataset_create(char *dsname)
+{
+	uint64_t zilset = ztest_random(100);
+	int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
+	    ztest_objset_create_cb, NULL);
+
+	if (err || zilset < 80)
+		return (err);
+
+	if (ztest_opts.zo_verbose >= 5)
+		(void) printf("Setting dataset %s to sync always\n", dsname);
+	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+	    ZFS_SYNC_ALWAYS, B_FALSE));
+}
+
+/* ARGSUSED */
+static int
+ztest_objset_destroy_cb(const char *name, void *arg)
+{
+	objset_t *os;
+	dmu_object_info_t doi;
+	int error;
+
+	/*
+	 * Verify that the dataset contains a directory object.
+	 */
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
+	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+	if (error != ENOENT) {
+		/* We could have crashed in the middle of destroying it */
+		ASSERT0(error);
+		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
+	}
+	dmu_objset_disown(os, FTAG);
+
+	/*
+	 * Destroy the dataset.
+	 */
+	if (strchr(name, '@') != NULL) {
+		VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
+	} else {
+		VERIFY0(dsl_destroy_head(name));
+	}
+	return (0);
+}
+
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
+{
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
+
+	error = dmu_objset_snapshot_one(osname, snapname);
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (B_FALSE);
+	}
+	if (error != 0 && error != EEXIST) {
+		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+		    snapname, error);
+	}
+	return (B_TRUE);
+}
+
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+	char snapname[MAXNAMELEN];
+	int error;
+
+	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	error = dsl_destroy_snapshot(snapname, B_FALSE);
+	if (error != 0 && error != ENOENT)
+		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+	return (B_TRUE);
+}
+
+/* ARGSUSED */
+void
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_ds_t *zdtmp;
+	int iters;
+	int error;
+	objset_t *os, *os2;
+	char *name;
+	zilog_t *zilog;
+	int i;
+
+	zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
+	name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+	    ztest_opts.zo_pool, (u_longlong_t)id);
+
+	/*
+	 * If this dataset exists from a previous run, process its replay log
+	 * half of the time.  If we don't replay it, then dsl_destroy_head()
+	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
+	 */
+	if (ztest_random(2) == 0 &&
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
+		ztest_zd_init(zdtmp, NULL, os);
+		zil_replay(os, zdtmp, ztest_replay_vector);
+		ztest_zd_fini(zdtmp);
+		dmu_objset_disown(os, FTAG);
+	}
+
+	/*
+	 * There may be an old instance of the dataset we're about to
+	 * create lying around from a previous run.  If so, destroy it
+	 * and all of its snapshots.
+	 */
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+	/*
+	 * Verify that the destroyed dataset is no longer in the namespace.
+	 */
+	VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+	    FTAG, &os));
+
+	/*
+	 * Verify that we can create a new dataset.
+	 */
+	error = ztest_dataset_create(name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", name, error);
+	}
+
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+	ztest_zd_init(zdtmp, NULL, os);
+
+	/*
+	 * Open the intent log for it.
+	 */
+	zilog = zil_open(os, ztest_get_data);
+
+	/*
+	 * Put some objects in there, do a little I/O to them,
+	 * and randomly take a couple of snapshots along the way.
+	 */
+	iters = ztest_random(5);
+	for (i = 0; i < iters; i++) {
+		ztest_dmu_object_alloc_free(zdtmp, id);
+		if (ztest_random(iters) == 0)
+			(void) ztest_snapshot_create(name, i);
+	}
+
+	/*
+	 * Verify that we cannot create an existing dataset.
+	 */
+	VERIFY3U(EEXIST, ==,
+	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
+
+	/*
+	 * Verify that we can hold an objset that is also owned.
+	 */
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+	dmu_objset_rele(os2, FTAG);
+
+	/*
+	 * Verify that we cannot own an objset that is already owned.
+	 */
+	VERIFY3U(EBUSY, ==,
+	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
+
+	zil_close(zilog);
+	dmu_objset_disown(os, FTAG);
+	ztest_zd_fini(zdtmp);
+out:
+	(void) rw_unlock(&ztest_name_lock);
+
+	umem_free(name, MAXNAMELEN);
+	umem_free(zdtmp, sizeof (ztest_ds_t));
+}
+
+/*
+ * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	(void) rw_rdlock(&ztest_name_lock);
+	(void) ztest_snapshot_destroy(zd->zd_name, id);
+	(void) ztest_snapshot_create(zd->zd_name, id);
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+/*
+ * Cleanup non-standard snapshots and clones.
+ */
+void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
+{
+	char *snap1name;
+	char *clone1name;
+	char *snap2name;
+	char *clone2name;
+	char *snap3name;
+	int error;
+
+	snap1name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	clone1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	snap2name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	clone2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	snap3name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
+	    clone1name, (u_longlong_t)id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
+	    clone1name, (u_longlong_t)id);
+
+	error = dsl_destroy_head(clone2name);
+	if (error && error != ENOENT)
+		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
+	error = dsl_destroy_snapshot(snap3name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
+	error = dsl_destroy_snapshot(snap2name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
+	error = dsl_destroy_head(clone1name);
+	if (error && error != ENOENT)
+		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
+	error = dsl_destroy_snapshot(snap1name, B_FALSE);
+	if (error && error != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
+
+	umem_free(snap1name, MAXNAMELEN);
+	umem_free(clone1name, MAXNAMELEN);
+	umem_free(snap2name, MAXNAMELEN);
+	umem_free(clone2name, MAXNAMELEN);
+	umem_free(snap3name, MAXNAMELEN);
+}
+
+/*
+ * Verify dsl_dataset_promote handles EBUSY
+ */
+void
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os;
+	char *snap1name;
+	char *clone1name;
+	char *snap2name;
+	char *clone2name;
+	char *snap3name;
+	char *osname = zd->zd_name;
+	int error;
+
+	snap1name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	clone1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	snap2name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	clone2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	snap3name  = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
+	    clone1name, (u_longlong_t)id);
+	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
+	    osname, (u_longlong_t)id);
+	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
+	    clone1name, (u_longlong_t)id);
+
+	error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
+	}
+
+	error = dmu_objset_clone(clone1name, snap1name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
+	}
+
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
+	}
+
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
+	}
+
+	error = dmu_objset_clone(clone2name, snap3name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
+	}
+
+	error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
+	if (error)
+		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
+	error = dsl_dataset_promote(clone2name, NULL);
+	if (error == ENOSPC) {
+		dmu_objset_disown(os, FTAG);
+		ztest_record_enospc(FTAG);
+		goto out;
+	}
+	if (error != EBUSY)
+		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+		    error);
+	dmu_objset_disown(os, FTAG);
+
+out:
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) rw_unlock(&ztest_name_lock);
+
+	umem_free(snap1name, MAXNAMELEN);
+	umem_free(clone1name, MAXNAMELEN);
+	umem_free(snap2name, MAXNAMELEN);
+	umem_free(clone2name, MAXNAMELEN);
+	umem_free(snap3name, MAXNAMELEN);
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	4
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+	int batchsize;
+	int size;
+	int b;
+
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+	batchsize = OD_ARRAY_SIZE;
+
+	for (b = 0; b < batchsize; b++)
+		ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
+
+	/*
+	 * Destroy the previous batch of objects, create a new batch,
+	 * and do some I/O on the new objects.
+	 */
+	if (ztest_object_init(zd, od, size, B_TRUE) != 0)
+		return;
+
+	while (ztest_random(4 * batchsize) != 0)
+		ztest_io(zd, od[ztest_random(batchsize)].od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+	umem_free(od, size);
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	2
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+void
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
+{
+	int size;
+	ztest_od_t *od;
+
+	objset_t *os = zd->zd_os;
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+	dmu_tx_t *tx;
+	int i, freeit, error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 40;
+	int free_percent = 5;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is arbitrary. It doesn't have to be a power of two,
+	 * and it doesn't have any relation to the object blocksize.
+	 * The only requirement is that it can hold at least two bufwads.
+	 *
+	 * Normally, we write the bufwad to each of these locations.
+	 * However, free_percent of the time we instead write zeroes to
+	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
+	 * bigobj to packobj, we can verify that the DMU is correctly
+	 * tracking which parts of an object are allocated and free,
+	 * and that the contents of the allocated blocks are correct.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+		umem_free(od, size);
+		return;
+	}
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	chunksize = od[0].od_gen;
+	ASSERT(chunksize == od[1].od_gen);
+
+	/*
+	 * Prefetch a random chunk of the big object.
+	 * Our aim here is to get some async reads in flight
+	 * for blocks that we may free below; the DMU should
+	 * handle this race correctly.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(2 * width - 1);
+	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+	/*
+	 * free_percent of the time, free a range of bigobj rather than
+	 * overwriting it.
+	 */
+	freeit = (ztest_random(100) < free_percent);
+
+	/*
+	 * Read the current contents of our objects.
+	 */
+	error = dmu_read(os, packobj, packoff, packsize, packbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT0(error);
+	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT0(error);
+
+	/*
+	 * Get a tx for the mods to both packobj and bigobj.
+	 */
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, packobj, packoff, packsize);
+
+	if (freeit)
+		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
+	else
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+	/* This accounts for setting the checksum/compression. */
+	dmu_tx_hold_bonus(tx, bigobj);
+
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0) {
+		umem_free(packbuf, packsize);
+		umem_free(bigbuf, bigsize);
+		umem_free(od, size);
+		return;
+	}
+
+	enum zio_checksum cksum;
+	do {
+		cksum = (enum zio_checksum)
+		    ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+	dmu_object_set_checksum(os, bigobj, cksum, tx);
+
+	enum zio_compress comp;
+	do {
+		comp = (enum zio_compress)
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+	dmu_object_set_compress(os, bigobj, comp, tx);
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		if (freeit) {
+			bzero(pack, sizeof (bufwad_t));
+		} else {
+			pack->bw_index = n + i;
+			pack->bw_txg = txg;
+			pack->bw_data = 1 + ztest_random(-2ULL);
+		}
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+
+	/*
+	 * We've verified all the old bufwads, and made new ones.
+	 * Now write them out.
+	 */
+	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+
+	if (freeit) {
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("freeing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
+	} else {
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
+	}
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Sanity check the stuff we just wrote.
+	 */
+	{
+		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+		VERIFY(0 == dmu_read(os, packobj, packoff,
+		    packsize, packcheck, DMU_READ_PREFETCH));
+		VERIFY(0 == dmu_read(os, bigobj, bigoff,
+		    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+		umem_free(packcheck, packsize);
+		umem_free(bigcheck, bigsize);
+	}
+
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(od, size);
+}
+
+void
+compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
+    uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
+{
+	uint64_t i;
+	bufwad_t *pack;
+	bufwad_t *bigH;
+	bufwad_t *bigT;
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		pack->bw_index = n + i;
+		pack->bw_txg = txg;
+		pack->bw_data = 1 + ztest_random(-2ULL);
+
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	2
+
+void
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	dmu_tx_t *tx;
+	uint64_t i;
+	int error;
+	int size;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t blocksize = ztest_random_blocksize();
+	uint64_t chunksize = blocksize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 9;
+	dmu_buf_t *bonus_db;
+	arc_buf_t **bigbuf_arcbufs;
+	dmu_object_info_t doi;
+
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is set equal to bigobj block size so that
+	 * dmu_assign_arcbuf() can be tested for object updates.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+
+	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+		umem_free(od, size);
+		return;
+	}
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	blocksize = od[0].od_blocksize;
+	chunksize = blocksize;
+	ASSERT(chunksize == od[1].od_gen);
+
+	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+	VERIFY(ISP2(doi.doi_data_block_size));
+	VERIFY(chunksize == doi.doi_data_block_size);
+	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
+
+	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
+
+	/*
+	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
+	 * Iteration 1 test zcopy to already referenced dbufs.
+	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
+	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
+	 * Iteration 4 test zcopy when dbuf is no longer dirty.
+	 * Iteration 5 test zcopy when it can't be done.
+	 * Iteration 6 one more zcopy write.
+	 */
+	for (i = 0; i < 7; i++) {
+		uint64_t j;
+		uint64_t off;
+
+		/*
+		 * In iteration 5 (i == 5) use arcbufs
+		 * that don't match bigobj blksz to test
+		 * dmu_assign_arcbuf() when it can't directly
+		 * assign an arcbuf to a dbuf.
+		 */
+		for (j = 0; j < s; j++) {
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				bigbuf_arcbufs[j] =
+				    dmu_request_arcbuf(bonus_db, chunksize);
+			} else {
+				bigbuf_arcbufs[2 * j] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+				bigbuf_arcbufs[2 * j + 1] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+			}
+		}
+
+		/*
+		 * Get a tx for the mods to both packobj and bigobj.
+		 */
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_write(tx, packobj, packoff, packsize);
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0) {
+			umem_free(packbuf, packsize);
+			umem_free(bigbuf, bigsize);
+			for (j = 0; j < s; j++) {
+				if (i != 5 ||
+				    chunksize < (SPA_MINBLOCKSIZE * 2)) {
+					dmu_return_arcbuf(bigbuf_arcbufs[j]);
+				} else {
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j]);
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j + 1]);
+				}
+			}
+			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+			umem_free(od, size);
+			dmu_buf_rele(bonus_db, FTAG);
+			return;
+		}
+
+		/*
+		 * 50% of the time don't read objects in the 1st iteration to
+		 * test dmu_assign_arcbuf() for the case when there're no
+		 * existing dbufs for the specified offsets.
+		 */
+		if (i != 0 || ztest_random(2) != 0) {
+			error = dmu_read(os, packobj, packoff,
+			    packsize, packbuf, DMU_READ_PREFETCH);
+			ASSERT0(error);
+			error = dmu_read(os, bigobj, bigoff, bigsize,
+			    bigbuf, DMU_READ_PREFETCH);
+			ASSERT0(error);
+		}
+		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
+		    n, chunksize, txg);
+
+		/*
+		 * We've verified all the old bufwads, and made new ones.
+		 * Now write them out.
+		 */
+		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
+			dmu_buf_t *dbt;
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[j]->b_data, chunksize);
+			} else {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[2 * j]->b_data,
+				    chunksize / 2);
+				bcopy((caddr_t)bigbuf + (off - bigoff) +
+				    chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1]->b_data,
+				    chunksize / 2);
+			}
+
+			if (i == 1) {
+				VERIFY(dmu_buf_hold(os, bigobj, off,
+				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
+			}
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[j], tx);
+			} else {
+				dmu_assign_arcbuf(bonus_db, off,
+				    bigbuf_arcbufs[2 * j], tx);
+				dmu_assign_arcbuf(bonus_db,
+				    off + chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1], tx);
+			}
+			if (i == 1) {
+				dmu_buf_rele(dbt, FTAG);
+			}
+		}
+		dmu_tx_commit(tx);
+
+		/*
+		 * Sanity check the stuff we just wrote.
+		 */
+		{
+			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+			VERIFY(0 == dmu_read(os, packobj, packoff,
+			    packsize, packcheck, DMU_READ_PREFETCH));
+			VERIFY(0 == dmu_read(os, bigobj, bigoff,
+			    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+			umem_free(packcheck, packsize);
+			umem_free(bigcheck, bigsize);
+		}
+		if (i == 2) {
+			txg_wait_open(dmu_objset_pool(os), 0);
+		} else if (i == 3) {
+			txg_wait_synced(dmu_objset_pool(os), 0);
+		}
+	}
+
+	dmu_buf_rele(bonus_db, FTAG);
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+	umem_free(od, size);
+}
+
+/* ARGSUSED */
+void
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+	/*
+	 * Have multiple threads write to large offsets in an object
+	 * to verify that parallel writes to an object -- even to the
+	 * same blocks within the object -- doesn't cause any trouble.
+	 */
+	ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
+		return;
+
+	while (ztest_random(10) != 0)
+		ztest_io(zd, od->od_object, offset);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+	uint64_t count = ztest_random(20) + 1;
+	uint64_t blocksize = ztest_random_blocksize();
+	void *data;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+	    !ztest_random(2)) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	ztest_prealloc(zd, od->od_object, offset, count * blocksize);
+
+	data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+	while (ztest_random(count) != 0) {
+		uint64_t randoff = offset + (ztest_random(count) * blocksize);
+		if (ztest_write(zd, od->od_object, randoff, blocksize,
+		    data) != 0)
+			break;
+		while (ztest_random(4) != 0)
+			ztest_io(zd, od->od_object, randoff);
+	}
+
+	umem_free(data, blocksize);
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */
+#define	ZTEST_ZAP_MIN_INTS	1
+#define	ZTEST_ZAP_MAX_INTS	4
+#define	ZTEST_ZAP_MAX_PROPS	1000
+
+void
+ztest_zap(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t object;
+	uint64_t txg, last_txg;
+	uint64_t value[ZTEST_ZAP_MAX_INTS];
+	uint64_t zl_ints, zl_intsize, prop;
+	int i, ints;
+	dmu_tx_t *tx;
+	char propname[100], txgname[100];
+	int error;
+	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+			!ztest_random(2)) != 0)
+		goto out;
+
+	object = od->od_object;
+
+	/*
+	 * Generate a known hash collision, and verify that
+	 * we can lookup and remove both entries.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+	for (i = 0; i < 2; i++) {
+		value[i] = i;
+		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+		    1, &value[i], tx));
+	}
+	for (i = 0; i < 2; i++) {
+		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+		    sizeof (uint64_t), 1, &value[i], tx));
+		VERIFY3U(0, ==,
+		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+	}
+	for (i = 0; i < 2; i++) {
+		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+	}
+	dmu_tx_commit(tx);
+
+	/*
+	 * Generate a buch of random entries.
+	 */
+	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
+
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+	bzero(value, sizeof (value));
+	last_txg = 0;
+
+	/*
+	 * If these zap entries already exist, validate their contents.
+	 */
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+	if (error == 0) {
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+
+		VERIFY(zap_lookup(os, object, txgname, zl_intsize,
+		    zl_ints, &last_txg) == 0);
+
+		VERIFY(zap_length(os, object, propname, &zl_intsize,
+		    &zl_ints) == 0);
+
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, ints);
+
+		VERIFY(zap_lookup(os, object, propname, zl_intsize,
+		    zl_ints, value) == 0);
+
+		for (i = 0; i < ints; i++) {
+			ASSERT3U(value[i], ==, last_txg + object + i);
+		}
+	} else {
+		ASSERT3U(error, ==, ENOENT);
+	}
+
+	/*
+	 * Atomically update two entries in our zap object.
+	 * The first is named txg_%llu, and contains the txg
+	 * in which the property was last updated.  The second
+	 * is named prop_%llu, and the nth element of its value
+	 * should be txg + object + n.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+
+	if (last_txg > txg)
+		fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
+
+	for (i = 0; i < ints; i++)
+		value[i] = txg + object + i;
+
+	VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+	    1, &txg, tx));
+	VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+	    ints, value, tx));
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Remove a random pair of entries.
+	 */
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+	if (error == ENOENT)
+		goto out;
+
+	ASSERT0(error);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+	dmu_tx_commit(tx);
+out:
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Testcase to test the upgrading of a microzap to fatzap.
+ */
+void
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t object, txg;
+	int i;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+				!ztest_random(2)) != 0)
+		goto out;
+	object = od->od_object;
+
+	/*
+	 * Add entries to this ZAP and make sure it spills over
+	 * and gets upgraded to a fatzap. Also, since we are adding
+	 * 2050 entries we should see ptrtbl growth and leaf-block split.
+	 */
+	for (i = 0; i < 2050; i++) {
+		char name[MAXNAMELEN];
+		uint64_t value = i;
+		dmu_tx_t *tx;
+		int error;
+
+		(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+		    (u_longlong_t)id, (u_longlong_t)value);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, B_TRUE, name);
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0)
+			goto out;
+		error = zap_add(os, object, name, sizeof (uint64_t), 1,
+		    &value, tx);
+		ASSERT(error == 0 || error == EEXIST);
+		dmu_tx_commit(tx);
+	}
+out:
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/* ARGSUSED */
+void
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+	dmu_tx_t *tx;
+	int i, namelen, error;
+	int micro = ztest_random(2);
+	char name[20], string_value[20];
+	void *data;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	object = od->od_object;
+
+	/*
+	 * Generate a random name of the form 'xxx.....' where each
+	 * x is a random printable character and the dots are dots.
+	 * There are 94 such characters, and the name length goes from
+	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+	 */
+	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+	for (i = 0; i < 3; i++)
+		name[i] = '!' + ztest_random('~' - '!' + 1);
+	for (; i < namelen - 1; i++)
+		name[i] = '.';
+	name[i] = '\0';
+
+	if ((namelen & 1) || micro) {
+		wsize = sizeof (txg);
+		wc = 1;
+		data = &txg;
+	} else {
+		wsize = 1;
+		wc = namelen;
+		data = string_value;
+	}
+
+	count = -1ULL;
+	VERIFY0(zap_count(os, object, &count));
+	ASSERT(count != -1ULL);
+
+	/*
+	 * Select an operation: length, lookup, add, update, remove.
+	 */
+	i = ztest_random(5);
+
+	if (i >= 2) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0)
+			return;
+		bcopy(name, string_value, namelen);
+	} else {
+		tx = NULL;
+		txg = 0;
+		bzero(string_value, namelen);
+	}
+
+	switch (i) {
+
+	case 0:
+		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+		if (error == 0) {
+			ASSERT3U(wsize, ==, zl_wsize);
+			ASSERT3U(wc, ==, zl_wc);
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+		break;
+
+	case 1:
+		error = zap_lookup(os, object, name, wsize, wc, data);
+		if (error == 0) {
+			if (data == string_value &&
+			    bcmp(name, data, namelen) != 0)
+				fatal(0, "name '%s' != val '%s' len %d",
+				    name, data, namelen);
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+		break;
+
+	case 2:
+		error = zap_add(os, object, name, wsize, wc, data, tx);
+		ASSERT(error == 0 || error == EEXIST);
+		break;
+
+	case 3:
+		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+		break;
+
+	case 4:
+		error = zap_remove(os, object, name, tx);
+		ASSERT(error == 0 || error == ENOENT);
+		break;
+	}
+
+	if (tx != NULL)
+		dmu_tx_commit(tx);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Commit callback data.
+ */
+typedef struct ztest_cb_data {
+	list_node_t		zcd_node;
+	uint64_t		zcd_txg;
+	int			zcd_expected_err;
+	boolean_t		zcd_added;
+	boolean_t		zcd_called;
+	spa_t			*zcd_spa;
+} ztest_cb_data_t;
+
+/* This is the actual commit callback function */
+static void
+ztest_commit_callback(void *arg, int error)
+{
+	ztest_cb_data_t *data = arg;
+	uint64_t synced_txg;
+
+	VERIFY(data != NULL);
+	VERIFY3S(data->zcd_expected_err, ==, error);
+	VERIFY(!data->zcd_called);
+
+	synced_txg = spa_last_synced_txg(data->zcd_spa);
+	if (data->zcd_txg > synced_txg)
+		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+		    synced_txg);
+
+	data->zcd_called = B_TRUE;
+
+	if (error == ECANCELED) {
+		ASSERT0(data->zcd_txg);
+		ASSERT(!data->zcd_added);
+
+		/*
+		 * The private callback data should be destroyed here, but
+		 * since we are going to check the zcd_called field after
+		 * dmu_tx_abort(), we will destroy it there.
+		 */
+		return;
+	}
+
+	ASSERT(data->zcd_added);
+	ASSERT3U(data->zcd_txg, !=, 0);
+
+	(void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+	/* See if this cb was called more quickly */
+	if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
+		zc_min_txg_delay = synced_txg - data->zcd_txg;
+
+	/* Remove our callback from the list */
+	list_remove(&zcl.zcl_callbacks, data);
+
+	(void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+	umem_free(data, sizeof (ztest_cb_data_t));
+}
+
+/* Allocate and initialize callback data structure */
+static ztest_cb_data_t *
+ztest_create_cb_data(objset_t *os, uint64_t txg)
+{
+	ztest_cb_data_t *cb_data;
+
+	cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+
+	cb_data->zcd_txg = txg;
+	cb_data->zcd_spa = dmu_objset_spa(os);
+	list_link_init(&cb_data->zcd_node);
+
+	return (cb_data);
+}
+
+/*
+ * Commit callback test.
+ */
+void
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	dmu_tx_t *tx;
+	ztest_cb_data_t *cb_data[3], *tmp_cb;
+	uint64_t old_txg, txg;
+	int i, error = 0;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	tx = dmu_tx_create(os);
+
+	cb_data[0] = ztest_create_cb_data(os, 0);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
+
+	/* Every once in a while, abort the transaction on purpose */
+	if (ztest_random(100) == 0)
+		error = -1;
+
+	if (!error)
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+	txg = error ? 0 : dmu_tx_get_txg(tx);
+
+	cb_data[0]->zcd_txg = txg;
+	cb_data[1] = ztest_create_cb_data(os, txg);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
+	if (error) {
+		/*
+		 * It's not a strict requirement to call the registered
+		 * callbacks from inside dmu_tx_abort(), but that's what
+		 * it's supposed to happen in the current implementation
+		 * so we will check for that.
+		 */
+		for (i = 0; i < 2; i++) {
+			cb_data[i]->zcd_expected_err = ECANCELED;
+			VERIFY(!cb_data[i]->zcd_called);
+		}
+
+		dmu_tx_abort(tx);
+
+		for (i = 0; i < 2; i++) {
+			VERIFY(cb_data[i]->zcd_called);
+			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+		}
+
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	cb_data[2] = ztest_create_cb_data(os, txg);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
+
+	/*
+	 * Read existing data to make sure there isn't a future leak.
+	 */
+	VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
+	    &old_txg, DMU_READ_PREFETCH));
+
+	if (old_txg > txg)
+		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+		    old_txg, txg);
+
+	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
+
+	(void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+	/*
+	 * Since commit callbacks don't have any ordering requirement and since
+	 * it is theoretically possible for a commit callback to be called
+	 * after an arbitrary amount of time has elapsed since its txg has been
+	 * synced, it is difficult to reliably determine whether a commit
+	 * callback hasn't been called due to high load or due to a flawed
+	 * implementation.
+	 *
+	 * In practice, we will assume that if after a certain number of txgs a
+	 * commit callback hasn't been called, then most likely there's an
+	 * implementation bug..
+	 */
+	tmp_cb = list_head(&zcl.zcl_callbacks);
+	if (tmp_cb != NULL &&
+	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
+		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+	}
+
+	/*
+	 * Let's find the place to insert our callbacks.
+	 *
+	 * Even though the list is ordered by txg, it is possible for the
+	 * insertion point to not be the end because our txg may already be
+	 * quiescing at this point and other callbacks in the open txg
+	 * (from other objsets) may have sneaked in.
+	 */
+	tmp_cb = list_tail(&zcl.zcl_callbacks);
+	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+	/* Add the 3 callbacks to the list */
+	for (i = 0; i < 3; i++) {
+		if (tmp_cb == NULL)
+			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+		else
+			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+			    cb_data[i]);
+
+		cb_data[i]->zcd_added = B_TRUE;
+		VERIFY(!cb_data[i]->zcd_called);
+
+		tmp_cb = cb_data[i];
+	}
+
+	zc_cb_counter += 3;
+
+	(void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+	dmu_tx_commit(tx);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/* ARGSUSED */
+void
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+	zfs_prop_t proplist[] = {
+		ZFS_PROP_CHECKSUM,
+		ZFS_PROP_COMPRESSION,
+		ZFS_PROP_COPIES,
+		ZFS_PROP_DEDUP
+	};
+	int p;
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+	VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE,
+	    ztest_random_blocksize(), (int)ztest_random(2)));
+
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+	nvlist_t *props = NULL;
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
+	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+
+	VERIFY0(spa_prop_get(ztest_spa, &props));
+
+	if (ztest_opts.zo_verbose >= 6)
+		dump_nvlist(props, 4);
+
+	nvlist_free(props);
+
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+	nvlist_t *snaps, *holds;
+	int error;
+
+	snaps = fnvlist_alloc();
+	holds = fnvlist_alloc();
+	fnvlist_add_boolean(holds, holdname);
+	fnvlist_add_nvlist(snaps, snapname, holds);
+	fnvlist_free(holds);
+	error = dsl_dataset_user_release(snaps, NULL);
+	fnvlist_free(snaps);
+	return (error);
+}
+
+/*
+ * Test snapshot hold/release and deferred destroy.
+ */
+void
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
+{
+	int error;
+	objset_t *os = zd->zd_os;
+	objset_t *origin;
+	char snapname[100];
+	char fullname[100];
+	char clonename[100];
+	char tag[100];
+	char osname[MAXNAMELEN];
+	nvlist_t *holds;
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	dmu_objset_name(os, osname);
+
+	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
+	    (u_longlong_t)id);
+	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+	(void) snprintf(clonename, sizeof (clonename),
+	    "%s/ch1_%llu", osname, (u_longlong_t)id);
+	(void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);
+
+	/*
+	 * Clean up from any previous run.
+	 */
+	error = dsl_destroy_head(clonename);
+	if (error != ENOENT)
+		ASSERT0(error);
+	error = user_release_one(fullname, tag);
+	if (error != ESRCH && error != ENOENT)
+		ASSERT0(error);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != ENOENT)
+		ASSERT0(error);
+
+	/*
+	 * Create snapshot, clone it, mark snap for deferred destroy,
+	 * destroy clone, verify snap was also destroyed.
+	 */
+	error = dmu_objset_snapshot_one(osname, snapname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+	}
+
+	error = dmu_objset_clone(clonename, fullname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_clone");
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
+	if (error) {
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+		    fullname, error);
+	}
+
+	error = dsl_destroy_head(clonename);
+	if (error)
+		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
+
+	error = dmu_objset_hold(fullname, FTAG, &origin);
+	if (error != ENOENT)
+		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
+
+	/*
+	 * Create snapshot, add temporary hold, verify that we can't
+	 * destroy a held snapshot, mark for deferred destroy,
+	 * release hold, verify snapshot was destroyed.
+	 */
+	error = dmu_objset_snapshot_one(osname, snapname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+	}
+
+	holds = fnvlist_alloc();
+	fnvlist_add_string(holds, fullname, tag);
+	error = dsl_dataset_user_hold(holds, 0, NULL);
+	fnvlist_free(holds);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc("dsl_dataset_user_hold");
+		goto out;
+	} else if (error) {
+		fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+		    fullname, tag, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != EBUSY) {
+		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
+		    fullname, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
+	if (error) {
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+		    fullname, error);
+	}
+
+	error = user_release_one(fullname, tag);
+	if (error)
+		fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
+
+	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
+
+out:
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+/*
+ * Inject random faults into the on-disk data.
+ */
+/* ARGSUSED */
+void
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	int fd;
+	uint64_t offset;
+	uint64_t leaves;
+	uint64_t bad = 0x1990c0ffeedecadeull;
+	uint64_t top, leaf;
+	char *path0;
+	char *pathrand;
+	size_t fsize;
+	int bshift = SPA_OLD_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int iters = 1000;
+	int maxfaults;
+	int mirror_save;
+	vdev_t *vd0 = NULL;
+	uint64_t guid0 = 0;
+	boolean_t islog = B_FALSE;
+
+	path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	mutex_enter(&ztest_vdev_lock);
+	maxfaults = MAXFAULTS();
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+	mirror_save = zs->zs_mirrors;
+	mutex_exit(&ztest_vdev_lock);
+
+	ASSERT(leaves >= 1);
+
+	/*
+	 * Grab the name lock as reader. There are some operations
+	 * which don't like to have their vdevs changed while
+	 * they are in progress (i.e. spa_change_guid). Those
+	 * operations will have grabbed the name lock as writer.
+	 */
+	(void) rw_rdlock(&ztest_name_lock);
+
+	/*
+	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
+	 */
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+	if (ztest_random(2) == 0) {
+		/*
+		 * Inject errors on a normal data device or slog device.
+		 */
+		top = ztest_random_vdev_top(spa, B_TRUE);
+		leaf = ztest_random(leaves) + zs->zs_splits;
+
+		/*
+		 * Generate paths to the first leaf in this top-level vdev,
+		 * and to the random leaf we selected.  We'll induce transient
+		 * write failures and random online/offline activity on leaf 0,
+		 * and we'll write random garbage to the randomly chosen leaf.
+		 */
+		(void) snprintf(path0, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + zs->zs_splits);
+		(void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
+
+		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+			islog = B_TRUE;
+
+		/*
+		 * If the top-level vdev needs to be resilvered
+		 * then we only allow faults on the device that is
+		 * resilvering.
+		 */
+		if (vd0 != NULL && maxfaults != 1 &&
+		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+		    vd0->vdev_resilver_txg != 0)) {
+			/*
+			 * Make vd0 explicitly claim to be unreadable,
+			 * or unwriteable, or reach behind its back
+			 * and close the underlying fd.  We can do this if
+			 * maxfaults == 0 because we'll fail and reexecute,
+			 * and we can do it if maxfaults >= 2 because we'll
+			 * have enough redundancy.  If maxfaults == 1, the
+			 * combination of this with injection of random data
+			 * corruption below exceeds the pool's fault tolerance.
+			 */
+			vdev_file_t *vf = vd0->vdev_tsd;
+
+			if (vf != NULL && ztest_random(3) == 0) {
+				(void) close(vf->vf_vnode->v_fd);
+				vf->vf_vnode->v_fd = -1;
+			} else if (ztest_random(2) == 0) {
+				vd0->vdev_cant_read = B_TRUE;
+			} else {
+				vd0->vdev_cant_write = B_TRUE;
+			}
+			guid0 = vd0->vdev_guid;
+		}
+	} else {
+		/*
+		 * Inject errors on an l2cache device.
+		 */
+		spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+		if (sav->sav_count == 0) {
+			spa_config_exit(spa, SCL_STATE, FTAG);
+			(void) rw_unlock(&ztest_name_lock);
+			goto out;
+		}
+		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		guid0 = vd0->vdev_guid;
+		(void) strcpy(path0, vd0->vdev_path);
+		(void) strcpy(pathrand, vd0->vdev_path);
+
+		leaf = 0;
+		leaves = 1;
+		maxfaults = INT_MAX;	/* no limit on cache devices */
+	}
+
+	spa_config_exit(spa, SCL_STATE, FTAG);
+	(void) rw_unlock(&ztest_name_lock);
+
+	/*
+	 * If we can tolerate two or more faults, or we're dealing
+	 * with a slog, randomly online/offline vd0.
+	 */
+	if ((maxfaults >= 2 || islog) && guid0 != 0) {
+		if (ztest_random(10) < 6) {
+			int flags = (ztest_random(2) == 0 ?
+			    ZFS_OFFLINE_TEMPORARY : 0);
+
+			/*
+			 * We have to grab the zs_name_lock as writer to
+			 * prevent a race between offlining a slog and
+			 * destroying a dataset. Offlining the slog will
+			 * grab a reference on the dataset which may cause
+			 * dsl_destroy_head() to fail with EBUSY thus
+			 * leaving the dataset in an inconsistent state.
+			 */
+			if (islog)
+				(void) rw_wrlock(&ztest_name_lock);
+
+			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+			if (islog)
+				(void) rw_unlock(&ztest_name_lock);
+		} else {
+			/*
+			 * Ideally we would like to be able to randomly
+			 * call vdev_[on|off]line without holding locks
+			 * to force unpredictable failures but the side
+			 * effects of vdev_[on|off]line prevent us from
+			 * doing so. We grab the ztest_vdev_lock here to
+			 * prevent a race between injection testing and
+			 * aux_vdev removal.
+			 */
+			mutex_enter(&ztest_vdev_lock);
+			(void) vdev_online(spa, guid0, 0, NULL);
+			mutex_exit(&ztest_vdev_lock);
+		}
+	}
+
+	if (maxfaults == 0)
+		goto out;
+
+	/*
+	 * We have at least single-fault tolerance, so inject data corruption.
+	 */
+	fd = open(pathrand, O_RDWR);
+
+	if (fd == -1)	/* we hit a gap in the device namespace */
+		goto out;
+
+	fsize = lseek(fd, 0, SEEK_END);
+
+	while (--iters != 0) {
+		offset = ztest_random(fsize / (leaves << bshift)) *
+		    (leaves << bshift) + (leaf << bshift) +
+		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
+
+		if (offset >= fsize)
+			continue;
+
+		mutex_enter(&ztest_vdev_lock);
+		if (mirror_save != zs->zs_mirrors) {
+			mutex_exit(&ztest_vdev_lock);
+			(void) close(fd);
+			goto out;
+		}
+
+		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+			fatal(1, "can't inject bad word at 0x%llx in %s",
+			    offset, pathrand);
+
+		mutex_exit(&ztest_vdev_lock);
+
+		if (ztest_opts.zo_verbose >= 7)
+			(void) printf("injected bad word into %s,"
+			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+	}
+
+	(void) close(fd);
+out:
+	umem_free(path0, MAXPATHLEN);
+	umem_free(pathrand, MAXPATHLEN);
+}
+
+/*
+ * Verify that DDT repair works as expected.
+ */
+void
+ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t object, blocksize, txg, pattern, psize;
+	enum zio_checksum checksum = spa_dedup_checksum(spa);
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	void *buf;
+	blkptr_t blk;
+	int copies = 2 * ZIO_DEDUPDITTO_MIN;
+	int i;
+
+	blocksize = ztest_random_blocksize();
+	blocksize = MIN(blocksize, 2048);	/* because we write so many */
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	/*
+	 * Take the name lock as writer to prevent anyone else from changing
+	 * the pool and dataset properies we need to maintain during this test.
+	 */
+	(void) rw_wrlock(&ztest_name_lock);
+
+	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+	    B_FALSE) != 0 ||
+	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+	    B_FALSE) != 0) {
+		(void) rw_unlock(&ztest_name_lock);
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	object = od[0].od_object;
+	blocksize = od[0].od_blocksize;
+	pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os);
+
+	ASSERT(object != 0);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		(void) rw_unlock(&ztest_name_lock);
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	/*
+	 * Write all the copies of our block.
+	 */
+	for (i = 0; i < copies; i++) {
+		uint64_t offset = i * blocksize;
+		int error = dmu_buf_hold(os, object, offset, FTAG, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error != 0) {
+			fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
+			    os, (long long)object, (long long) offset, error);
+		}
+		ASSERT(db->db_offset == offset);
+		ASSERT(db->db_size == blocksize);
+		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+		dmu_buf_will_fill(db, tx);
+		ztest_pattern_set(db->db_data, db->db_size, pattern);
+		dmu_buf_rele(db, FTAG);
+	}
+
+	dmu_tx_commit(tx);
+	txg_wait_synced(spa_get_dsl(spa), txg);
+
+	/*
+	 * Find out what block we got.
+	 */
+	VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
+	    DMU_READ_NO_PREFETCH));
+	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+	dmu_buf_rele(db, FTAG);
+
+	/*
+	 * Damage the block.  Dedup-ditto will save us when we read it later.
+	 */
+	psize = BP_GET_PSIZE(&blk);
+	buf = zio_buf_alloc(psize);
+	ztest_pattern_set(buf, psize, ~pattern);
+
+	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
+
+	zio_buf_free(buf, psize);
+
+	(void) rw_unlock(&ztest_name_lock);
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Scrub the pool.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+	(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+}
+
+/*
+ * Change the guid for the pool.
+ */
+/* ARGSUSED */
+void
+ztest_reguid(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	uint64_t orig, load;
+	int error;
+
+	orig = spa_guid(spa);
+	load = spa_load_guid(spa);
+
+	(void) rw_wrlock(&ztest_name_lock);
+	error = spa_change_guid(spa);
+	(void) rw_unlock(&ztest_name_lock);
+
+	if (error != 0)
+		return;
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("Changed guid old %llu -> %llu\n",
+		    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
+	}
+
+	VERIFY3U(orig, !=, spa_guid(spa));
+	VERIFY3U(load, ==, spa_load_guid(spa));
+}
+
+/*
+ * Rename the pool to a different name and then rename it back.
+ */
+/* ARGSUSED */
+void
+ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
+{
+	char *oldname, *newname;
+	spa_t *spa;
+
+	(void) rw_wrlock(&ztest_name_lock);
+
+	oldname = ztest_opts.zo_pool;
+	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
+	(void) strcpy(newname, oldname);
+	(void) strcat(newname, "_tmp");
+
+	/*
+	 * Do the rename
+	 */
+	VERIFY3U(0, ==, spa_rename(oldname, newname));
+
+	/*
+	 * Try to open it under the old name, which shouldn't exist
+	 */
+	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
+
+	/*
+	 * Open it under the new name and make sure it's still the same spa_t.
+	 */
+	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+
+	ASSERT(spa == ztest_spa);
+	spa_close(spa, FTAG);
+
+	/*
+	 * Rename it back to the original
+	 */
+	VERIFY3U(0, ==, spa_rename(newname, oldname));
+
+	/*
+	 * Make sure it can still be opened
+	 */
+	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+	ASSERT(spa == ztest_spa);
+	spa_close(spa, FTAG);
+
+	umem_free(newname, strlen(newname) + 1);
+
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+static int
+ztest_check_path(char *path)
+{
+	struct stat s;
+	/* return true on success */
+	return (!stat(path, &s));
+}
+
+static void
+ztest_get_zdb_bin(char *bin, int len)
+{
+	char *zdb_path;
+	/*
+	 * Try to use ZDB_PATH and in-tree zdb path. If not successful, just
+	 * let popen to search through PATH.
+	 */
+	if ((zdb_path = getenv("ZDB_PATH"))) {
+		strlcpy(bin, zdb_path, len); /* In env */
+		if (!ztest_check_path(bin)) {
+			ztest_dump_core = 0;
+			fatal(1, "invalid ZDB_PATH '%s'", bin);
+		}
+		return;
+	}
+
+	VERIFY(realpath(getexecname(), bin) != NULL);
+	if (strstr(bin, "/ztest/")) {
+		strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */
+		strcat(bin, "/zdb/zdb");
+		if (ztest_check_path(bin))
+			return;
+	}
+	strcpy(bin, "zdb");
+}
+
+/*
+ * Verify pool integrity by running zdb.
+ */
+static void
+ztest_run_zdb(char *pool)
+{
+	int status;
+	char *bin;
+	char *zdb;
+	char *zbuf;
+	const int len = MAXPATHLEN + MAXNAMELEN + 20;
+	FILE *fp;
+
+	bin = umem_alloc(len, UMEM_NOFAIL);
+	zdb = umem_alloc(len, UMEM_NOFAIL);
+	zbuf = umem_alloc(1024, UMEM_NOFAIL);
+
+	ztest_get_zdb_bin(bin, len);
+
+	(void) sprintf(zdb,
+	    "%s -bcc%s%s -d -U %s %s",
+	    bin,
+	    ztest_opts.zo_verbose >= 3 ? "s" : "",
+	    ztest_opts.zo_verbose >= 4 ? "v" : "",
+	    spa_config_path,
+	    pool);
+
+	if (ztest_opts.zo_verbose >= 5)
+		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
+
+	fp = popen(zdb, "r");
+
+	while (fgets(zbuf, 1024, fp) != NULL)
+		if (ztest_opts.zo_verbose >= 3)
+			(void) printf("%s", zbuf);
+
+	status = pclose(fp);
+
+	if (status == 0)
+		goto out;
+
+	ztest_dump_core = 0;
+	if (WIFEXITED(status))
+		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
+	else
+		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
+out:
+	umem_free(bin, len);
+	umem_free(zdb, len);
+	umem_free(zbuf, 1024);
+}
+
+static void
+ztest_walk_pool_directory(char *header)
+{
+	spa_t *spa = NULL;
+
+	if (ztest_opts.zo_verbose >= 6)
+		(void) printf("%s\n", header);
+
+	mutex_enter(&spa_namespace_lock);
+	while ((spa = spa_next(spa)) != NULL)
+		if (ztest_opts.zo_verbose >= 6)
+			(void) printf("\t%s\n", spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+}
+
+static void
+ztest_spa_import_export(char *oldname, char *newname)
+{
+	nvlist_t *config, *newconfig;
+	uint64_t pool_guid;
+	spa_t *spa;
+	int error;
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("import/export: old = %s, new = %s\n",
+		    oldname, newname);
+	}
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(newname);
+
+	/*
+	 * Get the pool's configuration and guid.
+	 */
+	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+	/*
+	 * Kick off a scrub to tickle scrub/export races.
+	 */
+	if (ztest_random(2) == 0)
+		(void) spa_scan(spa, POOL_SCAN_SCRUB);
+
+	pool_guid = spa_guid(spa);
+	spa_close(spa, FTAG);
+
+	ztest_walk_pool_directory("pools before export");
+
+	/*
+	 * Export it.
+	 */
+	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
+
+	ztest_walk_pool_directory("pools after export");
+
+	/*
+	 * Try to import it.
+	 */
+	newconfig = spa_tryimport(config);
+	ASSERT(newconfig != NULL);
+	nvlist_free(newconfig);
+
+	/*
+	 * Import it under the new name.
+	 */
+	error = spa_import(newname, config, NULL, 0);
+	if (error != 0) {
+		dump_nvlist(config, 0);
+		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
+		    oldname, newname, error);
+	}
+
+	ztest_walk_pool_directory("pools after import");
+
+	/*
+	 * Try to import it again -- should fail with EEXIST.
+	 */
+	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
+
+	/*
+	 * Try to import it under a different name -- should fail with EEXIST.
+	 */
+	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
+
+	/*
+	 * Verify that the pool is no longer visible under the old name.
+	 */
+	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
+
+	/*
+	 * Verify that we can open and close the pool using the new name.
+	 */
+	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+	ASSERT(pool_guid == spa_guid(spa));
+	spa_close(spa, FTAG);
+
+	nvlist_free(config);
+}
+
+static void
+ztest_resume(spa_t *spa)
+{
+	if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
+		(void) printf("resuming from suspended state\n");
+	spa_vdev_state_enter(spa, SCL_NONE);
+	vdev_clear(spa, NULL);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	(void) zio_resume(spa);
+}
+
+static void *
+ztest_resume_thread(void *arg)
+{
+	spa_t *spa = arg;
+
+	while (!ztest_exiting) {
+		if (spa_suspended(spa))
+			ztest_resume(spa);
+		(void) poll(NULL, 0, 100);
+	}
+
+	thread_exit();
+
+	return (NULL);
+}
+
+#define	GRACE	300
+
+#if 0
+static void
+ztest_deadman_alarm(int sig)
+{
+	fatal(0, "failed to complete within %d seconds of deadline", GRACE);
+}
+#endif
+
+static void
+ztest_execute(int test, ztest_info_t *zi, uint64_t id)
+{
+	ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
+	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
+	hrtime_t functime = gethrtime();
+	int i;
+
+	for (i = 0; i < zi->zi_iters; i++)
+		zi->zi_func(zd, id);
+
+	functime = gethrtime() - functime;
+
+	atomic_add_64(&zc->zc_count, 1);
+	atomic_add_64(&zc->zc_time, functime);
+
+	if (ztest_opts.zo_verbose >= 4)
+		(void) printf("%6.2f sec in %s\n",
+		    (double)functime / NANOSEC, zi->zi_funcname);
+}
+
+static void *
+ztest_thread(void *arg)
+{
+	int rand;
+	uint64_t id = (uintptr_t)arg;
+	ztest_shared_t *zs = ztest_shared;
+	uint64_t call_next;
+	hrtime_t now;
+	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
+
+	while ((now = gethrtime()) < zs->zs_thread_stop) {
+		/*
+		 * See if it's time to force a crash.
+		 */
+		if (now > zs->zs_thread_kill)
+			ztest_kill(zs);
+
+		/*
+		 * If we're getting ENOSPC with some regularity, stop.
+		 */
+		if (zs->zs_enospc_count > 10)
+			break;
+
+		/*
+		 * Pick a random function to execute.
+		 */
+		rand = ztest_random(ZTEST_FUNCS);
+		zi = &ztest_info[rand];
+		zc = ZTEST_GET_SHARED_CALLSTATE(rand);
+		call_next = zc->zc_next;
+
+		if (now >= call_next &&
+		    atomic_cas_64(&zc->zc_next, call_next, call_next +
+		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
+			ztest_execute(rand, zi, id);
+		}
+	}
+
+	thread_exit();
+
+	return (NULL);
+}
+
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+}
+
+static void
+ztest_dataset_destroy(int d)
+{
+	char name[MAXNAMELEN];
+	int t;
+
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+	if (ztest_opts.zo_verbose >= 3)
+		(void) printf("Destroying %s to free up space\n", name);
+
+	/*
+	 * Cleanup any non-standard clones and snapshots.  In general,
+	 * ztest thread t operates on dataset (t % zopt_datasets),
+	 * so there may be more than one thing to clean up.
+	 */
+	for (t = d; t < ztest_opts.zo_threads;
+	    t += ztest_opts.zo_datasets)
+		ztest_dsl_dataset_cleanup(name, t);
+
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+	uint64_t usedobjs, dirobjs, scratch;
+
+	/*
+	 * ZTEST_DIROBJ is the object directory for the entire dataset.
+	 * Therefore, the number of objects in use should equal the
+	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+	 * If not, we have an object leak.
+	 *
+	 * Note that we can only check this in ztest_dataset_open(),
+	 * when the open-context and syncing-context values agree.
+	 * That's because zap_count() returns the open-context value,
+	 * while dmu_objset_space() returns the rootbp fill count.
+	 */
+	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+	ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(int d)
+{
+	ztest_ds_t *zd = &ztest_ds[d];
+	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
+	objset_t *os;
+	zilog_t *zilog;
+	char name[MAXNAMELEN];
+	int error;
+
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+	(void) rw_rdlock(&ztest_name_lock);
+
+	error = ztest_dataset_create(name);
+	if (error == ENOSPC) {
+		(void) rw_unlock(&ztest_name_lock);
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT(error == 0 || error == EEXIST);
+
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
+	(void) rw_unlock(&ztest_name_lock);
+
+	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
+
+	zilog = zd->zd_zilog;
+
+	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
+		fatal(0, "missing log records: claimed %llu < committed %llu",
+		    zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+	ztest_dataset_dirobj_verify(zd);
+
+	zil_replay(os, zd, ztest_replay_vector);
+
+	ztest_dataset_dirobj_verify(zd);
+
+	if (ztest_opts.zo_verbose >= 6)
+		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+		    zd->zd_name,
+		    (u_longlong_t)zilog->zl_parse_blk_count,
+		    (u_longlong_t)zilog->zl_parse_lr_count,
+		    (u_longlong_t)zilog->zl_replaying_seq);
+
+	zilog = zil_open(os, ztest_get_data);
+
+	if (zilog->zl_replaying_seq != 0 &&
+	    zilog->zl_replaying_seq < committed_seq)
+		fatal(0, "missing log records: replayed %llu < committed %llu",
+		    zilog->zl_replaying_seq, committed_seq);
+
+	return (0);
+}
+
+static void
+ztest_dataset_close(int d)
+{
+	ztest_ds_t *zd = &ztest_ds[d];
+
+	zil_close(zd->zd_zilog);
+	dmu_objset_disown(zd->zd_os, zd);
+
+	ztest_zd_fini(zd);
+}
+
+/*
+ * Kick off threads to run tests on all datasets in parallel.
+ */
+static void
+ztest_run(ztest_shared_t *zs)
+{
+	kt_did_t *tid;
+	spa_t *spa;
+	objset_t *os;
+	kthread_t *resume_thread;
+	uint64_t object;
+	int error;
+	int t, d;
+
+	ztest_exiting = B_FALSE;
+
+	/*
+	 * Initialize parent/child shared state.
+	 */
+	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
+
+	zs->zs_thread_start = gethrtime();
+	zs->zs_thread_stop =
+	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
+	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+	zs->zs_thread_kill = zs->zs_thread_stop;
+	if (ztest_random(100) < ztest_opts.zo_killrate) {
+		zs->zs_thread_kill -=
+		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
+	}
+
+	mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
+	    offsetof(ztest_cb_data_t, zcd_node));
+
+	/*
+	 * Open our pool.
+	 */
+	kernel_init(FREAD | FWRITE);
+	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	spa->spa_debug = B_TRUE;
+	metaslab_preload_limit = ztest_random(20) + 1;
+	ztest_spa = spa;
+
+	VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
+	    DMU_OST_ANY, B_TRUE, FTAG, &os));
+	zs->zs_guid = dmu_objset_fsid_guid(os);
+	dmu_objset_disown(os, FTAG);
+
+	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
+
+	/*
+	 * We don't expect the pool to suspend unless maxfaults == 0,
+	 * in which case ztest_fault_inject() temporarily takes away
+	 * the only valid replica.
+	 */
+	if (MAXFAULTS() == 0)
+		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
+	else
+		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+	/*
+	 * Create a thread to periodically resume suspended I/O.
+	 */
+	VERIFY3P((resume_thread = zk_thread_create(NULL, 0,
+	    (thread_func_t)ztest_resume_thread, spa, TS_RUN, NULL, 0, 0,
+	    PTHREAD_CREATE_JOINABLE)), !=, NULL);
+
+#if 0
+	/*
+	 * Set a deadman alarm to abort() if we hang.
+	 */
+	signal(SIGALRM, ztest_deadman_alarm);
+	alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);
+#endif
+
+	/*
+	 * Verify that we can safely inquire about about any object,
+	 * whether it's allocated or not.  To make it interesting,
+	 * we probe a 5-wide window around each power of two.
+	 * This hits all edge cases, including zero and the max.
+	 */
+	for (t = 0; t < 64; t++) {
+		for (d = -5; d <= 5; d++) {
+			error = dmu_object_info(spa->spa_meta_objset,
+			    (1ULL << t) + d, NULL);
+			ASSERT(error == 0 || error == ENOENT ||
+			    error == EINVAL);
+		}
+	}
+
+	/*
+	 * If we got any ENOSPC errors on the previous run, destroy something.
+	 */
+	if (zs->zs_enospc_count != 0) {
+		int d = ztest_random(ztest_opts.zo_datasets);
+		ztest_dataset_destroy(d);
+	}
+	zs->zs_enospc_count = 0;
+
+	tid = umem_zalloc(ztest_opts.zo_threads * sizeof (kt_did_t),
+	    UMEM_NOFAIL);
+
+	if (ztest_opts.zo_verbose >= 4)
+		(void) printf("starting main threads...\n");
+
+	/*
+	 * Kick off all the tests that run in parallel.
+	 */
+	for (t = 0; t < ztest_opts.zo_threads; t++) {
+		kthread_t *thread;
+
+		if (t < ztest_opts.zo_datasets &&
+		    ztest_dataset_open(t) != 0)
+			return;
+
+		VERIFY3P(thread = zk_thread_create(NULL, 0,
+		    (thread_func_t)ztest_thread,
+		    (void *)(uintptr_t)t, TS_RUN, NULL, 0, 0,
+		    PTHREAD_CREATE_JOINABLE), !=, NULL);
+		tid[t] = thread->t_tid;
+	}
+
+	/*
+	 * Wait for all of the tests to complete.  We go in reverse order
+	 * so we don't close datasets while threads are still using them.
+	 */
+	for (t = ztest_opts.zo_threads - 1; t >= 0; t--) {
+		thread_join(tid[t]);
+		if (t < ztest_opts.zo_datasets)
+			ztest_dataset_close(t);
+	}
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	umem_free(tid, ztest_opts.zo_threads * sizeof (kt_did_t));
+
+	/* Kill the resume thread */
+	ztest_exiting = B_TRUE;
+	thread_join(resume_thread->t_tid);
+	ztest_resume(spa);
+
+	/*
+	 * Right before closing the pool, kick off a bunch of async I/O;
+	 * spa_close() should wait for it to complete.
+	 */
+	for (object = 1; object < 50; object++)
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+
+	/* Verify that at least one commit cb was called in a timely fashion */
+	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
+		VERIFY0(zc_min_txg_delay);
+
+	spa_close(spa, FTAG);
+
+	/*
+	 * Verify that we can loop over all pools.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+		if (ztest_opts.zo_verbose > 3)
+			(void) printf("spa_next: found %s\n", spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Verify that we can export the pool and reimport it under a
+	 * different name.
+	 */
+	if (ztest_random(2) == 0) {
+		char name[MAXNAMELEN];
+		(void) snprintf(name, MAXNAMELEN, "%s_import",
+		    ztest_opts.zo_pool);
+		ztest_spa_import_export(ztest_opts.zo_pool, name);
+		ztest_spa_import_export(name, ztest_opts.zo_pool);
+	}
+
+	kernel_fini();
+
+	list_destroy(&zcl.zcl_callbacks);
+	mutex_destroy(&zcl.zcl_callbacks_lock);
+	(void) rwlock_destroy(&ztest_name_lock);
+	mutex_destroy(&ztest_vdev_lock);
+}
+
+static void
+ztest_freeze(void)
+{
+	ztest_ds_t *zd = &ztest_ds[0];
+	spa_t *spa;
+	int numloops = 0;
+
+	if (ztest_opts.zo_verbose >= 3)
+		(void) printf("testing spa_freeze()...\n");
+
+	kernel_init(FREAD | FWRITE);
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
+
+	/*
+	 * Force the first log block to be transactionally allocated.
+	 * We have to do this before we freeze the pool -- otherwise
+	 * the log chain won't be anchored.
+	 */
+	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+		ztest_dmu_object_alloc_free(zd, 0);
+		zil_commit(zd->zd_zilog, 0);
+	}
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Freeze the pool.  This stops spa_sync() from doing anything,
+	 * so that the only way to record changes from now on is the ZIL.
+	 */
+	spa_freeze(spa);
+
+	/*
+	 * Because it is hard to predict how much space a write will actually
+	 * require beforehand, we leave ourselves some fudge space to write over
+	 * capacity.
+	 */
+	uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
+
+	/*
+	 * Run tests that generate log records but don't alter the pool config
+	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+	 * We do a txg_wait_synced() after each iteration to force the txg
+	 * to increase well beyond the last synced value in the uberblock.
+	 * The ZIL should be OK with that.
+	 *
+	 * Run a random number of times less than zo_maxloops and ensure we do
+	 * not run out of space on the pool.
+	 */
+	while (ztest_random(10) != 0 &&
+	    numloops++ < ztest_opts.zo_maxloops &&
+	    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+		ztest_od_t od;
+		ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+		VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+		ztest_io(zd, od.od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+		txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	/*
+	 * Commit all of the changes we just generated.
+	 */
+	zil_commit(zd->zd_zilog, 0);
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Close our dataset and close the pool.
+	 */
+	ztest_dataset_close(0);
+	spa_close(spa, FTAG);
+	kernel_fini();
+
+	/*
+	 * Open and close the pool and dataset to induce log replay.
+	 */
+	kernel_init(FREAD | FWRITE);
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	ztest_dataset_close(0);
+
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	ztest_reguid(NULL, 0);
+
+	spa_close(spa, FTAG);
+	kernel_fini();
+}
+
+void
+print_time(hrtime_t t, char *timebuf)
+{
+	hrtime_t s = t / NANOSEC;
+	hrtime_t m = s / 60;
+	hrtime_t h = m / 60;
+	hrtime_t d = h / 24;
+
+	s -= m * 60;
+	m -= h * 60;
+	h -= d * 24;
+
+	timebuf[0] = '\0';
+
+	if (d)
+		(void) sprintf(timebuf,
+		    "%llud%02lluh%02llum%02llus", d, h, m, s);
+	else if (h)
+		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
+	else if (m)
+		(void) sprintf(timebuf, "%llum%02llus", m, s);
+	else
+		(void) sprintf(timebuf, "%llus", s);
+}
+
+static nvlist_t *
+make_random_props(void)
+{
+	nvlist_t *props;
+
+	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+	if (ztest_random(2) == 0)
+		return (props);
+	VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
+
+	return (props);
+}
+
+/*
+ * Create a storage pool with the given name and initial vdev size.
+ * Then test spa_freeze() functionality.
+ */
+static void
+ztest_init(ztest_shared_t *zs)
+{
+	spa_t *spa;
+	nvlist_t *nvroot, *props;
+	int i;
+
+	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
+
+	kernel_init(FREAD | FWRITE);
+
+	/*
+	 * Create the storage pool.
+	 */
+	(void) spa_destroy(ztest_opts.zo_pool);
+	ztest_shared->zs_vdev_next_leaf = 0;
+	zs->zs_splits = 0;
+	zs->zs_mirrors = ztest_opts.zo_mirrors;
+	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+	    0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+	props = make_random_props();
+	for (i = 0; i < SPA_FEATURES; i++) {
+		char *buf;
+		VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
+		    spa_feature_table[i].fi_uname));
+		VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
+		free(buf);
+	}
+	VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
+	nvlist_free(nvroot);
+	nvlist_free(props);
+
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	zs->zs_metaslab_sz =
+	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+	spa_close(spa, FTAG);
+
+	kernel_fini();
+
+	ztest_run_zdb(ztest_opts.zo_pool);
+
+	ztest_freeze();
+
+	ztest_run_zdb(ztest_opts.zo_pool);
+
+	(void) rwlock_destroy(&ztest_name_lock);
+	mutex_destroy(&ztest_vdev_lock);
+}
+
+static void
+setup_data_fd(void)
+{
+	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
+
+	ztest_fd_data = mkstemp(ztest_name_data);
+	ASSERT3S(ztest_fd_data, >=, 0);
+	(void) unlink(ztest_name_data);
+}
+
+static int
+shared_data_size(ztest_shared_hdr_t *hdr)
+{
+	int size;
+
+	size = hdr->zh_hdr_size;
+	size += hdr->zh_opts_size;
+	size += hdr->zh_size;
+	size += hdr->zh_stats_size * hdr->zh_stats_count;
+	size += hdr->zh_ds_size * hdr->zh_ds_count;
+
+	return (size);
+}
+
+static void
+setup_hdr(void)
+{
+	int size;
+	ztest_shared_hdr_t *hdr;
+
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY3P(hdr, !=, MAP_FAILED);
+
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
+
+	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
+	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
+	hdr->zh_size = sizeof (ztest_shared_t);
+	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
+	hdr->zh_stats_count = ZTEST_FUNCS;
+	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
+	hdr->zh_ds_count = ztest_opts.zo_datasets;
+
+	size = shared_data_size(hdr);
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+}
+
+static void
+setup_data(void)
+{
+	int size, offset;
+	ztest_shared_hdr_t *hdr;
+	uint8_t *buf;
+
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY3P(hdr, !=, MAP_FAILED);
+
+	size = shared_data_size(hdr);
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY3P(hdr, !=, MAP_FAILED);
+	buf = (uint8_t *)hdr;
+
+	offset = hdr->zh_hdr_size;
+	ztest_shared_opts = (void *)&buf[offset];
+	offset += hdr->zh_opts_size;
+	ztest_shared = (void *)&buf[offset];
+	offset += hdr->zh_size;
+	ztest_shared_callstate = (void *)&buf[offset];
+	offset += hdr->zh_stats_size * hdr->zh_stats_count;
+	ztest_shared_ds = (void *)&buf[offset];
+}
+
+static boolean_t
+exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
+{
+	pid_t pid;
+	int status;
+	char *cmdbuf = NULL;
+
+	pid = fork();
+
+	if (cmd == NULL) {
+		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
+		cmd = cmdbuf;
+	}
+
+	if (pid == -1)
+		fatal(1, "fork failed");
+
+	if (pid == 0) {	/* child */
+		char *emptyargv[2] = { cmd, NULL };
+		char fd_data_str[12];
+
+		struct rlimit rl = { 1024, 1024 };
+		(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+		(void) close(ztest_fd_rand);
+		VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+		VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
+		(void) enable_extended_FILE_stdio(-1, -1);
+		if (libpath != NULL)
+			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
+		(void) execv(cmd, emptyargv);
+		ztest_dump_core = B_FALSE;
+		fatal(B_TRUE, "exec failed: %s", cmd);
+	}
+
+	if (cmdbuf != NULL) {
+		umem_free(cmdbuf, MAXPATHLEN);
+		cmd = NULL;
+	}
+
+	while (waitpid(pid, &status, 0) != pid)
+		continue;
+	if (statusp != NULL)
+		*statusp = status;
+
+	if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) != 0) {
+			(void) fprintf(stderr, "child exited with code %d\n",
+			    WEXITSTATUS(status));
+			exit(2);
+		}
+		return (B_FALSE);
+	} else if (WIFSIGNALED(status)) {
+		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
+			(void) fprintf(stderr, "child died with signal %d\n",
+			    WTERMSIG(status));
+			exit(3);
+		}
+		return (B_TRUE);
+	} else {
+		(void) fprintf(stderr, "something strange happened to child\n");
+		exit(4);
+		/* NOTREACHED */
+	}
+}
+
+static void
+ztest_run_init(void)
+{
+	int i;
+
+	ztest_shared_t *zs = ztest_shared;
+
+	ASSERT(ztest_opts.zo_init != 0);
+
+	/*
+	 * Blow away any existing copy of zpool.cache
+	 */
+	(void) remove(spa_config_path);
+
+	/*
+	 * Create and initialize our storage pool.
+	 */
+	for (i = 1; i <= ztest_opts.zo_init; i++) {
+		bzero(zs, sizeof (ztest_shared_t));
+		if (ztest_opts.zo_verbose >= 3 &&
+		    ztest_opts.zo_init != 1) {
+			(void) printf("ztest_init(), pass %d\n", i);
+		}
+		ztest_init(zs);
+	}
+}
+
+int
+main(int argc, char **argv)
+{
+	int kills = 0;
+	int iters = 0;
+	int older = 0;
+	int newer = 0;
+	ztest_shared_t *zs;
+	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
+	char timebuf[100];
+	char numbuf[6];
+	spa_t *spa;
+	char *cmd;
+	boolean_t hasalt;
+	int f;
+	char *fd_data_str = getenv("ZTEST_FD_DATA");
+	struct sigaction action;
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	dprintf_setup(&argc, argv);
+
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
+	if (!fd_data_str) {
+		process_options(argc, argv);
+
+		setup_data_fd();
+		setup_hdr();
+		setup_data();
+		bcopy(&ztest_opts, ztest_shared_opts,
+		    sizeof (*ztest_shared_opts));
+	} else {
+		ztest_fd_data = atoi(fd_data_str);
+		setup_data();
+		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
+	}
+	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
+
+	/* Override location of zpool.cache */
+	VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+	    ztest_opts.zo_dir) != -1);
+
+	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
+	    UMEM_NOFAIL);
+	zs = ztest_shared;
+
+	if (fd_data_str) {
+		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
+		metaslab_df_alloc_threshold =
+		    zs->zs_metaslab_df_alloc_threshold;
+
+		if (zs->zs_do_init)
+			ztest_run_init();
+		else
+			ztest_run(zs);
+		exit(0);
+	}
+
+	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
+
+	if (ztest_opts.zo_verbose >= 1) {
+		(void) printf("%llu vdevs, %d datasets, %d threads,"
+		    " %llu seconds...\n",
+		    (u_longlong_t)ztest_opts.zo_vdevs,
+		    ztest_opts.zo_datasets,
+		    ztest_opts.zo_threads,
+		    (u_longlong_t)ztest_opts.zo_time);
+	}
+
+	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
+
+	zs->zs_do_init = B_TRUE;
+	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
+		if (ztest_opts.zo_verbose >= 1) {
+			(void) printf("Executing older ztest for "
+			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
+		}
+		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
+		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
+	} else {
+		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
+	}
+	zs->zs_do_init = B_FALSE;
+
+	zs->zs_proc_start = gethrtime();
+	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
+
+	for (f = 0; f < ZTEST_FUNCS; f++) {
+		zi = &ztest_info[f];
+		zc = ZTEST_GET_SHARED_CALLSTATE(f);
+		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+			zc->zc_next = UINT64_MAX;
+		else
+			zc->zc_next = zs->zs_proc_start +
+			    ztest_random(2 * zi->zi_interval[0] + 1);
+	}
+
+	/*
+	 * Run the tests in a loop.  These tests include fault injection
+	 * to verify that self-healing data works, and forced crashes
+	 * to verify that we never lose on-disk consistency.
+	 */
+	while (gethrtime() < zs->zs_proc_stop) {
+		int status;
+		boolean_t killed;
+
+		/*
+		 * Initialize the workload counters for each function.
+		 */
+		for (f = 0; f < ZTEST_FUNCS; f++) {
+			zc = ZTEST_GET_SHARED_CALLSTATE(f);
+			zc->zc_count = 0;
+			zc->zc_time = 0;
+		}
+
+		/* Set the allocation switch size */
+		zs->zs_metaslab_df_alloc_threshold =
+		    ztest_random(zs->zs_metaslab_sz / 4) + 1;
+
+		if (!hasalt || ztest_random(2) == 0) {
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing newer ztest: %s\n",
+				    cmd);
+			}
+			newer++;
+			killed = exec_child(cmd, NULL, B_TRUE, &status);
+		} else {
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing older ztest: %s\n",
+				    ztest_opts.zo_alt_ztest);
+			}
+			older++;
+			killed = exec_child(ztest_opts.zo_alt_ztest,
+			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
+		}
+
+		if (killed)
+			kills++;
+		iters++;
+
+		if (ztest_opts.zo_verbose >= 1) {
+			hrtime_t now = gethrtime();
+
+			now = MIN(now, zs->zs_proc_stop);
+			print_time(zs->zs_proc_stop - now, timebuf);
+			nicenum(zs->zs_space, numbuf);
+
+			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
+			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
+			    iters,
+			    WIFEXITED(status) ? "Complete" : "SIGKILL",
+			    (u_longlong_t)zs->zs_enospc_count,
+			    100.0 * zs->zs_alloc / zs->zs_space,
+			    numbuf,
+			    100.0 * (now - zs->zs_proc_start) /
+			    (ztest_opts.zo_time * NANOSEC), timebuf);
+		}
+
+		if (ztest_opts.zo_verbose >= 2) {
+			(void) printf("\nWorkload summary:\n\n");
+			(void) printf("%7s %9s   %s\n",
+			    "Calls", "Time", "Function");
+			(void) printf("%7s %9s   %s\n",
+			    "-----", "----", "--------");
+			for (f = 0; f < ZTEST_FUNCS; f++) {
+				zi = &ztest_info[f];
+				zc = ZTEST_GET_SHARED_CALLSTATE(f);
+				print_time(zc->zc_time, timebuf);
+				(void) printf("%7llu %9s   %s\n",
+				    (u_longlong_t)zc->zc_count, timebuf,
+				    zi->zi_funcname);
+			}
+			(void) printf("\n");
+		}
+
+		/*
+		 * It's possible that we killed a child during a rename test,
+		 * in which case we'll have a 'ztest_tmp' pool lying around
+		 * instead of 'ztest'.  Do a blind rename in case this happened.
+		 */
+		kernel_init(FREAD);
+		if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
+			spa_close(spa, FTAG);
+		} else {
+			char tmpname[MAXNAMELEN];
+			kernel_fini();
+			kernel_init(FREAD | FWRITE);
+			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
+			    ztest_opts.zo_pool);
+			(void) spa_rename(tmpname, ztest_opts.zo_pool);
+		}
+		kernel_fini();
+
+		ztest_run_zdb(ztest_opts.zo_pool);
+	}
+
+	if (ztest_opts.zo_verbose >= 1) {
+		if (hasalt) {
+			(void) printf("%d runs of older ztest: %s\n", older,
+			    ztest_opts.zo_alt_ztest);
+			(void) printf("%d runs of newer ztest: %s\n", newer,
+			    cmd);
+		}
+		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
+		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
+	}
+
+	umem_free(cmd, MAXNAMELEN);
+
+	return (0);
+}
diff --git a/include/libzfs.h b/include/libzfs.h
index b7ab890c3e25..2a1f2f50d306 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -678,8 +678,8 @@ typedef struct recvflags {
 	boolean_t nomount;
 } recvflags_t;
 
-extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *,
-    int, avl_tree_t *);
+extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
+    recvflags_t *, int, avl_tree_t *);
 
 typedef enum diff_flags {
 	ZFS_DIFF_PARSEABLE = 0x1,
diff --git a/include/libzfs.h.orig b/include/libzfs.h.orig
new file mode 100644
index 000000000000..b7ab890c3e25
--- /dev/null
+++ b/include/libzfs.h.orig
@@ -0,0 +1,798 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef	_LIBZFS_H
+#define	_LIBZFS_H
+
+#include <assert.h>
+#include <libnvpair.h>
+#include <sys/mnttab.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/varargs.h>
+#include <sys/fs/zfs.h>
+#include <sys/avl.h>
+#include <ucred.h>
+#include <libzfs_core.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Miscellaneous ZFS constants
+ */
+#define	ZFS_MAXNAMELEN		MAXNAMELEN
+#define	ZPOOL_MAXNAMELEN	MAXNAMELEN
+#define	ZFS_MAXPROPLEN		MAXPATHLEN
+#define	ZPOOL_MAXPROPLEN	MAXPATHLEN
+
+/*
+ * Default device paths
+ */
+#define	DISK_ROOT		"/dev"
+#define	UDISK_ROOT		"/dev/disk"
+
+/*
+ * Default wait time for a device name to be created.
+ */
+#define	DISK_LABEL_WAIT		(30 * 1000)  /* 30 seconds */
+
+#define	DEFAULT_IMPORT_PATH_SIZE	7
+extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE];
+
+/*
+ * libzfs errors
+ */
+typedef enum zfs_error {
+	EZFS_SUCCESS = 0,	/* no error -- success */
+	EZFS_NOMEM = 2000,	/* out of memory */
+	EZFS_BADPROP,		/* invalid property value */
+	EZFS_PROPREADONLY,	/* cannot set readonly property */
+	EZFS_PROPTYPE,		/* property does not apply to dataset type */
+	EZFS_PROPNONINHERIT,	/* property is not inheritable */
+	EZFS_PROPSPACE,		/* bad quota or reservation */
+	EZFS_BADTYPE,		/* dataset is not of appropriate type */
+	EZFS_BUSY,		/* pool or dataset is busy */
+	EZFS_EXISTS,		/* pool or dataset already exists */
+	EZFS_NOENT,		/* no such pool or dataset */
+	EZFS_BADSTREAM,		/* bad backup stream */
+	EZFS_DSREADONLY,	/* dataset is readonly */
+	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
+	EZFS_INVALIDNAME,	/* invalid dataset name */
+	EZFS_BADRESTORE,	/* unable to restore to destination */
+	EZFS_BADBACKUP,		/* backup failed */
+	EZFS_BADTARGET,		/* bad attach/detach/replace target */
+	EZFS_NODEVICE,		/* no such device in pool */
+	EZFS_BADDEV,		/* invalid device to add */
+	EZFS_NOREPLICAS,	/* no valid replicas */
+	EZFS_RESILVERING,	/* currently resilvering */
+	EZFS_BADVERSION,	/* unsupported version */
+	EZFS_POOLUNAVAIL,	/* pool is currently unavailable */
+	EZFS_DEVOVERFLOW,	/* too many devices in one vdev */
+	EZFS_BADPATH,		/* must be an absolute path */
+	EZFS_CROSSTARGET,	/* rename or clone across pool or dataset */
+	EZFS_ZONED,		/* used improperly in local zone */
+	EZFS_MOUNTFAILED,	/* failed to mount dataset */
+	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
+	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
+	EZFS_SHARENFSFAILED,	/* share(1M) failed */
+	EZFS_PERM,		/* permission denied */
+	EZFS_NOSPC,		/* out of space */
+	EZFS_FAULT,		/* bad address */
+	EZFS_IO,		/* I/O error */
+	EZFS_INTR,		/* signal received */
+	EZFS_ISSPARE,		/* device is a hot spare */
+	EZFS_INVALCONFIG,	/* invalid vdev configuration */
+	EZFS_RECURSIVE,		/* recursive dependency */
+	EZFS_NOHISTORY,		/* no history object */
+	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
+	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
+	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
+	EZFS_NAMETOOLONG,	/* dataset name is too long */
+	EZFS_OPENFAILED,	/* open of device failed */
+	EZFS_NOCAP,		/* couldn't get capacity */
+	EZFS_LABELFAILED,	/* write of label failed */
+	EZFS_BADWHO,		/* invalid permission who */
+	EZFS_BADPERM,		/* invalid permission */
+	EZFS_BADPERMSET,	/* invalid permission set name */
+	EZFS_NODELEGATION,	/* delegated administration is disabled */
+	EZFS_UNSHARESMBFAILED,	/* failed to unshare over smb */
+	EZFS_SHARESMBFAILED,	/* failed to share over smb */
+	EZFS_BADCACHE,		/* bad cache file */
+	EZFS_ISL2CACHE,		/* device is for the level 2 ARC */
+	EZFS_VDEVNOTSUP,	/* unsupported vdev type */
+	EZFS_NOTSUP,		/* ops not supported on this dataset */
+	EZFS_ACTIVE_SPARE,	/* pool has active shared spare devices */
+	EZFS_UNPLAYED_LOGS,	/* log device has unplayed logs */
+	EZFS_REFTAG_RELE,	/* snapshot release: tag not found */
+	EZFS_REFTAG_HOLD,	/* snapshot hold: tag already exists */
+	EZFS_TAGTOOLONG,	/* snapshot hold/rele: tag too long */
+	EZFS_PIPEFAILED,	/* pipe create failed */
+	EZFS_THREADCREATEFAILED, /* thread create failed */
+	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
+	EZFS_SCRUBBING,		/* currently scrubbing */
+	EZFS_NO_SCRUB,		/* no active scrub */
+	EZFS_DIFF,		/* general failure of zfs diff */
+	EZFS_DIFFDATA,		/* bad zfs diff data */
+	EZFS_POOLREADONLY,	/* pool is in read-only mode */
+	EZFS_UNKNOWN
+} zfs_error_t;
+
+/*
+ * The following data structures are all part
+ * of the zfs_allow_t data structure which is
+ * used for printing 'allow' permissions.
+ * It is a linked list of zfs_allow_t's which
+ * then contain avl tree's for user/group/sets/...
+ * and each one of the entries in those trees have
+ * avl tree's for the permissions they belong to and
+ * whether they are local,descendent or local+descendent
+ * permissions.  The AVL trees are used primarily for
+ * sorting purposes, but also so that we can quickly find
+ * a given user and or permission.
+ */
+typedef struct zfs_perm_node {
+	avl_node_t z_node;
+	char z_pname[MAXPATHLEN];
+} zfs_perm_node_t;
+
+typedef struct zfs_allow_node {
+	avl_node_t z_node;
+	char z_key[MAXPATHLEN];		/* name, such as joe */
+	avl_tree_t z_localdescend;	/* local+descendent perms */
+	avl_tree_t z_local;		/* local permissions */
+	avl_tree_t z_descend;		/* descendent permissions */
+} zfs_allow_node_t;
+
+typedef struct zfs_allow {
+	struct zfs_allow *z_next;
+	char z_setpoint[MAXPATHLEN];
+	avl_tree_t z_sets;
+	avl_tree_t z_crperms;
+	avl_tree_t z_user;
+	avl_tree_t z_group;
+	avl_tree_t z_everyone;
+} zfs_allow_t;
+
+/*
+ * Basic handle types
+ */
+typedef struct zfs_handle zfs_handle_t;
+typedef struct zpool_handle zpool_handle_t;
+typedef struct libzfs_handle libzfs_handle_t;
+
+/*
+ * Library initialization
+ */
+extern libzfs_handle_t *libzfs_init(void);
+extern void libzfs_fini(libzfs_handle_t *);
+
+extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *);
+extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *);
+
+extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
+
+extern void zfs_save_arguments(int argc, char **, char *, int);
+extern int zpool_log_history(libzfs_handle_t *, const char *);
+
+extern int libzfs_errno(libzfs_handle_t *);
+extern const char *libzfs_error_init(int);
+extern const char *libzfs_error_action(libzfs_handle_t *);
+extern const char *libzfs_error_description(libzfs_handle_t *);
+extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
+extern void libzfs_mnttab_init(libzfs_handle_t *);
+extern void libzfs_mnttab_fini(libzfs_handle_t *);
+extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
+extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
+    struct mnttab *);
+extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
+    const char *, const char *);
+extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *);
+
+/*
+ * Basic handle functions
+ */
+extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *);
+extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *);
+extern void zpool_close(zpool_handle_t *);
+extern const char *zpool_get_name(zpool_handle_t *);
+extern int zpool_get_state(zpool_handle_t *);
+extern char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
+extern const char *zpool_pool_state_to_name(pool_state_t);
+extern void zpool_free_handles(libzfs_handle_t *);
+
+/*
+ * Iterate over all active pools in the system.
+ */
+typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
+extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
+
+/*
+ * Functions to create and destroy pools
+ */
+extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
+    nvlist_t *, nvlist_t *);
+extern int zpool_destroy(zpool_handle_t *, const char *);
+extern int zpool_add(zpool_handle_t *, nvlist_t *);
+
+typedef struct splitflags {
+	/* do not split, but return the config that would be split off */
+	int dryrun : 1;
+
+	/* after splitting, import the pool */
+	int import : 1;
+} splitflags_t;
+
+/*
+ * Functions to manipulate pool and vdev state
+ */
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
+extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
+extern int zpool_reguid(zpool_handle_t *);
+extern int zpool_reopen(zpool_handle_t *);
+
+extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
+    vdev_state_t *);
+extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
+extern int zpool_vdev_attach(zpool_handle_t *, const char *,
+    const char *, nvlist_t *, int);
+extern int zpool_vdev_detach(zpool_handle_t *, const char *);
+extern int zpool_vdev_remove(zpool_handle_t *, const char *);
+extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
+    splitflags_t);
+
+extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
+extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
+extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
+
+extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
+    boolean_t *, boolean_t *);
+extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
+    boolean_t *, boolean_t *, boolean_t *);
+extern int zpool_label_disk_wait(char *, int);
+extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
+
+/*
+ * Functions to manage pool properties
+ */
+extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
+extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
+    size_t proplen, zprop_source_t *);
+extern int zpool_get_prop_literal(zpool_handle_t *, zpool_prop_t, char *,
+    size_t proplen, zprop_source_t *, boolean_t literal);
+extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
+    zprop_source_t *);
+
+extern const char *zpool_prop_to_name(zpool_prop_t);
+extern const char *zpool_prop_values(zpool_prop_t);
+
+/*
+ * Pool health statistics.
+ */
+typedef enum {
+	/*
+	 * The following correspond to faults as defined in the (fault.fs.zfs.*)
+	 * event namespace.  Each is associated with a corresponding message ID.
+	 */
+	ZPOOL_STATUS_CORRUPT_CACHE,	/* corrupt /kernel/drv/zpool.cache */
+	ZPOOL_STATUS_MISSING_DEV_R,	/* missing device with replicas */
+	ZPOOL_STATUS_MISSING_DEV_NR,	/* missing device with no replicas */
+	ZPOOL_STATUS_CORRUPT_LABEL_R,	/* bad device label with replicas */
+	ZPOOL_STATUS_CORRUPT_LABEL_NR,	/* bad device label with no replicas */
+	ZPOOL_STATUS_BAD_GUID_SUM,	/* sum of device guids didn't match */
+	ZPOOL_STATUS_CORRUPT_POOL,	/* pool metadata is corrupted */
+	ZPOOL_STATUS_CORRUPT_DATA,	/* data errors in user (meta)data */
+	ZPOOL_STATUS_FAILING_DEV,	/* device experiencing errors */
+	ZPOOL_STATUS_VERSION_NEWER,	/* newer on-disk version */
+	ZPOOL_STATUS_HOSTID_MISMATCH,	/* last accessed by another system */
+	ZPOOL_STATUS_IO_FAILURE_WAIT,	/* failed I/O, failmode 'wait' */
+	ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
+	ZPOOL_STATUS_BAD_LOG,		/* cannot read log chain(s) */
+	ZPOOL_STATUS_ERRATA,		/* informational errata available */
+
+	/*
+	 * If the pool has unsupported features but can still be opened in
+	 * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
+	 * pool has unsupported features but cannot be opened at all, its
+	 * status is ZPOOL_STATUS_UNSUP_FEAT_READ.
+	 */
+	ZPOOL_STATUS_UNSUP_FEAT_READ,	/* unsupported features for read */
+	ZPOOL_STATUS_UNSUP_FEAT_WRITE,	/* unsupported features for write */
+
+	/*
+	 * These faults have no corresponding message ID.  At the time we are
+	 * checking the status, the original reason for the FMA fault (I/O or
+	 * checksum errors) has been lost.
+	 */
+	ZPOOL_STATUS_FAULTED_DEV_R,	/* faulted device with replicas */
+	ZPOOL_STATUS_FAULTED_DEV_NR,	/* faulted device with no replicas */
+
+	/*
+	 * The following are not faults per se, but still an error possibly
+	 * requiring administrative attention.  There is no corresponding
+	 * message ID.
+	 */
+	ZPOOL_STATUS_VERSION_OLDER,	/* older legacy on-disk version */
+	ZPOOL_STATUS_FEAT_DISABLED,	/* supported features are disabled */
+	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
+	ZPOOL_STATUS_OFFLINE_DEV,	/* device online */
+	ZPOOL_STATUS_REMOVED_DEV,	/* removed device */
+
+	/*
+	 * Finally, the following indicates a healthy pool.
+	 */
+	ZPOOL_STATUS_OK
+} zpool_status_t;
+
+extern unsigned long get_system_hostid(void);
+extern zpool_status_t zpool_get_status(zpool_handle_t *, char **,
+    zpool_errata_t *);
+extern zpool_status_t zpool_import_status(nvlist_t *, char **,
+    zpool_errata_t *);
+extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);
+
+/*
+ * Statistics and configuration functions.
+ */
+extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
+extern nvlist_t *zpool_get_features(zpool_handle_t *);
+extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
+extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
+
+/*
+ * Import and export functions
+ */
+extern int zpool_export(zpool_handle_t *, boolean_t, const char *);
+extern int zpool_export_force(zpool_handle_t *, const char *);
+extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
+    char *altroot);
+extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
+    nvlist_t *, int);
+extern void zpool_print_unsup_feat(nvlist_t *config);
+
+/*
+ * Search for pools to import
+ */
+
+typedef struct importargs {
+	char **path;		/* a list of paths to search		*/
+	int paths;		/* number of paths to search		*/
+	char *poolname;		/* name of a pool to find		*/
+	uint64_t guid;		/* guid of a pool to find		*/
+	char *cachefile;	/* cachefile to use for import		*/
+	int can_be_active : 1;	/* can the pool be active?		*/
+	int unique : 1;		/* does 'poolname' already exist?	*/
+	int exists : 1;		/* set on return if pool already exists	*/
+} importargs_t;
+
+extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
+
+/* legacy pool search routines */
+extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
+extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
+    char *, uint64_t);
+
+/*
+ * Miscellaneous pool functions
+ */
+struct zfs_cmd;
+
+extern const char *zfs_history_event_names[];
+
+extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
+    boolean_t verbose);
+extern int zpool_upgrade(zpool_handle_t *, uint64_t);
+extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+    nvlist_t ***, uint_t *);
+extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned,
+    int);
+extern int zpool_events_clear(libzfs_handle_t *, int *);
+extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int);
+extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
+    size_t len);
+extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
+extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
+extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
+    nvlist_t *);
+
+/*
+ * Basic handle manipulations.  These functions do not create or destroy the
+ * underlying datasets, only the references to them.
+ */
+extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
+extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *);
+extern void zfs_close(zfs_handle_t *);
+extern zfs_type_t zfs_get_type(const zfs_handle_t *);
+extern const char *zfs_get_name(const zfs_handle_t *);
+extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *);
+
+/*
+ * Property management functions.  Some functions are shared with the kernel,
+ * and are found in sys/fs/zfs.h.
+ */
+
+/*
+ * zfs dataset property management
+ */
+extern const char *zfs_prop_default_string(zfs_prop_t);
+extern uint64_t zfs_prop_default_numeric(zfs_prop_t);
+extern const char *zfs_prop_column_name(zfs_prop_t);
+extern boolean_t zfs_prop_align_right(zfs_prop_t);
+
+extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t,
+    nvlist_t *, uint64_t, zfs_handle_t *, const char *);
+
+extern const char *zfs_prop_to_name(zfs_prop_t);
+extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
+extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *);
+extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
+    zprop_source_t *, char *, size_t, boolean_t);
+extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
+    boolean_t);
+extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
+    zprop_source_t *, char *, size_t);
+extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
+    uint64_t *propvalue);
+extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
+    char *propbuf, int proplen, boolean_t literal);
+extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
+    uint64_t *propvalue);
+extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
+    char *propbuf, int proplen, boolean_t literal);
+extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
+    char *buf, size_t len);
+extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **);
+extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
+extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
+extern const char *zfs_prop_values(zfs_prop_t);
+extern int zfs_prop_is_string(zfs_prop_t prop);
+extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *);
+
+typedef struct zprop_list {
+	int		pl_prop;
+	char		*pl_user_prop;
+	struct zprop_list *pl_next;
+	boolean_t	pl_all;
+	size_t		pl_width;
+	size_t		pl_recvd_width;
+	boolean_t	pl_fixed;
+} zprop_list_t;
+
+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t,
+    boolean_t);
+extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
+
+#define	ZFS_MOUNTPOINT_NONE	"none"
+#define	ZFS_MOUNTPOINT_LEGACY	"legacy"
+
+#define	ZFS_FEATURE_DISABLED	"disabled"
+#define	ZFS_FEATURE_ENABLED	"enabled"
+#define	ZFS_FEATURE_ACTIVE	"active"
+
+#define	ZFS_UNSUPPORTED_INACTIVE	"inactive"
+#define	ZFS_UNSUPPORTED_READONLY	"readonly"
+
+/*
+ * zpool property management
+ */
+extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
+extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
+    size_t);
+extern const char *zpool_prop_default_string(zpool_prop_t);
+extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
+extern const char *zpool_prop_column_name(zpool_prop_t);
+extern boolean_t zpool_prop_align_right(zpool_prop_t);
+
+/*
+ * Functions shared by zfs and zpool property management.
+ */
+extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all,
+    boolean_t ordered, zfs_type_t type);
+extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
+    zfs_type_t);
+extern void zprop_free_list(zprop_list_t *);
+
+#define	ZFS_GET_NCOLS	5
+
+typedef enum {
+	GET_COL_NONE,
+	GET_COL_NAME,
+	GET_COL_PROPERTY,
+	GET_COL_VALUE,
+	GET_COL_RECVD,
+	GET_COL_SOURCE
+} zfs_get_column_t;
+
+/*
+ * Functions for printing zfs or zpool properties
+ */
+typedef struct zprop_get_cbdata {
+	int cb_sources;
+	zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
+	int cb_colwidths[ZFS_GET_NCOLS + 1];
+	boolean_t cb_scripted;
+	boolean_t cb_literal;
+	boolean_t cb_first;
+	zprop_list_t *cb_proplist;
+	zfs_type_t cb_type;
+} zprop_get_cbdata_t;
+
+void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
+    const char *, const char *, zprop_source_t, const char *,
+    const char *);
+
+/*
+ * Iterator functions.
+ */
+typedef int (*zfs_iter_f)(zfs_handle_t *, void *);
+extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
+extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
+extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *);
+extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *);
+
+typedef struct get_all_cb {
+	zfs_handle_t	**cb_handles;
+	size_t		cb_alloc;
+	size_t		cb_used;
+	boolean_t	cb_verbose;
+	int		(*cb_getone)(zfs_handle_t *, void *);
+} get_all_cb_t;
+
+void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
+int libzfs_dataset_cmp(const void *, const void *);
+
+/*
+ * Functions to create and destroy datasets.
+ */
+extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t,
+    nvlist_t *);
+extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
+extern int zfs_destroy(zfs_handle_t *, boolean_t);
+extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
+extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
+extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
+extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
+    nvlist_t *props);
+extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
+extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t);
+
+typedef struct sendflags {
+	/* print informational messages (ie, -v was specified) */
+	boolean_t verbose;
+
+	/* recursive send  (ie, -R) */
+	boolean_t replicate;
+
+	/* for incrementals, do all intermediate snapshots */
+	boolean_t doall;
+
+	/* if dataset is a clone, do incremental from its origin */
+	boolean_t fromorigin;
+
+	/* do deduplication */
+	boolean_t dedup;
+
+	/* send properties (ie, -p) */
+	boolean_t props;
+
+	/* do not send (no-op, ie. -n) */
+	boolean_t dryrun;
+
+	/* parsable verbose output (ie. -P) */
+	boolean_t parsable;
+
+	/* show progress (ie. -v) */
+	boolean_t progress;
+
+	/* large blocks (>128K) are permitted */
+	boolean_t largeblock;
+
+	/* WRITE_EMBEDDED records of type DATA are permitted */
+	boolean_t embed_data;
+} sendflags_t;
+
+typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
+
+extern int zfs_send(zfs_handle_t *, const char *, const char *,
+    sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
+extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
+
+extern int zfs_promote(zfs_handle_t *);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *,
+    boolean_t, int);
+extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
+extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
+extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
+
+typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
+    uid_t rid, uint64_t space);
+
+extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
+    zfs_userspace_cb_t, void *);
+
+extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
+extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
+
+typedef struct recvflags {
+	/* print informational messages (ie, -v was specified) */
+	boolean_t verbose;
+
+	/* the destination is a prefix, not the exact fs (ie, -d) */
+	boolean_t isprefix;
+
+	/*
+	 * Only the tail of the sent snapshot path is appended to the
+	 * destination to determine the received snapshot name (ie, -e).
+	 */
+	boolean_t istail;
+
+	/* do not actually do the recv, just check if it would work (ie, -n) */
+	boolean_t dryrun;
+
+	/* rollback/destroy filesystems as necessary (eg, -F) */
+	boolean_t force;
+
+	/* set "canmount=off" on all modified filesystems */
+	boolean_t canmountoff;
+
+	/* byteswap flag is used internally; callers need not specify */
+	boolean_t byteswap;
+
+	/* do not mount file systems as they are extracted (private) */
+	boolean_t nomount;
+} recvflags_t;
+
+extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t *,
+    int, avl_tree_t *);
+
+typedef enum diff_flags {
+	ZFS_DIFF_PARSEABLE = 0x1,
+	ZFS_DIFF_TIMESTAMP = 0x2,
+	ZFS_DIFF_CLASSIFY = 0x4
+} diff_flags_t;
+
+extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
+    int);
+
+/*
+ * Miscellaneous functions.
+ */
+extern const char *zfs_type_to_name(zfs_type_t);
+extern void zfs_refresh_properties(zfs_handle_t *);
+extern int zfs_name_valid(const char *, zfs_type_t);
+extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, char *, zfs_type_t);
+extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
+    zfs_type_t);
+extern int zfs_spa_version(zfs_handle_t *, int *);
+extern boolean_t zfs_bookmark_exists(const char *path);
+extern int zfs_append_partition(char *path, size_t max_len);
+extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen);
+extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk);
+
+/*
+ * Mount support functions.
+ */
+extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **);
+extern boolean_t zfs_is_mounted(zfs_handle_t *, char **);
+extern int zfs_mount(zfs_handle_t *, const char *, int);
+extern int zfs_unmount(zfs_handle_t *, const char *, int);
+extern int zfs_unmountall(zfs_handle_t *, int);
+
+/*
+ * Share support functions.
+ */
+extern boolean_t zfs_is_shared(zfs_handle_t *);
+extern int zfs_share(zfs_handle_t *);
+extern int zfs_unshare(zfs_handle_t *);
+
+/*
+ * Protocol-specific share support functions.
+ */
+extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **);
+extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **);
+extern int zfs_share_nfs(zfs_handle_t *);
+extern int zfs_share_smb(zfs_handle_t *);
+extern int zfs_shareall(zfs_handle_t *);
+extern int zfs_unshare_nfs(zfs_handle_t *, const char *);
+extern int zfs_unshare_smb(zfs_handle_t *, const char *);
+extern int zfs_unshareall_nfs(zfs_handle_t *);
+extern int zfs_unshareall_smb(zfs_handle_t *);
+extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
+extern int zfs_unshareall(zfs_handle_t *);
+extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
+    void *, void *, int, zfs_share_op_t);
+
+/*
+ * Utility function to convert a number to a human-readable form.
+ */
+extern void zfs_nicenum(uint64_t, char *, size_t);
+extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *);
+
+/*
+ * Utility functions to run an external process.
+ */
+#define	STDOUT_VERBOSE	0x01
+#define	STDERR_VERBOSE	0x02
+
+int libzfs_run_process(const char *, char **, int flags);
+
+/*
+ * Given a device or file, determine if it is part of a pool.
+ */
+extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
+    boolean_t *);
+
+/*
+ * Label manipulation.
+ */
+extern int zpool_read_label(int, nvlist_t **, int *);
+extern int zpool_clear_label(int);
+
+/*
+ * Management interfaces for SMB ACL files
+ */
+
+int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *);
+int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *);
+int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *);
+int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
+
+/*
+ * Enable and disable datasets within a pool by mounting/unmounting and
+ * sharing/unsharing them.
+ */
+extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
+extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);
+
+/*
+ * Mappings between vdev and FRU.
+ */
+extern void libzfs_fru_refresh(libzfs_handle_t *);
+extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
+extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
+extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
+    const char *);
+extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
+extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBZFS_H */
diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h
new file mode 100644
index 000000000000..63722df1bbf3
--- /dev/null
+++ b/include/sys/bqueue.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_BQUEUE_H
+#define	_BQUEUE_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include	<sys/zfs_context.h>
+
+typedef struct bqueue {
+	list_t bq_list;
+	kmutex_t bq_lock;
+	kcondvar_t bq_add_cv;
+	kcondvar_t bq_pop_cv;
+	uint64_t bq_size;
+	uint64_t bq_maxsize;
+	size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+	list_node_t bqn_node;
+	uint64_t bqn_size;
+} bqueue_node_t;
+
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _BQUEUE_H */
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 0d262e87b5bc..9147c9d4d6cc 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -261,8 +261,7 @@ typedef struct dbuf_hash_table {
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
 void dbuf_create_bonus(struct dnode *dn);
 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
@@ -272,10 +271,12 @@ void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
diff --git a/include/sys/dbuf.h.orig b/include/sys/dbuf.h.orig
new file mode 100644
index 000000000000..0d262e87b5bc
--- /dev/null
+++ b/include/sys/dbuf.h.orig
@@ -0,0 +1,392 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef	_SYS_DBUF_H
+#define	_SYS_DBUF_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/zrlock.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define	DB_RF_MUST_SUCCEED	(1 << 0)
+#define	DB_RF_CANFAIL		(1 << 1)
+#define	DB_RF_HAVESTRUCT	(1 << 2)
+#define	DB_RF_NOPREFETCH	(1 << 3)
+#define	DB_RF_NEVERWAIT		(1 << 4)
+#define	DB_RF_CACHED		(1 << 5)
+
+/*
+ * The simplified state transition diagram for dbufs looks like:
+ *
+ *		+----> READ ----+
+ *		|		|
+ *		|		V
+ *  (alloc)-->UNCACHED	     CACHED-->EVICTING-->(free)
+ *		|		^	 ^
+ *		|		|	 |
+ *		+----> FILL ----+	 |
+ *		|			 |
+ *		|			 |
+ *		+--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
+ */
+typedef enum dbuf_states {
+	DB_SEARCH = -1,
+	DB_UNCACHED,
+	DB_FILL,
+	DB_NOFILL,
+	DB_READ,
+	DB_CACHED,
+	DB_EVICTING
+} dbuf_states_t;
+
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+	DR_NOT_OVERRIDDEN,
+	DR_IN_DMU_SYNC,
+	DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+	/* link on our parents dirty list */
+	list_node_t dr_dirty_node;
+
+	/* transaction group this data will sync in */
+	uint64_t dr_txg;
+
+	/* zio of outstanding write IO */
+	zio_t *dr_zio;
+
+	/* pointer back to our dbuf */
+	struct dmu_buf_impl *dr_dbuf;
+
+	/* pointer to next dirty record */
+	struct dbuf_dirty_record *dr_next;
+
+	/* pointer to parent dirty record */
+	struct dbuf_dirty_record *dr_parent;
+
+	/* How much space was changed to dsl_pool_dirty_space() for this? */
+	unsigned int dr_accounted;
+
+	union dirty_types {
+		struct dirty_indirect {
+
+			/* protect access to list */
+			kmutex_t dr_mtx;
+
+			/* Our list of dirty children */
+			list_t dr_children;
+		} di;
+		struct dirty_leaf {
+
+			/*
+			 * dr_data is set when we dirty the buffer
+			 * so that we can retain the pointer even if it
+			 * gets COW'd in a subsequent transaction group.
+			 */
+			arc_buf_t *dr_data;
+			blkptr_t dr_overridden_by;
+			override_states_t dr_override_state;
+			uint8_t dr_copies;
+			boolean_t dr_nopwrite;
+		} dl;
+	} dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+	/*
+	 * The following members are immutable, with the exception of
+	 * db.db_data, which is protected by db_mtx.
+	 */
+
+	/* the publicly visible structure */
+	dmu_buf_t db;
+
+	/* the objset we belong to */
+	struct objset *db_objset;
+
+	/*
+	 * handle to safely access the dnode we belong to (NULL when evicted)
+	 */
+	struct dnode_handle *db_dnode_handle;
+
+	/*
+	 * our parent buffer; if the dnode points to us directly,
+	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
+	 * only accessed by sync thread ???
+	 * (NULL when evicted)
+	 * May change from NULL to non-NULL under the protection of db_mtx
+	 * (see dbuf_check_blkptr())
+	 */
+	struct dmu_buf_impl *db_parent;
+
+	/*
+	 * link for hash table of all dmu_buf_impl_t's
+	 */
+	struct dmu_buf_impl *db_hash_next;
+
+	/* our block number */
+	uint64_t db_blkid;
+
+	/*
+	 * Pointer to the blkptr_t which points to us. May be NULL if we
+	 * don't have one yet. (NULL when evicted)
+	 */
+	blkptr_t *db_blkptr;
+
+	/*
+	 * Our indirection level.  Data buffers have db_level==0.
+	 * Indirect buffers which point to data buffers have
+	 * db_level==1. etc.  Buffers which contain dnodes have
+	 * db_level==0, since the dnodes are stored in a file.
+	 */
+	uint8_t db_level;
+
+	/* db_mtx protects the members below */
+	kmutex_t db_mtx;
+
+	/*
+	 * Current state of the buffer
+	 */
+	dbuf_states_t db_state;
+
+	/*
+	 * Refcount accessed by dmu_buf_{hold,rele}.
+	 * If nonzero, the buffer can't be destroyed.
+	 * Protected by db_mtx.
+	 */
+	refcount_t db_holds;
+
+	/* buffer holding our data */
+	arc_buf_t *db_buf;
+
+	kcondvar_t db_changed;
+	dbuf_dirty_record_t *db_data_pending;
+
+	/* pointer to most recent dirty record for this buffer */
+	dbuf_dirty_record_t *db_last_dirty;
+
+	/*
+	 * Our link on the owner dnodes's dn_dbufs list.
+	 * Protected by its dn_dbufs_mtx.
+	 */
+	avl_node_t db_link;
+
+	/* Data which is unique to data (leaf) blocks: */
+
+	/* User callback information. */
+	dmu_buf_user_t *db_user;
+
+	/*
+	 * Evict user data as soon as the dirty and reference
+	 * counts are equal.
+	 */
+	uint8_t db_user_immediate_evict;
+
+	/*
+	 * This block was freed while a read or write was
+	 * active.
+	 */
+	uint8_t db_freed_in_flight;
+
+	/*
+	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+	 * evict this dbuf, but couldn't due to outstanding
+	 * references.  Evict once the refcount drops to 0.
+	 */
+	uint8_t db_pending_evict;
+
+	uint8_t db_dirtycnt;
+} dmu_buf_impl_t;
+
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define	DBUF_MUTEXES 8192
+#define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
+typedef struct dbuf_hash_table {
+	uint64_t hash_table_mask;
+	dmu_buf_impl_t **hash_table;
+	kmutex_t hash_mutexes[DBUF_MUTEXES];
+} dbuf_hash_table_t;
+
+
+uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+
+void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
+
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
+dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
+    void *tag);
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+    void *tag, dmu_buf_impl_t **dbp);
+
+void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio);
+
+void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+    uint64_t blkid, void *tag);
+uint64_t dbuf_refcount(dmu_buf_impl_t *db);
+
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+    uint64_t blkid);
+
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
+
+void dbuf_clear(dmu_buf_impl_t *db);
+void dbuf_evict(dmu_buf_impl_t *db);
+
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
+
+void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
+    struct dmu_tx *);
+
+void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
+
+void dbuf_stats_init(dbuf_hash_table_t *hash);
+void dbuf_stats_destroy(void);
+
+#define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
+#define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
+#define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+
+void dbuf_init(void);
+void dbuf_fini(void);
+
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
+
+#define	DBUF_GET_BUFC_TYPE(_db)	\
+	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+#define	DBUF_IS_CACHEABLE(_db)						\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(dbuf_is_metadata(_db) &&					\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+
+#define	DBUF_IS_L2CACHEABLE(_db)					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
+	(dbuf_is_metadata(_db) &&					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+
+#define	DBUF_IS_L2COMPRESSIBLE(_db)					\
+	((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF ||		\
+	(dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE))
+
+#ifdef ZFS_DEBUG
+
+/*
+ * There should be a ## between the string literal and fmt, to make it
+ * clear that we're joining two strings together, but gcc does not
+ * support that preprocessor token.
+ */
+#define	dprintf_dbuf(dbuf, fmt, ...) do { \
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+	char __db_buf[32]; \
+	uint64_t __db_obj = (dbuf)->db.db_object; \
+	if (__db_obj == DMU_META_DNODE_OBJECT) \
+		(void) strcpy(__db_buf, "mdn"); \
+	else \
+		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
+		    (u_longlong_t)__db_obj); \
+	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
+	    "obj=%s lvl=%u blkid=%lld " fmt, \
+	    __db_buf, (dbuf)->db_level, \
+	    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
+	} \
+_NOTE(CONSTCOND) } while (0)
+
+#define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
+	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
+	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
+	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
+	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
+	}							\
+_NOTE(CONSTCOND) } while (0)
+
+#define	DBUF_VERIFY(db)	dbuf_verify(db)
+
+#else
+
+#define	dprintf_dbuf(db, fmt, ...)
+#define	dprintf_dbuf_bp(db, bp, fmt, ...)
+#define	DBUF_VERIFY(db)
+
+#endif
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DBUF_H */
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index d9434db46383..1d461729b611 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -45,6 +45,7 @@
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
 #include <sys/uio.h>
+#include <sys/zio_priority.h>
 
 #ifdef	__cplusplus
 extern "C" {
diff --git a/include/sys/dmu.h.orig b/include/sys/dmu.h.orig
new file mode 100644
index 000000000000..d9434db46383
--- /dev/null
+++ b/include/sys/dmu.h.orig
@@ -0,0 +1,934 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef	_SYS_DMU_H
+#define	_SYS_DMU_H
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA.  That interface is described in
+ * dmu_spa.h.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/inttypes.h>
+#include <sys/cred.h>
+#include <sys/fs/zfs.h>
+#include <sys/uio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct page;
+struct vnode;
+struct spa;
+struct zilog;
+struct zio;
+struct blkptr;
+struct zap_cursor;
+struct dsl_dataset;
+struct dsl_pool;
+struct dnode;
+struct drr_begin;
+struct drr_end;
+struct zbookmark_phys;
+struct spa;
+struct nvlist;
+struct arc_buf;
+struct zio_prop;
+struct sa_handle;
+
+typedef struct objset objset_t;
+typedef struct dmu_tx dmu_tx_t;
+typedef struct dsl_dir dsl_dir_t;
+
+typedef enum dmu_object_byteswap {
+	DMU_BSWAP_UINT8,
+	DMU_BSWAP_UINT16,
+	DMU_BSWAP_UINT32,
+	DMU_BSWAP_UINT64,
+	DMU_BSWAP_ZAP,
+	DMU_BSWAP_DNODE,
+	DMU_BSWAP_OBJSET,
+	DMU_BSWAP_ZNODE,
+	DMU_BSWAP_OLDACL,
+	DMU_BSWAP_ACL,
+	/*
+	 * Allocating a new byteswap type number makes the on-disk format
+	 * incompatible with any other format that uses the same number.
+	 *
+	 * Data can usually be structured to work with one of the
+	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+	 */
+	DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define	DMU_OT_NEWTYPE 0x80
+#define	DMU_OT_METADATA 0x40
+#define	DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t).
+ */
+#define	DMU_OT(byteswap, metadata) \
+	(DMU_OT_NEWTYPE | \
+	((metadata) ? DMU_OT_METADATA : 0) | \
+	((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+#define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
+	(ot) < DMU_OT_NUMTYPES)
+
+#define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_METADATA) : \
+	dmu_ot[(int)(ot)].ot_metadata)
+
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define	DMU_OT_HAS_FILL(ot) \
+	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
+#define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_BYTESWAP_MASK) : \
+	dmu_ot[(int)(ot)].ot_byteswap)
+
+typedef enum dmu_object_type {
+	DMU_OT_NONE,
+	/* general: */
+	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
+	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
+	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
+	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
+	DMU_OT_BPOBJ,			/* UINT64 */
+	DMU_OT_BPOBJ_HDR,		/* UINT64 */
+	/* spa: */
+	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
+	DMU_OT_SPACE_MAP,		/* UINT64 */
+	/* zil: */
+	DMU_OT_INTENT_LOG,		/* UINT64 */
+	/* dmu: */
+	DMU_OT_DNODE,			/* DNODE */
+	DMU_OT_OBJSET,			/* OBJSET */
+	/* dsl: */
+	DMU_OT_DSL_DIR,			/* UINT64 */
+	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
+	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
+	DMU_OT_DSL_PROPS,		/* ZAP */
+	DMU_OT_DSL_DATASET,		/* UINT64 */
+	/* zpl: */
+	DMU_OT_ZNODE,			/* ZNODE */
+	DMU_OT_OLDACL,			/* Old ACL */
+	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
+	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
+	DMU_OT_MASTER_NODE,		/* ZAP */
+	DMU_OT_UNLINKED_SET,		/* ZAP */
+	/* zvol: */
+	DMU_OT_ZVOL,			/* UINT8 */
+	DMU_OT_ZVOL_PROP,		/* ZAP */
+	/* other; for testing only! */
+	DMU_OT_PLAIN_OTHER,		/* UINT8 */
+	DMU_OT_UINT64_OTHER,		/* UINT64 */
+	DMU_OT_ZAP_OTHER,		/* ZAP */
+	/* new object types: */
+	DMU_OT_ERROR_LOG,		/* ZAP */
+	DMU_OT_SPA_HISTORY,		/* UINT8 */
+	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
+	DMU_OT_POOL_PROPS,		/* ZAP */
+	DMU_OT_DSL_PERMS,		/* ZAP */
+	DMU_OT_ACL,			/* ACL */
+	DMU_OT_SYSACL,			/* SYSACL */
+	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
+	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
+	DMU_OT_NEXT_CLONES,		/* ZAP */
+	DMU_OT_SCAN_QUEUE,		/* ZAP */
+	DMU_OT_USERGROUP_USED,		/* ZAP */
+	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
+	DMU_OT_USERREFS,		/* ZAP */
+	DMU_OT_DDT_ZAP,			/* ZAP */
+	DMU_OT_DDT_STATS,		/* ZAP */
+	DMU_OT_SA,			/* System attr */
+	DMU_OT_SA_MASTER_NODE,		/* ZAP */
+	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
+	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
+	DMU_OT_SCAN_XLATE,		/* ZAP */
+	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
+	DMU_OT_DEADLIST,		/* ZAP */
+	DMU_OT_DEADLIST_HDR,		/* UINT64 */
+	DMU_OT_DSL_CLONES,		/* ZAP */
+	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
+	/*
+	 * Do not allocate new object types here. Doing so makes the on-disk
+	 * format incompatible with any other format that uses the same object
+	 * type number.
+	 *
+	 * When creating an object which does not have one of the above types
+	 * use the DMU_OTN_* type with the correct byteswap and metadata
+	 * values.
+	 *
+	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
+	 * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
+	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
+	 * and DMU_OTN_* types).
+	 */
+	DMU_OT_NUMTYPES,
+
+	/*
+	 * Names for valid types declared with DMU_OT().
+	 */
+	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
+} dmu_object_type_t;
+
+typedef enum txg_how {
+	TXG_WAIT = 1,
+	TXG_NOWAIT,
+	TXG_WAITED,
+} txg_how_t;
+
+void byteswap_uint64_array(void *buf, size_t size);
+void byteswap_uint32_array(void *buf, size_t size);
+void byteswap_uint16_array(void *buf, size_t size);
+void byteswap_uint8_array(void *buf, size_t size);
+void zap_byteswap(void *buf, size_t size);
+void zfs_oldacl_byteswap(void *buf, size_t size);
+void zfs_acl_byteswap(void *buf, size_t size);
+void zfs_znode_byteswap(void *buf, size_t size);
+
+#define	DS_FIND_SNAPSHOTS	(1<<0)
+#define	DS_FIND_CHILDREN	(1<<1)
+#define	DS_FIND_SERIALIZE	(1<<2)
+
+/*
+ * The maximum number of bytes that can be accessed as part of one
+ * operation, including metadata.
+ */
+#define	DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
+#define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
+
+#define	DMU_USERUSED_OBJECT	(-1ULL)
+#define	DMU_GROUPUSED_OBJECT	(-2ULL)
+
+/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define	DMU_BONUS_BLKID		(-1ULL)
+#define	DMU_SPILL_BLKID		(-2ULL)
+/*
+ * Public routines to create, destroy, open, and close objsets.
+ */
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
+void dmu_objset_evict_dbufs(objset_t *os);
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, const char *origin);
+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
+    struct nvlist *errlist);
+int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
+int dmu_objset_snapshot_tmp(const char *, const char *, int);
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+    int flags);
+void dmu_objset_byteswap(void *buf, size_t size);
+int dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+
+typedef struct dmu_buf {
+	uint64_t db_object;		/* object that this buffer is part of */
+	uint64_t db_offset;		/* byte offset in this object */
+	uint64_t db_size;		/* size of buffer in bytes */
+	void *db_data;			/* data in buffer */
+} dmu_buf_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define	DMU_POOL_DIRECTORY_OBJECT	1
+#define	DMU_POOL_CONFIG			"config"
+#define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
+#define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
+#define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
+#define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
+#define	DMU_POOL_ROOT_DATASET		"root_dataset"
+#define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
+#define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
+#define	DMU_POOL_ERRLOG_LAST		"errlog_last"
+#define	DMU_POOL_SPARES			"spares"
+#define	DMU_POOL_DEFLATE		"deflate"
+#define	DMU_POOL_HISTORY		"history"
+#define	DMU_POOL_PROPS			"pool_props"
+#define	DMU_POOL_L2CACHE		"l2cache"
+#define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
+#define	DMU_POOL_DDT			"DDT-%s-%s-%s"
+#define	DMU_POOL_DDT_STATS		"DDT-statistics"
+#define	DMU_POOL_CREATION_VERSION	"creation_version"
+#define	DMU_POOL_SCAN			"scan"
+#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
+#define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
+#define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
+
+/*
+ * Allocate an object from this objset.  The range of object numbers
+ * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
+ *
+ * The transaction must be assigned to a txg.  The newly allocated
+ * object will be "held" in the transaction (ie. you can modify the
+ * newly allocated object in this transaction).
+ *
+ * dmu_object_alloc() chooses an object and returns it in *objectp.
+ *
+ * dmu_object_claim() allocates a specific object number.  If that
+ * number is already allocated, it fails and returns EEXIST.
+ *
+ * Return 0 on success, or ENOSPC or EEXIST as specified above.
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you can not
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out.  It will be updated to be the next
+ * object which is allocated.  Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on a objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+    boolean_t hole, uint64_t txg);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allcated beyond the first.  If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size.  If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+    int ibs, dmu_tx_t *tx);
+
+/*
+ * Set the checksum property on a dnode.  The new checksum algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+    dmu_tx_t *tx);
+
+/*
+ * Set the compress property on a dnode.  The new compression algorithm will
+ * apply to all newly written blocks; existing blocks will not be affected.
+ */
+void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+    dmu_tx_t *tx);
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx);
+
+/*
+ * Decide how to write a block: checksum, compression, number of copies, etc.
+ */
+#define	WP_NOFILL	0x1
+#define	WP_DMU_SYNC	0x2
+#define	WP_SPILL	0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+    struct zio_prop *zp);
+/*
+ * The bonus data is accessed more or less like a regular buffer.
+ * You must dmu_bonus_hold() to get the buffer, which will give you a
+ * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
+ * data.  As with any normal buffer, you must call dmu_buf_read() to
+ * read db_data, dmu_buf_will_dirty() before modifying it, and the
+ * object must be held in an assigned transaction before calling
+ * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
+ * buffer as well.  You must release what you hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
+ */
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
+int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+    void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+
+/*
+ * Obtain the DMU buffer from the specified object which contains the
+ * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
+ * that it will remain in memory.  You must release the hold with
+ * dmu_buf_rele().  You must not access the dmu_buf_t after releasing
+ * what you hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
+ *
+ * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
+ * on the returned buffer before reading or writing the buffer's
+ * db_data.  The comments for those routines describe what particular
+ * operations are valid after calling them.
+ *
+ * The object number must be a valid, allocated object number.
+ */
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
+void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+    uint64_t blkid, void *tag);
+
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
+uint64_t dmu_buf_refcount(dmu_buf_t *db);
+
+/*
+ * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
+ * range of an object.  A pointer to an array of dmu_buf_t*'s is
+ * returned (in *dbpp).
+ *
+ * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
+ * frees the array.  The hold on the array of buffers MUST be released
+ * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
+ * individually with dmu_buf_rele.
+ */
+int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
+
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
+/*
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime.  This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted.  Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
+ *
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility.  All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data.  This allows conversions between the two types
+ * with a simple cast.  Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
+ *
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
+ *
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds).  Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+	/*
+	 * Asynchronous user eviction callback state.
+	 */
+	taskq_ent_t	dbu_tqent;
+
+	/* This instance's eviction function pointer. */
+	dmu_buf_evict_func_t *dbu_evict_func;
+#ifdef ZFS_DEBUG
+	/*
+	 * Pointer to user's dbuf pointer.  NULL for clients that do
+	 * not associate a dbuf with their user data.
+	 *
+	 * The dbuf pointer is cleared upon eviction so as to catch
+	 * use-after-evict bugs in clients.
+	 */
+	dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
+ *
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ *       To allow enforcement of this, dbu must already be zeroed on entry.
+ */
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_t **clear_on_evict_dbufp)
+{
+	ASSERT(dbu->dbu_evict_func == NULL);
+	ASSERT(evict_func != NULL);
+	dbu->dbu_evict_func = evict_func;
+	taskq_init_ent(&dbu->dbu_tqent);
+#ifdef ZFS_DEBUG
+	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
+/*
+ * Attach user data to a dbuf and mark it for normal (when the dbuf's
+ * data is cleared or its reference count goes to zero) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Attach user data to a dbuf and mark it for immediate (its dirty and
+ * reference counts are equal) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
+ *
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
+ */
+void *dmu_buf_replace_user(dmu_buf_t *db,
+    dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
+/*
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
+ */
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
+ */
+void *dmu_buf_get_user(dmu_buf_t *db);
+
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
+/*
+ * Returns the blkptr associated with this dbuf, or NULL if not set.
+ */
+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
+
+/*
+ * Indicate that you are going to modify the buffer's data (db_data).
+ *
+ * The transaction (tx) must be assigned to a txg (ie. you've called
+ * dmu_tx_assign()).  The buffer's object must be held in the tx
+ * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+
+/*
+ * Tells if the given dbuf is freeable.
+ */
+boolean_t dmu_buf_freeable(dmu_buf_t *);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction.  Then you must assign
+ * the transaction to a transaction group.  Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction.  You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned.  You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define	DMU_NEW_OBJECT	(-1ULL)
+#define	DMU_OBJECT_END	(-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+    uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
+
+/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+
+/*
+ * Free up the data blocks for a defined range of a file.  If size is
+ * -1, the range from offset to end-of-file is freed.
+ */
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+	uint64_t size, dmu_tx_t *tx);
+int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
+	uint64_t size);
+int dmu_free_long_object(objset_t *os, uint64_t object);
+
+/*
+ * Convenience functions.
+ *
+ * Canfail routines will return 0 on success, or an errno if there is a
+ * nonrecoverable I/O error.
+ */
+#define	DMU_READ_PREFETCH	0 /* prefetch */
+#define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	void *buf, uint32_t flags);
+void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	const void *buf, dmu_tx_t *tx);
+void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+	dmu_tx_t *tx);
+#ifdef _KERNEL
+#include <linux/blkdev_compat.h>
+int dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio);
+int dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio,
+	dmu_tx_t *tx);
+int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
+int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+	dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+	dmu_tx_t *tx);
+#endif
+struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
+void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
+    dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+    size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied(void);
+void xuio_stat_wbuf_nocopy(void);
+
+extern int zfs_prefetch_disable;
+extern int zfs_max_recordsize;
+
+/*
+ * Asynchronously try to read in the data.
+ */
+void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t len);
+
+typedef struct dmu_object_info {
+	/* All sizes are in bytes unless otherwise indicated. */
+	uint32_t doi_data_block_size;
+	uint32_t doi_metadata_block_size;
+	dmu_object_type_t doi_type;
+	dmu_object_type_t doi_bonus_type;
+	uint64_t doi_bonus_size;
+	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
+	uint8_t doi_checksum;
+	uint8_t doi_compress;
+	uint8_t doi_nblkptr;
+	uint8_t doi_pad[4];
+	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
+	uint64_t doi_max_offset;
+	uint64_t doi_fill_count;		/* number of non-empty blocks */
+} dmu_object_info_t;
+
+typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
+
+typedef struct dmu_object_type_info {
+	dmu_object_byteswap_t	ot_byteswap;
+	boolean_t		ot_metadata;
+	char			*ot_name;
+} dmu_object_type_info_t;
+
+typedef const struct dmu_object_byteswap_info {
+	arc_byteswap_func_t	 ob_func;
+	char			*ob_name;
+} dmu_object_byteswap_info_t;
+
+extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
+
+/*
+ * Get information on a DMU object.
+ *
+ * Return 0 on success or ENOENT if object is not allocated.
+ *
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
+void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
+void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size.  This is specifically optimized for zfs_getattr().
+ */
+void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
+    u_longlong_t *nblk512);
+
+typedef struct dmu_objset_stats {
+	uint64_t dds_num_clones; /* number of clones of this */
+	uint64_t dds_creation_txg;
+	uint64_t dds_guid;
+	dmu_objset_type_t dds_type;
+	uint8_t dds_is_snapshot;
+	uint8_t dds_inconsistent;
+	char dds_origin[MAXNAMELEN];
+} dmu_objset_stats_t;
+
+/*
+ * Get stats on a dataset.
+ */
+void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
+
+/*
+ * Add entries to the nvlist for all the objset's properties.  See
+ * zfs_prop_table[] and zfs(1m) for details on the properties.
+ */
+void dmu_objset_stats(objset_t *os, struct nvlist *nv);
+
+/*
+ * Get the space usage statistics for statvfs().
+ *
+ * refdbytes is the amount of space "referenced" by this objset.
+ * availbytes is the amount of space available to this objset, taking
+ * into account quotas & reservations, assuming that no other objsets
+ * use the space first.  These values correspond to the 'referenced' and
+ * 'available' properties, described in the zfs(1m) manpage.
+ *
+ * usedobjs and availobjs are the number of objects currently allocated,
+ * and available.
+ */
+void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+    uint64_t *usedobjsp, uint64_t *availobjsp);
+
+/*
+ * The fsid_guid is a 56-bit ID that can change to avoid collisions.
+ * (Contrast with the ds_guid which is a 64-bit ID that will never
+ * change, so there is a small probability that it will collide.)
+ */
+uint64_t dmu_objset_fsid_guid(objset_t *os);
+
+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
+int dmu_objset_is_snapshot(objset_t *os);
+
+extern struct spa *dmu_objset_spa(objset_t *os);
+extern struct zilog *dmu_objset_zil(objset_t *os);
+extern struct dsl_pool *dmu_objset_pool(objset_t *os);
+extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
+extern void dmu_objset_name(objset_t *os, char *buf);
+extern dmu_objset_type_t dmu_objset_type(objset_t *os);
+extern uint64_t dmu_objset_id(objset_t *os);
+extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
+extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
+extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
+extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
+extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
+    int maxlen, boolean_t *conflict);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *idp, uint64_t *offp);
+
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+    void *bonus, uint64_t *userp, uint64_t *groupp);
+extern void dmu_objset_register_type(dmu_objset_type_t ost,
+    objset_used_cb_t *cb);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);
+
+/*
+ * Return the txg number for the given assigned transaction.
+ */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+
+/*
+ * Synchronous write.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
+ * Note that while the data covered by this function will be on stable
+ * storage when the write completes this new data does not become a
+ * permanent part of the file until the associated transaction commits.
+ */
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+	struct zilog	*zgd_zilog;
+	struct blkptr	*zgd_bp;
+	dmu_buf_t	*zgd_db;
+	struct rl	*zgd_rl;
+	void		*zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
+
+/*
+ * Find the next hole or data block in file starting at *off
+ * Return found offset in *off. Return ESRCH for end of file.
+ */
+int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
+    uint64_t *off);
+
+/*
+ * Initial setup and final teardown.
+ */
+extern void dmu_init(void);
+extern void dmu_fini(void);
+
+typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
+    uint64_t object, uint64_t offset, int len);
+void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
+    dmu_traverse_cb_t cb, void *arg);
+
+int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct vnode *vp, offset_t *offp);
+
+/* CRC64 table */
+#define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
+extern uint64_t zfs_crc64_table[256];
+
+extern int zfs_mdcomp_disable;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DMU_H */
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
index 25622263e631..f033eace9945 100644
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 4f7e3287f3da..792d0796b133 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -160,8 +160,18 @@ extern int aok;
 
 /*
  * DTrace SDT probes have different signatures in userland than they do in
- * kernel.  If they're being used in kernel code, re-define them out of
+ * the kernel.  If they're being used in kernel code, re-define them out of
  * existence for their counterparts in libzpool.
+ *
+ * Here's an example of how to use the set-error probes in userland:
+ * zfs$target:::set-error /arg0 == EBUSY/ {stack();}
+ *
+ * Here's an example of how to use DTRACE_PROBE probes in userland:
+ * If there is a probe declared as follows:
+ * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
+ * Then you can use it as follows:
+ * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
+ *     {printf("%u %p\n", arg1, arg2);}
  */
 
 #ifdef DTRACE_PROBE
diff --git a/include/sys/zfs_context.h.orig b/include/sys/zfs_context.h.orig
new file mode 100644
index 000000000000..4f7e3287f3da
--- /dev/null
+++ b/include/sys/zfs_context.h.orig
@@ -0,0 +1,753 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFS_CONTEXT_H
+#define	_SYS_ZFS_CONTEXT_H
+
+#ifdef __KERNEL__
+
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+#include <sys/kobj.h>
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/random.h>
+#include <sys/byteorder.h>
+#include <sys/systm.h>
+#include <sys/list.h>
+#include <sys/uio_impl.h>
+#include <sys/dirent.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/zone.h>
+#include <sys/sdt.h>
+#include <sys/zfs_debug.h>
+#include <sys/zfs_delay.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#include <sys/disp.h>
+#include <sys/trace.h>
+#include <linux/dcache_compat.h>
+#include <linux/utsname_compat.h>
+
+#else /* _KERNEL */
+
+#define	_SYS_MUTEX_H
+#define	_SYS_RWLOCK_H
+#define	_SYS_CONDVAR_H
+#define	_SYS_SYSTM_H
+#define	_SYS_T_LOCK_H
+#define	_SYS_VNODE_H
+#define	_SYS_VFS_H
+#define	_SYS_SUNDDI_H
+#define	_SYS_CALLB_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+#include <pthread.h>
+#include <synch.h>
+#include <assert.h>
+#include <alloca.h>
+#include <umem.h>
+#include <limits.h>
+#include <atomic.h>
+#include <dirent.h>
+#include <time.h>
+#include <ctype.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/note.h>
+#include <sys/types.h>
+#include <sys/cred.h>
+#include <sys/sysmacros.h>
+#include <sys/bitmap.h>
+#include <sys/resource.h>
+#include <sys/byteorder.h>
+#include <sys/list.h>
+#include <sys/uio.h>
+#include <sys/zfs_debug.h>
+#include <sys/sdt.h>
+#include <sys/kstat.h>
+#include <sys/u8_textprep.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/sunddi.h>
+#include <sys/debug.h>
+#include <sys/utsname.h>
+
+/*
+ * Stack
+ */
+
+#define	noinline	__attribute__((noinline))
+
+/*
+ * Debugging
+ */
+
+/*
+ * Note that we are not using the debugging levels.
+ */
+
+#define	CE_CONT		0	/* continuation		*/
+#define	CE_NOTE		1	/* notice		*/
+#define	CE_WARN		2	/* warning		*/
+#define	CE_PANIC	3	/* panic		*/
+#define	CE_IGNORE	4	/* print nothing	*/
+
+/*
+ * ZFS debugging
+ */
+
+extern void dprintf_setup(int *argc, char **argv);
+
+extern void cmn_err(int, const char *, ...);
+extern void vcmn_err(int, const char *, va_list);
+extern void panic(const char *, ...);
+extern void vpanic(const char *, va_list);
+
+#define	fm_panic	panic
+
+extern int aok;
+
+/*
+ * DTrace SDT probes have different signatures in userland than they do in
+ * kernel.  If they're being used in kernel code, re-define them out of
+ * existence for their counterparts in libzpool.
+ */
+
+#ifdef DTRACE_PROBE
+#undef	DTRACE_PROBE
+#endif	/* DTRACE_PROBE */
+#define	DTRACE_PROBE(a) \
+	ZFS_PROBE0(#a)
+
+#ifdef DTRACE_PROBE1
+#undef	DTRACE_PROBE1
+#endif	/* DTRACE_PROBE1 */
+#define	DTRACE_PROBE1(a, b, c) \
+	ZFS_PROBE1(#a, (unsigned long)c)
+
+#ifdef DTRACE_PROBE2
+#undef	DTRACE_PROBE2
+#endif	/* DTRACE_PROBE2 */
+#define	DTRACE_PROBE2(a, b, c, d, e) \
+	ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e)
+
+#ifdef DTRACE_PROBE3
+#undef	DTRACE_PROBE3
+#endif	/* DTRACE_PROBE3 */
+#define	DTRACE_PROBE3(a, b, c, d, e, f, g) \
+	ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g)
+
+#ifdef DTRACE_PROBE4
+#undef	DTRACE_PROBE4
+#endif	/* DTRACE_PROBE4 */
+#define	DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \
+	ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \
+	(unsigned long)i)
+
+/*
+ * We use the comma operator so that this macro can be used without much
+ * additional code.  For example, "return (EINVAL);" becomes
+ * "return (SET_ERROR(EINVAL));".  Note that the argument will be evaluated
+ * twice, so it should not have side effects (e.g. something like:
+ * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice).
+ */
+#define	SET_ERROR(err) (ZFS_SET_ERROR(err), err)
+
+/*
+ * Threads.  TS_STACK_MIN is dictated by the minimum allowed pthread stack
+ * size.  While TS_STACK_MAX is somewhat arbitrary, it was selected to be
+ * large enough for the expected stack depth while small enough to avoid
+ * exhausting address space with high thread counts.
+ */
+#define	TS_MAGIC		0x72f158ab4261e538ull
+#define	TS_RUN			0x00000002
+#define	TS_STACK_MIN		PTHREAD_STACK_MIN
+#define	TS_STACK_MAX		(256 * 1024)
+
+/* in libzpool, p0 exists only to have its address taken */
+typedef struct proc {
+	uintptr_t	this_is_never_used_dont_dereference_it;
+} proc_t;
+
+extern struct proc p0;
+#define	curproc		(&p0)
+
+typedef void (*thread_func_t)(void *);
+typedef void (*thread_func_arg_t)(void *);
+typedef pthread_t kt_did_t;
+
+#define	kpreempt(x)	((void)0)
+
+typedef struct kthread {
+	kt_did_t	t_tid;
+	thread_func_t	t_func;
+	void *		t_arg;
+	pri_t		t_pri;
+} kthread_t;
+
+#define	curthread			zk_thread_current()
+#define	getcomm()			"unknown"
+#define	thread_exit			zk_thread_exit
+#define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
+	zk_thread_create(stk, stksize, (thread_func_t)func, arg,	\
+	    len, NULL, state, pri, PTHREAD_CREATE_DETACHED)
+#define	thread_join(t)			zk_thread_join(t)
+#define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+
+extern kthread_t *zk_thread_current(void);
+extern void zk_thread_exit(void);
+extern kthread_t *zk_thread_create(caddr_t stk, size_t  stksize,
+	thread_func_t func, void *arg, size_t len,
+	proc_t *pp, int state, pri_t pri, int detachstate);
+extern void zk_thread_join(kt_did_t tid);
+
+#define	kpreempt_disable()	((void)0)
+#define	kpreempt_enable()	((void)0)
+
+#define	PS_NONE		-1
+
+#define	issig(why)	(FALSE)
+#define	ISSIG(thr, why)	(FALSE)
+
+/*
+ * Mutexes
+ */
+#define	MTX_MAGIC	0x9522f51362a6e326ull
+#define	MTX_INIT	((void *)NULL)
+#define	MTX_DEST	((void *)-1UL)
+
+typedef struct kmutex {
+	void		*m_owner;
+	uint64_t	m_magic;
+	pthread_mutex_t	m_lock;
+} kmutex_t;
+
+#define	MUTEX_DEFAULT	0
+#define	MUTEX_HELD(m)	((m)->m_owner == curthread)
+#define	MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
+
+extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
+extern void mutex_destroy(kmutex_t *mp);
+extern void mutex_enter(kmutex_t *mp);
+extern void mutex_exit(kmutex_t *mp);
+extern int mutex_tryenter(kmutex_t *mp);
+extern void *mutex_owner(kmutex_t *mp);
+extern int mutex_held(kmutex_t *mp);
+
+/*
+ * RW locks
+ */
+#define	RW_MAGIC	0x4d31fb123648e78aull
+#define	RW_INIT		((void *)NULL)
+#define	RW_DEST		((void *)-1UL)
+
+typedef struct krwlock {
+	void			*rw_owner;
+	void			*rw_wr_owner;
+	uint64_t		rw_magic;
+	pthread_rwlock_t	rw_lock;
+	uint_t			rw_readers;
+} krwlock_t;
+
+typedef int krw_t;
+
+#define	RW_READER	0
+#define	RW_WRITER	1
+#define	RW_DEFAULT	RW_READER
+
+#define	RW_READ_HELD(x)		((x)->rw_readers > 0)
+#define	RW_WRITE_HELD(x)	((x)->rw_wr_owner == curthread)
+#define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
+#undef RW_LOCK_HELD
+#define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
+#undef RW_LOCK_HELD
+#define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
+
+extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
+extern void rw_destroy(krwlock_t *rwlp);
+extern void rw_enter(krwlock_t *rwlp, krw_t rw);
+extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
+extern int rw_tryupgrade(krwlock_t *rwlp);
+extern void rw_exit(krwlock_t *rwlp);
+#define	rw_downgrade(rwlp) do { } while (0)
+
+extern uid_t crgetuid(cred_t *cr);
+extern uid_t crgetruid(cred_t *cr);
+extern gid_t crgetgid(cred_t *cr);
+extern int crgetngroups(cred_t *cr);
+extern gid_t *crgetgroups(cred_t *cr);
+
+/*
+ * Condition variables
+ */
+#define	CV_MAGIC	0xd31ea9a83b1b30c4ull
+
+typedef struct kcondvar {
+	uint64_t		cv_magic;
+	pthread_cond_t		cv;
+} kcondvar_t;
+
+#define	CV_DEFAULT	0
+
+extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
+extern void cv_destroy(kcondvar_t *cv);
+extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
+extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag);
+extern void cv_signal(kcondvar_t *cv);
+extern void cv_broadcast(kcondvar_t *cv);
+#define	cv_timedwait_sig(cv, mp, at)		cv_timedwait(cv, mp, at)
+#define	cv_wait_sig(cv, mp)			cv_wait(cv, mp)
+#define	cv_wait_io(cv, mp)			cv_wait(cv, mp)
+
+/*
+ * Thread-specific data
+ */
+#define	tsd_get(k) pthread_getspecific(k)
+#define	tsd_set(k, v) pthread_setspecific(k, v)
+#define	tsd_create(kp, d) pthread_key_create(kp, d)
+#define	tsd_destroy(kp) /* nothing */
+
+/*
+ * Thread-specific data
+ */
+#define	tsd_get(k) pthread_getspecific(k)
+#define	tsd_set(k, v) pthread_setspecific(k, v)
+#define	tsd_create(kp, d) pthread_key_create(kp, d)
+#define	tsd_destroy(kp) /* nothing */
+
+/*
+ * kstat creation, installation and deletion
+ */
+extern kstat_t *kstat_create(const char *, int,
+    const char *, const char *, uchar_t, ulong_t, uchar_t);
+extern void kstat_install(kstat_t *);
+extern void kstat_delete(kstat_t *);
+extern void kstat_waitq_enter(kstat_io_t *);
+extern void kstat_waitq_exit(kstat_io_t *);
+extern void kstat_runq_enter(kstat_io_t *);
+extern void kstat_runq_exit(kstat_io_t *);
+extern void kstat_waitq_to_runq(kstat_io_t *);
+extern void kstat_runq_back_to_waitq(kstat_io_t *);
+extern void kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index));
+
+/*
+ * Kernel memory
+ */
+#define	KM_SLEEP		UMEM_NOFAIL
+#define	KM_PUSHPAGE		KM_SLEEP
+#define	KM_NOSLEEP		UMEM_DEFAULT
+#define	KMC_NODEBUG		UMC_NODEBUG
+#define	KMC_KMEM		0x0
+#define	KMC_VMEM		0x0
+#define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
+#define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
+#define	kmem_free(_b, _s)	umem_free(_b, _s)
+#define	vmem_alloc(_s, _f)	kmem_alloc(_s, _f)
+#define	vmem_zalloc(_s, _f)	kmem_zalloc(_s, _f)
+#define	vmem_free(_b, _s)	kmem_free(_b, _s)
+#define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
+	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
+#define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
+#define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
+#define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
+#define	kmem_debugging()	0
+#define	kmem_cache_reap_now(_c)	umem_cache_reap_now(_c);
+#define	kmem_cache_set_move(_c, _cb)	/* nothing */
+#define	vmem_qcache_reap(_v)		/* nothing */
+#define	POINTER_INVALIDATE(_pp)		/* nothing */
+#define	POINTER_IS_VALID(_p)	0
+
+extern vmem_t *zio_arena;
+
+typedef umem_cache_t kmem_cache_t;
+
+typedef enum kmem_cbrc {
+	KMEM_CBRC_YES,
+	KMEM_CBRC_NO,
+	KMEM_CBRC_LATER,
+	KMEM_CBRC_DONT_NEED,
+	KMEM_CBRC_DONT_KNOW
+} kmem_cbrc_t;
+
+/*
+ * Task queues
+ */
+typedef struct taskq taskq_t;
+typedef uintptr_t taskqid_t;
+typedef void (task_func_t)(void *);
+
+typedef struct taskq_ent {
+	struct taskq_ent	*tqent_next;
+	struct taskq_ent	*tqent_prev;
+	task_func_t		*tqent_func;
+	void			*tqent_arg;
+	uintptr_t		tqent_flags;
+} taskq_ent_t;
+
+#define	TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
+
+#define	TASKQ_PREPOPULATE	0x0001
+#define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
+#define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
+#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
+#define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
+
+#define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
+#define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
+#define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
+#define	TQ_FRONT	0x08		/* Queue in front */
+
+extern taskq_t *system_taskq;
+
+extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define	taskq_create_proc(a, b, c, d, e, p, f) \
+	    (taskq_create(a, b, c, d, e, f))
+#define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
+	    (taskq_create(a, b, maxclsyspri, d, e, f))
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
+    clock_t);
+extern void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+extern int	taskq_empty_ent(taskq_ent_t *);
+extern void	taskq_init_ent(taskq_ent_t *);
+extern void	taskq_destroy(taskq_t *);
+extern void	taskq_wait(taskq_t *);
+extern void	taskq_wait_id(taskq_t *, taskqid_t);
+extern void	taskq_wait_outstanding(taskq_t *, taskqid_t);
+extern int	taskq_member(taskq_t *, kthread_t *);
+extern int	taskq_cancel_id(taskq_t *, taskqid_t);
+extern void	system_taskq_init(void);
+extern void	system_taskq_fini(void);
+
+#define	XVA_MAPSIZE	3
+#define	XVA_MAGIC	0x78766174
+
+/*
+ * vnodes
+ */
+typedef struct vnode {
+	uint64_t	v_size;
+	int		v_fd;
+	char		*v_path;
+} vnode_t;
+
+#define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
+
+typedef struct xoptattr {
+	timestruc_t	xoa_createtime;	/* Create time of file */
+	uint8_t		xoa_archive;
+	uint8_t		xoa_system;
+	uint8_t		xoa_readonly;
+	uint8_t		xoa_hidden;
+	uint8_t		xoa_nounlink;
+	uint8_t		xoa_immutable;
+	uint8_t		xoa_appendonly;
+	uint8_t		xoa_nodump;
+	uint8_t		xoa_settable;
+	uint8_t		xoa_opaque;
+	uint8_t		xoa_av_quarantined;
+	uint8_t		xoa_av_modified;
+	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+	uint8_t		xoa_reparse;
+	uint8_t		xoa_offline;
+	uint8_t		xoa_sparse;
+} xoptattr_t;
+
+typedef struct vattr {
+	uint_t		va_mask;	/* bit-mask of attributes */
+	u_offset_t	va_size;	/* file size in bytes */
+} vattr_t;
+
+
+typedef struct xvattr {
+	vattr_t		xva_vattr;	/* Embedded vattr structure */
+	uint32_t	xva_magic;	/* Magic Number */
+	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
+	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
+	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
+	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
+	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
+} xvattr_t;
+
+typedef struct vsecattr {
+	uint_t		vsa_mask;	/* See below */
+	int		vsa_aclcnt;	/* ACL entry count */
+	void		*vsa_aclentp;	/* pointer to ACL entries */
+	int		vsa_dfaclcnt;	/* default ACL entry count */
+	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
+	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
+} vsecattr_t;
+
+#define	AT_TYPE		0x00001
+#define	AT_MODE		0x00002
+#define	AT_UID		0x00004
+#define	AT_GID		0x00008
+#define	AT_FSID		0x00010
+#define	AT_NODEID	0x00020
+#define	AT_NLINK	0x00040
+#define	AT_SIZE		0x00080
+#define	AT_ATIME	0x00100
+#define	AT_MTIME	0x00200
+#define	AT_CTIME	0x00400
+#define	AT_RDEV		0x00800
+#define	AT_BLKSIZE	0x01000
+#define	AT_NBLOCKS	0x02000
+#define	AT_SEQ		0x08000
+#define	AT_XVATTR	0x10000
+
+#define	CRCREAT		0
+
+extern int fop_getattr(vnode_t *vp, vattr_t *vap);
+
+#define	VOP_CLOSE(vp, f, c, o, cr, ct)	vn_close(vp)
+#define	VOP_PUTPAGE(vp, of, sz, fl, cr, ct)	0
+#define	VOP_GETATTR(vp, vap, fl, cr, ct)  fop_getattr((vp), (vap));
+
+#define	VOP_FSYNC(vp, f, cr, ct)	fsync((vp)->v_fd)
+
+#define	VN_RELE(vp)	vn_close(vp)
+
+extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
+    int x2, int x3);
+extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp,
+    int x2, int x3, vnode_t *vp, int fd);
+extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len,
+    offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp);
+extern void vn_close(vnode_t *vp);
+
+#define	vn_remove(path, x1, x2)		remove(path)
+#define	vn_rename(from, to, seg)	rename((from), (to))
+#define	vn_is_readonly(vp)		B_FALSE
+
+extern vnode_t *rootdir;
+
+#include <sys/file.h>		/* for FREAD, FWRITE, etc */
+
+/*
+ * Random stuff
+ */
+#define	ddi_get_lbolt()		(gethrtime() >> 23)
+#define	ddi_get_lbolt64()	(gethrtime() >> 23)
+#define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
+
+#define	ddi_time_before(a, b)		(a < b)
+#define	ddi_time_after(a, b)		ddi_time_before(b, a)
+#define	ddi_time_before_eq(a, b)	(!ddi_time_after(a, b))
+#define	ddi_time_after_eq(a, b)		ddi_time_before_eq(b, a)
+
+#define	ddi_time_before64(a, b)		(a < b)
+#define	ddi_time_after64(a, b)		ddi_time_before64(b, a)
+#define	ddi_time_before_eq64(a, b)	(!ddi_time_after64(a, b))
+#define	ddi_time_after_eq64(a, b)	ddi_time_before_eq64(b, a)
+
+extern void delay(clock_t ticks);
+
+#define	SEC_TO_TICK(sec)	((sec) * hz)
+#define	MSEC_TO_TICK(msec)	((msec) / (MILLISEC / hz))
+#define	USEC_TO_TICK(usec)	((usec) / (MICROSEC / hz))
+#define	NSEC_TO_TICK(usec)	((usec) / (NANOSEC / hz))
+
+#define	gethrestime_sec() time(NULL)
+#define	gethrestime(t) \
+	do {\
+		(t)->tv_sec = gethrestime_sec();\
+		(t)->tv_nsec = 0;\
+	} while (0);
+
+#define	max_ncpus	64
+#define	boot_ncpus	(sysconf(_SC_NPROCESSORS_ONLN))
+
+/*
+ * Process priorities as defined by setpriority(2) and getpriority(2).
+ */
+#define	minclsyspri	19
+#define	maxclsyspri	-20
+#define	defclsyspri	0
+
+#define	CPU_SEQID	(pthread_self() & (max_ncpus - 1))
+
+#define	kcred		NULL
+#define	CRED()		NULL
+
+#define	ptob(x)		((x) * PAGESIZE)
+
+extern uint64_t physmem;
+
+extern int highbit64(uint64_t i);
+extern int random_get_bytes(uint8_t *ptr, size_t len);
+extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
+
+extern void kernel_init(int);
+extern void kernel_fini(void);
+
+struct spa;
+extern void nicenum(uint64_t num, char *buf);
+extern void show_pool_stats(struct spa *);
+
+typedef struct callb_cpr {
+	kmutex_t	*cc_lockp;
+} callb_cpr_t;
+
+#define	CALLB_CPR_INIT(cp, lockp, func, name)	{		\
+	(cp)->cc_lockp = lockp;					\
+}
+
+#define	CALLB_CPR_SAFE_BEGIN(cp) {				\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+}
+
+#define	CALLB_CPR_SAFE_END(cp, lockp) {				\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+}
+
+#define	CALLB_CPR_EXIT(cp) {					\
+	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
+	mutex_exit((cp)->cc_lockp);				\
+}
+
+#define	zone_dataset_visible(x, y)	(1)
+#define	INGLOBALZONE(z)			(1)
+
+extern char *kmem_vasprintf(const char *fmt, va_list adx);
+extern char *kmem_asprintf(const char *fmt, ...);
+#define	strfree(str) kmem_free((str), strlen(str) + 1)
+
+/*
+ * Hostname information
+ */
+extern char hw_serial[];	/* for userland-emulated hostid access */
+extern int ddi_strtoul(const char *str, char **nptr, int base,
+    unsigned long *result);
+
+extern int ddi_strtoull(const char *str, char **nptr, int base,
+    u_longlong_t *result);
+
+typedef struct utsname	utsname_t;
+extern utsname_t *utsname(void);
+
+/* ZFS Boot Related stuff. */
+
+struct _buf {
+	intptr_t	_fd;
+};
+
+struct bootstat {
+	uint64_t st_size;
+};
+
+typedef struct ace_object {
+	uid_t		a_who;
+	uint32_t	a_access_mask;
+	uint16_t	a_flags;
+	uint16_t	a_type;
+	uint8_t		a_obj_type[16];
+	uint8_t		a_inherit_obj_type[16];
+} ace_object_t;
+
+
+#define	ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
+#define	ACE_ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
+#define	ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
+#define	ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
+
+extern struct _buf *kobj_open_file(char *name);
+extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
+    unsigned off);
+extern void kobj_close_file(struct _buf *file);
+extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
+extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
+extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
+    cred_t *cr);
+extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
+extern zoneid_t getzoneid(void);
+
+/* SID stuff */
+typedef struct ksiddomain {
+	uint_t	kd_ref;
+	uint_t	kd_len;
+	char	*kd_name;
+} ksiddomain_t;
+
+ksiddomain_t *ksid_lookupdomain(const char *);
+void ksiddomain_rele(ksiddomain_t *);
+
+#define	DDI_SLEEP	KM_SLEEP
+#define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
+	sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
+
+#define	zfs_sleep_until(wakeup)						\
+	do {								\
+		hrtime_t delta = wakeup - gethrtime();			\
+		struct timespec ts;					\
+		ts.tv_sec = delta / NANOSEC;				\
+		ts.tv_nsec = delta % NANOSEC;				\
+		(void) nanosleep(&ts, NULL);				\
+	} while (0)
+
+typedef int fstrans_cookie_t;
+
+extern fstrans_cookie_t spl_fstrans_mark(void);
+extern void spl_fstrans_unmark(fstrans_cookie_t);
+extern int spl_fstrans_check(void);
+
+#endif /* _KERNEL */
+#endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 278b6e0868a9..432a412638b3 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -29,6 +29,7 @@
 #ifndef _ZIO_H
 #define	_ZIO_H
 
+#include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
@@ -147,17 +148,6 @@ enum zio_compress {
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
-typedef enum zio_priority {
-	ZIO_PRIORITY_SYNC_READ,
-	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
-	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
-	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
-	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
-	ZIO_PRIORITY_NUM_QUEUEABLE,
-
-	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
-} zio_priority_t;
-
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
@@ -262,6 +252,7 @@ extern const char *zio_type_name[ZIO_TYPES];
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
@@ -294,6 +285,9 @@ struct zbookmark_phys {
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
+#define	ZB_DNODE_LEVEL		(-3LL)
+#define	ZB_DNODE_BLKID		(0ULL)
+
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
@@ -598,8 +592,10 @@ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
-    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+    uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
diff --git a/include/sys/zio.h.orig b/include/sys/zio.h.orig
new file mode 100644
index 000000000000..278b6e0868a9
--- /dev/null
+++ b/include/sys/zio.h.orig
@@ -0,0 +1,608 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
+
+#ifndef _ZIO_H
+#define	_ZIO_H
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Embedded checksum
+ */
+#define	ZEC_MAGIC	0x210da7ab10c7a11ULL
+
+typedef struct zio_eck {
+	uint64_t	zec_magic;	/* for validation, endianness	*/
+	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
+} zio_eck_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
+#define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
+	sizeof (zio_eck_t)) / sizeof (blkptr_t))
+#define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
+	sizeof (zio_eck_t) - \
+	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+	sizeof (uint64_t))
+
+typedef struct zio_gbh {
+	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
+	uint64_t		zg_filler[SPA_GBH_FILLER];
+	zio_eck_t		zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+	ZIO_CHECKSUM_INHERIT = 0,
+	ZIO_CHECKSUM_ON,
+	ZIO_CHECKSUM_OFF,
+	ZIO_CHECKSUM_LABEL,
+	ZIO_CHECKSUM_GANG_HEADER,
+	ZIO_CHECKSUM_ZILOG,
+	ZIO_CHECKSUM_FLETCHER_2,
+	ZIO_CHECKSUM_FLETCHER_4,
+	ZIO_CHECKSUM_SHA256,
+	ZIO_CHECKSUM_ZILOG2,
+	ZIO_CHECKSUM_FUNCTIONS
+};
+
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
+#define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
+#define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
+
+#define	ZIO_CHECKSUM_MASK	0xffULL
+#define	ZIO_CHECKSUM_VERIFY	(1 << 8)
+
+#define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
+#define	ZIO_DEDUPDITTO_MIN	100
+
+enum zio_compress {
+	ZIO_COMPRESS_INHERIT = 0,
+	ZIO_COMPRESS_ON,
+	ZIO_COMPRESS_OFF,
+	ZIO_COMPRESS_LZJB,
+	ZIO_COMPRESS_EMPTY,
+	ZIO_COMPRESS_GZIP_1,
+	ZIO_COMPRESS_GZIP_2,
+	ZIO_COMPRESS_GZIP_3,
+	ZIO_COMPRESS_GZIP_4,
+	ZIO_COMPRESS_GZIP_5,
+	ZIO_COMPRESS_GZIP_6,
+	ZIO_COMPRESS_GZIP_7,
+	ZIO_COMPRESS_GZIP_8,
+	ZIO_COMPRESS_GZIP_9,
+	ZIO_COMPRESS_ZLE,
+	ZIO_COMPRESS_LZ4,
+	ZIO_COMPRESS_FUNCTIONS
+};
+
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
+/*
+ * The meaning of "compress = on" selected by the compression features enabled
+ * on a given pool.
+ */
+#define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
+#define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
+
+#define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_OFF
+
+#define	BOOTFS_COMPRESS_VALID(compress)			\
+	((compress) == ZIO_COMPRESS_LZJB ||		\
+	(compress) == ZIO_COMPRESS_LZ4 ||		\
+	(compress) == ZIO_COMPRESS_ON ||		\
+	(compress) == ZIO_COMPRESS_OFF)
+
+/*
+ * Default Linux timeout for a sd device.
+ */
+#define	ZIO_DELAY_MAX			(30 * MILLISEC)
+
+#define	ZIO_FAILURE_MODE_WAIT		0
+#define	ZIO_FAILURE_MODE_CONTINUE	1
+#define	ZIO_FAILURE_MODE_PANIC		2
+
+typedef enum zio_priority {
+	ZIO_PRIORITY_SYNC_READ,
+	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
+	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
+	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
+	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
+	ZIO_PRIORITY_NUM_QUEUEABLE,
+
+	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+enum zio_flag {
+	/*
+	 * Flags inherited by gang, ddt, and vdev children,
+	 * and that must be equal for two zios to aggregate
+	 */
+	ZIO_FLAG_DONT_AGGREGATE	= 1 << 0,
+	ZIO_FLAG_IO_REPAIR	= 1 << 1,
+	ZIO_FLAG_SELF_HEAL	= 1 << 2,
+	ZIO_FLAG_RESILVER	= 1 << 3,
+	ZIO_FLAG_SCRUB		= 1 << 4,
+	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
+	ZIO_FLAG_PHYSICAL	= 1 << 6,
+
+#define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
+
+	/*
+	 * Flags inherited by ddt, gang, and vdev children.
+	 */
+	ZIO_FLAG_CANFAIL	= 1 << 7,	/* must be first for INHERIT */
+	ZIO_FLAG_SPECULATIVE	= 1 << 8,
+	ZIO_FLAG_CONFIG_WRITER	= 1 << 9,
+	ZIO_FLAG_DONT_RETRY	= 1 << 10,
+	ZIO_FLAG_DONT_CACHE	= 1 << 11,
+	ZIO_FLAG_NODATA		= 1 << 12,
+	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
+
+#define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
+#define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
+
+	/*
+	 * Flags inherited by vdev children.
+	 */
+	ZIO_FLAG_IO_RETRY	= 1 << 14,	/* must be first for INHERIT */
+	ZIO_FLAG_PROBE		= 1 << 15,
+	ZIO_FLAG_TRYHARD	= 1 << 16,
+	ZIO_FLAG_OPTIONAL	= 1 << 17,
+
+#define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
+
+	/*
+	 * Flags not inherited by any children.
+	 */
+	ZIO_FLAG_DONT_QUEUE	= 1 << 18,	/* must be first for INHERIT */
+	ZIO_FLAG_DONT_PROPAGATE	= 1 << 19,
+	ZIO_FLAG_IO_BYPASS	= 1 << 20,
+	ZIO_FLAG_IO_REWRITE	= 1 << 21,
+	ZIO_FLAG_RAW		= 1 << 22,
+	ZIO_FLAG_GANG_CHILD	= 1 << 23,
+	ZIO_FLAG_DDT_CHILD	= 1 << 24,
+	ZIO_FLAG_GODFATHER	= 1 << 25,
+	ZIO_FLAG_NOPWRITE	= 1 << 26,
+	ZIO_FLAG_REEXECUTED	= 1 << 27,
+	ZIO_FLAG_DELEGATED	= 1 << 28,
+	ZIO_FLAG_FASTWRITE	= 1 << 29,
+};
+
+#define	ZIO_FLAG_MUSTSUCCEED		0
+
+#define	ZIO_DDT_CHILD_FLAGS(zio)				\
+	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
+	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
+#define	ZIO_GANG_CHILD_FLAGS(zio)				\
+	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
+	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+
+#define	ZIO_VDEV_CHILD_FLAGS(zio)				\
+	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
+	ZIO_FLAG_CANFAIL)
+
+enum zio_child {
+	ZIO_CHILD_VDEV = 0,
+	ZIO_CHILD_GANG,
+	ZIO_CHILD_DDT,
+	ZIO_CHILD_LOGICAL,
+	ZIO_CHILD_TYPES
+};
+
+enum zio_wait_type {
+	ZIO_WAIT_READY = 0,
+	ZIO_WAIT_DONE,
+	ZIO_WAIT_TYPES
+};
+
+/*
+ * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent
+ * graveyard) to indicate checksum errors and fragmentation.
+ */
+#define	ECKSUM	EBADE
+#define	EFRAGS	EBADR
+
+typedef void zio_done_func_t(zio_t *zio);
+
+extern const char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool.  By convention, the meta-objset (MOS)
+ * is objset 0, and the meta-dnode is object 0.  This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
+ *
+ * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ *
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
+ *
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
+ */
+struct zbookmark_phys {
+	uint64_t	zb_objset;
+	uint64_t	zb_object;
+	int64_t		zb_level;
+	uint64_t	zb_blkid;
+};
+
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
+{                                                       \
+	(zb)->zb_objset = objset;                       \
+	(zb)->zb_object = object;                       \
+	(zb)->zb_level = level;                         \
+	(zb)->zb_blkid = blkid;                         \
+}
+
+#define	ZB_DESTROYED_OBJSET	(-1ULL)
+
+#define	ZB_ROOT_OBJECT		(0ULL)
+#define	ZB_ROOT_LEVEL		(-1LL)
+#define	ZB_ROOT_BLKID		(0ULL)
+
+#define	ZB_ZIL_OBJECT		(0ULL)
+#define	ZB_ZIL_LEVEL		(-2LL)
+
+#define	ZB_IS_ZERO(zb)						\
+	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
+	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
+#define	ZB_IS_ROOT(zb)				\
+	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
+	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
+	(zb)->zb_blkid == ZB_ROOT_BLKID)
+
+typedef struct zio_prop {
+	enum zio_checksum	zp_checksum;
+	enum zio_compress	zp_compress;
+	dmu_object_type_t	zp_type;
+	uint8_t			zp_level;
+	uint8_t			zp_copies;
+	boolean_t		zp_dedup;
+	boolean_t		zp_dedup_verify;
+	boolean_t		zp_nopwrite;
+} zio_prop_t;
+
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+    const void *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum;				/* defined in zio_checksum.h */
+struct dnode_phys;
+
+struct zio_cksum_report {
+	struct zio_cksum_report *zcr_next;
+	nvlist_t		*zcr_ereport;
+	nvlist_t		*zcr_detector;
+	void			*zcr_cbdata;
+	size_t			zcr_cbinfo;	/* passed to zcr_free() */
+	uint64_t		zcr_align;
+	uint64_t		zcr_length;
+	zio_cksum_finish_f	*zcr_finish;
+	zio_cksum_free_f	*zcr_free;
+
+	/* internal use only */
+	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+    void *arg);
+
+zio_vsd_cksum_report_f	zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+	zio_done_func_t		*vsd_free;
+	zio_vsd_cksum_report_f	*vsd_cksum_report;
+} zio_vsd_ops_t;
+
+typedef struct zio_gang_node {
+	zio_gbh_phys_t		*gn_gbh;
+	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
+} zio_gang_node_t;
+
+typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
+    zio_gang_node_t *gn, void *data);
+
+typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);
+
+typedef struct zio_transform {
+	void			*zt_orig_data;
+	uint64_t		zt_orig_size;
+	uint64_t		zt_bufsize;
+	zio_transform_func_t	*zt_transform;
+	struct zio_transform	*zt_next;
+} zio_transform_t;
+
+typedef int zio_pipe_stage_t(zio_t *zio);
+
+/*
+ * The io_reexecute flags are distinct from io_flags because the child must
+ * be able to propagate them to the parent.  The normal io_flags are local
+ * to the zio, not protected by any lock, and not modifiable by children;
+ * the reexecute flags are protected by io_lock, modifiable by children,
+ * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
+ */
+#define	ZIO_REEXECUTE_NOW	0x01
+#define	ZIO_REEXECUTE_SUSPEND	0x02
+
+typedef struct zio_link {
+	zio_t		*zl_parent;
+	zio_t		*zl_child;
+	list_node_t	zl_parent_node;
+	list_node_t	zl_child_node;
+} zio_link_t;
+
+struct zio {
+	/* Core information about this I/O */
+	zbookmark_phys_t	io_bookmark;
+	zio_prop_t	io_prop;
+	zio_type_t	io_type;
+	enum zio_child	io_child_type;
+	int		io_cmd;
+	zio_priority_t	io_priority;
+	uint8_t		io_reexecute;
+	uint8_t		io_state[ZIO_WAIT_TYPES];
+	uint64_t	io_txg;
+	spa_t		*io_spa;
+	blkptr_t	*io_bp;
+	blkptr_t	*io_bp_override;
+	blkptr_t	io_bp_copy;
+	list_t		io_parent_list;
+	list_t		io_child_list;
+	zio_link_t	*io_walk_link;
+	zio_t		*io_logical;
+	zio_transform_t *io_transform_stack;
+
+	/* Callback info */
+	zio_done_func_t *io_ready;
+	zio_done_func_t	*io_physdone;
+	zio_done_func_t	*io_done;
+	void		*io_private;
+	int64_t		io_prev_space_delta;	/* DMU private */
+	blkptr_t	io_bp_orig;
+
+	/* Data represented by this I/O */
+	void		*io_data;
+	void		*io_orig_data;
+	uint64_t	io_size;
+	uint64_t	io_orig_size;
+
+	/* Stuff for the vdev stack */
+	vdev_t		*io_vd;
+	void		*io_vsd;
+	const zio_vsd_ops_t *io_vsd_ops;
+
+	uint64_t	io_offset;
+	hrtime_t	io_timestamp;	/* submitted at */
+	hrtime_t	io_delta;	/* vdev queue service delta */
+	uint64_t	io_delay;	/* vdev disk service delta (ticks) */
+	avl_node_t	io_queue_node;
+	avl_node_t	io_offset_node;
+
+	/* Internal pipeline state */
+	enum zio_flag	io_flags;
+	enum zio_stage	io_stage;
+	enum zio_stage	io_pipeline;
+	enum zio_flag	io_orig_flags;
+	enum zio_stage	io_orig_stage;
+	enum zio_stage	io_orig_pipeline;
+	int		io_error;
+	int		io_child_error[ZIO_CHILD_TYPES];
+	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+	uint64_t	io_child_count;
+	uint64_t	io_phys_children;
+	uint64_t	io_parent_count;
+	uint64_t	*io_stall;
+	zio_t		*io_gang_leader;
+	zio_gang_node_t	*io_gang_tree;
+	void		*io_executor;
+	void		*io_waiter;
+	kmutex_t	io_lock;
+	kcondvar_t	io_cv;
+
+	/* FMA state */
+	zio_cksum_report_t *io_cksum_report;
+	uint64_t	io_ena;
+
+	/* Taskq dispatching state */
+	taskq_ent_t	io_tqent;
+};
+
+extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
+
+extern zio_t *zio_root(spa_t *spa,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
+
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
+
+extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    void *data, uint64_t size, const zio_prop_t *zp,
+    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
+
+extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
+
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
+    boolean_t nopwrite);
+
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
+
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
+
+extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+    zio_done_func_t *done, void *private, enum zio_flag flags);
+
+extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *data, int checksum,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
+
+extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *data, int checksum,
+    zio_done_func_t *done, void *private, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
+
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+    const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+    uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
+extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
+
+extern int zio_wait(zio_t *zio);
+extern void zio_nowait(zio_t *zio);
+extern void zio_execute(zio_t *zio);
+extern void zio_interrupt(zio_t *zio);
+
+extern zio_t *zio_walk_parents(zio_t *cio);
+extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_unique_parent(zio_t *cio);
+extern void zio_add_child(zio_t *pio, zio_t *cio);
+
+extern void *zio_buf_alloc(size_t size);
+extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
+
+extern void zio_resubmit_stage_async(void *);
+
+extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
+    uint64_t offset, void *data, uint64_t size, int type,
+    zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *private);
+
+extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
+    void *data, uint64_t size, int type, zio_priority_t priority,
+    enum zio_flag flags, zio_done_func_t *done, void *private);
+
+extern void zio_vdev_io_bypass(zio_t *zio);
+extern void zio_vdev_io_reissue(zio_t *zio);
+extern void zio_vdev_io_redone(zio_t *zio);
+
+extern void zio_checksum_verified(zio_t *zio);
+extern int zio_worst_error(int e1, int e2);
+
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+    enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+    enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+    enum zio_compress child, enum zio_compress parent);
+
+extern void zio_suspend(spa_t *spa, zio_t *zio);
+extern int zio_resume(spa_t *spa);
+extern void zio_resume_wait(spa_t *spa);
+
+/*
+ * Initial setup and teardown.
+ */
+extern void zio_init(void);
+extern void zio_fini(void);
+
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+    struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+    struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
+extern int zio_handle_label_injection(zio_t *zio, int error);
+extern void zio_handle_ignored_writes(zio_t *zio);
+extern uint64_t zio_handle_io_delay(zio_t *zio);
+
+/*
+ * Checksum ereport functions
+ */
+extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio,
+    uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info);
+extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
+    const void *good_data, const void *bad_data, boolean_t drop_if_identical);
+
+extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report);
+extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
+
+/* If we have the good data in hand, this function can be used */
+extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
+    struct zio *zio, uint64_t offset, uint64_t length,
+    const void *good_data, const void *bad_data, struct zio_bad_cksum *info);
+
+/* Called from spa_sync(), but primarily an injection handler */
+extern void spa_handle_ignored_writes(spa_t *spa);
+
+/* zbookmark_phys functions */
+boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
+    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZIO_H */
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 56b83b559377..9fcfd521f4ad 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -44,7 +44,7 @@ typedef const struct zio_checksum_info {
 	zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
 	int		ci_correctable;	/* number of correctable bits	*/
 	int		ci_eck;		/* uses zio embedded checksum? */
-	int		ci_dedup;	/* strong enough for dedup? */
+	boolean_t	ci_dedup;	/* strong enough for dedup? */
 	char		*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h
new file mode 100644
index 000000000000..e33b9585b1c0
--- /dev/null
+++ b/include/sys/zio_priority.h
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef	_ZIO_PRIORITY_H
+#define	_ZIO_PRIORITY_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+	ZIO_PRIORITY_SYNC_READ,
+	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
+	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
+	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
+	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
+	ZIO_PRIORITY_NUM_QUEUEABLE,
+
+	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZIO_PRIORITY_H */
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index a730a94e081a..60213a6386dd 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -3529,7 +3529,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
 }
 
 static int
-zbookmark_compare(const void *a, const void *b)
+zbookmark_mem_compare(const void *a, const void *b)
 {
 	return (memcmp(a, b, sizeof (zbookmark_phys_t)));
 }
@@ -3592,7 +3592,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
 	    zc.zc_nvlist_dst_size;
 	count -= zc.zc_nvlist_dst_size;
 
-	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare);
+	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
 
 	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
 
diff --git a/lib/libzfs/libzfs_pool.c.orig b/lib/libzfs/libzfs_pool.c.orig
new file mode 100644
index 000000000000..a730a94e081a
--- /dev/null
+++ b/lib/libzfs/libzfs_pool.c.orig
@@ -0,0 +1,4261 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <devid.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <zone.h>
+#include <sys/stat.h>
+#include <sys/efi_partition.h>
+#include <sys/vtoc.h>
+#include <sys/zfs_ioctl.h>
+#include <dlfcn.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "libzfs_impl.h"
+#include "zfs_comutil.h"
+#include "zfeature_common.h"
+
+static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
+
+typedef struct prop_flags {
+	int create:1;	/* Validate property on creation */
+	int import:1;	/* Validate property on import */
+} prop_flags_t;
+
+/*
+ * ====================================================================
+ *   zpool property functions
+ * ====================================================================
+ */
+
+static int
+zpool_get_all_props(zpool_handle_t *zhp)
+{
+	zfs_cmd_t zc = {"\0"};
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
+		return (-1);
+
+	while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
+		if (errno == ENOMEM) {
+			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+				zcmd_free_nvlists(&zc);
+				return (-1);
+			}
+		} else {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+
+	if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) {
+		zcmd_free_nvlists(&zc);
+		return (-1);
+	}
+
+	zcmd_free_nvlists(&zc);
+
+	return (0);
+}
+
+static int
+zpool_props_refresh(zpool_handle_t *zhp)
+{
+	nvlist_t *old_props;
+
+	old_props = zhp->zpool_props;
+
+	if (zpool_get_all_props(zhp) != 0)
+		return (-1);
+
+	nvlist_free(old_props);
+	return (0);
+}
+
+static char *
+zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
+    zprop_source_t *src)
+{
+	nvlist_t *nv, *nvl;
+	uint64_t ival;
+	char *value;
+	zprop_source_t source;
+
+	nvl = zhp->zpool_props;
+	if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
+		verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0);
+		source = ival;
+		verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
+	} else {
+		source = ZPROP_SRC_DEFAULT;
+		if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
+			value = "-";
+	}
+
+	if (src)
+		*src = source;
+
+	return (value);
+}
+
+uint64_t
+zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src)
+{
+	nvlist_t *nv, *nvl;
+	uint64_t value;
+	zprop_source_t source;
+
+	if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
+		/*
+		 * zpool_get_all_props() has most likely failed because
+		 * the pool is faulted, but if all we need is the top level
+		 * vdev's guid then get it from the zhp config nvlist.
+		 */
+		if ((prop == ZPOOL_PROP_GUID) &&
+		    (nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
+		    (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
+		    == 0)) {
+			return (value);
+		}
+		return (zpool_prop_default_numeric(prop));
+	}
+
+	nvl = zhp->zpool_props;
+	if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
+		verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0);
+		source = value;
+		verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
+	} else {
+		source = ZPROP_SRC_DEFAULT;
+		value = zpool_prop_default_numeric(prop);
+	}
+
+	if (src)
+		*src = source;
+
+	return (value);
+}
+
+/*
+ * Map VDEV STATE to printed strings.
+ */
+char *
+zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
+{
+	switch (state) {
+	default:
+		break;
+	case VDEV_STATE_CLOSED:
+	case VDEV_STATE_OFFLINE:
+		return (gettext("OFFLINE"));
+	case VDEV_STATE_REMOVED:
+		return (gettext("REMOVED"));
+	case VDEV_STATE_CANT_OPEN:
+		if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
+			return (gettext("FAULTED"));
+		else if (aux == VDEV_AUX_SPLIT_POOL)
+			return (gettext("SPLIT"));
+		else
+			return (gettext("UNAVAIL"));
+	case VDEV_STATE_FAULTED:
+		return (gettext("FAULTED"));
+	case VDEV_STATE_DEGRADED:
+		return (gettext("DEGRADED"));
+	case VDEV_STATE_HEALTHY:
+		return (gettext("ONLINE"));
+	}
+
+	return (gettext("UNKNOWN"));
+}
+
+/*
+ * Map POOL STATE to printed strings.
+ */
+const char *
+zpool_pool_state_to_name(pool_state_t state)
+{
+	switch (state) {
+	default:
+		break;
+	case POOL_STATE_ACTIVE:
+		return (gettext("ACTIVE"));
+	case POOL_STATE_EXPORTED:
+		return (gettext("EXPORTED"));
+	case POOL_STATE_DESTROYED:
+		return (gettext("DESTROYED"));
+	case POOL_STATE_SPARE:
+		return (gettext("SPARE"));
+	case POOL_STATE_L2CACHE:
+		return (gettext("L2CACHE"));
+	case POOL_STATE_UNINITIALIZED:
+		return (gettext("UNINITIALIZED"));
+	case POOL_STATE_UNAVAIL:
+		return (gettext("UNAVAIL"));
+	case POOL_STATE_POTENTIALLY_ACTIVE:
+		return (gettext("POTENTIALLY_ACTIVE"));
+	}
+
+	return (gettext("UNKNOWN"));
+}
+
+/*
+ * API compatibility wrapper around zpool_get_prop_literal
+ */
+int
+zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
+    zprop_source_t *srctype)
+{
+	return (zpool_get_prop_literal(zhp, prop, buf, len, srctype, B_FALSE));
+}
+
+/*
+ * Get a zpool property value for 'prop' and return the value in
+ * a pre-allocated buffer.
+ */
+int
+zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
+    size_t len, zprop_source_t *srctype, boolean_t literal)
+{
+	uint64_t intval;
+	const char *strval;
+	zprop_source_t src = ZPROP_SRC_NONE;
+	nvlist_t *nvroot;
+	vdev_stat_t *vs;
+	uint_t vsc;
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		switch (prop) {
+		case ZPOOL_PROP_NAME:
+			(void) strlcpy(buf, zpool_get_name(zhp), len);
+			break;
+
+		case ZPOOL_PROP_HEALTH:
+			(void) strlcpy(buf, "FAULTED", len);
+			break;
+
+		case ZPOOL_PROP_GUID:
+			intval = zpool_get_prop_int(zhp, prop, &src);
+			(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
+			break;
+
+		case ZPOOL_PROP_ALTROOT:
+		case ZPOOL_PROP_CACHEFILE:
+		case ZPOOL_PROP_COMMENT:
+			if (zhp->zpool_props != NULL ||
+			    zpool_get_all_props(zhp) == 0) {
+				(void) strlcpy(buf,
+				    zpool_get_prop_string(zhp, prop, &src),
+				    len);
+				if (srctype != NULL)
+					*srctype = src;
+				return (0);
+			}
+			/* FALLTHROUGH */
+		default:
+			(void) strlcpy(buf, "-", len);
+			break;
+		}
+
+		if (srctype != NULL)
+			*srctype = src;
+		return (0);
+	}
+
+	if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
+	    prop != ZPOOL_PROP_NAME)
+		return (-1);
+
+	switch (zpool_prop_get_type(prop)) {
+	case PROP_TYPE_STRING:
+		(void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
+		    len);
+		break;
+
+	case PROP_TYPE_NUMBER:
+		intval = zpool_get_prop_int(zhp, prop, &src);
+
+		switch (prop) {
+		case ZPOOL_PROP_SIZE:
+		case ZPOOL_PROP_ALLOCATED:
+		case ZPOOL_PROP_FREE:
+		case ZPOOL_PROP_FREEING:
+		case ZPOOL_PROP_LEAKED:
+		case ZPOOL_PROP_ASHIFT:
+			if (literal)
+				(void) snprintf(buf, len, "%llu",
+					(u_longlong_t)intval);
+			else
+				(void) zfs_nicenum(intval, buf, len);
+			break;
+
+		case ZPOOL_PROP_EXPANDSZ:
+			if (intval == 0) {
+				(void) strlcpy(buf, "-", len);
+			} else if (literal) {
+				(void) snprintf(buf, len, "%llu",
+				    (u_longlong_t)intval);
+			} else {
+				(void) zfs_nicenum(intval, buf, len);
+			}
+			break;
+
+		case ZPOOL_PROP_CAPACITY:
+			(void) snprintf(buf, len, "%llu%%",
+			    (u_longlong_t)intval);
+			break;
+
+		case ZPOOL_PROP_FRAGMENTATION:
+			if (intval == UINT64_MAX) {
+				(void) strlcpy(buf, "-", len);
+			} else {
+				(void) snprintf(buf, len, "%llu%%",
+				    (u_longlong_t)intval);
+			}
+			break;
+
+		case ZPOOL_PROP_DEDUPRATIO:
+			(void) snprintf(buf, len, "%llu.%02llux",
+			    (u_longlong_t)(intval / 100),
+			    (u_longlong_t)(intval % 100));
+			break;
+
+		case ZPOOL_PROP_HEALTH:
+			verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+			verify(nvlist_lookup_uint64_array(nvroot,
+			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+			    == 0);
+
+			(void) strlcpy(buf, zpool_state_to_name(intval,
+			    vs->vs_aux), len);
+			break;
+		case ZPOOL_PROP_VERSION:
+			if (intval >= SPA_VERSION_FEATURES) {
+				(void) snprintf(buf, len, "-");
+				break;
+			}
+			/* FALLTHROUGH */
+		default:
+			(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
+		}
+		break;
+
+	case PROP_TYPE_INDEX:
+		intval = zpool_get_prop_int(zhp, prop, &src);
+		if (zpool_prop_index_to_string(prop, intval, &strval)
+		    != 0)
+			return (-1);
+		(void) strlcpy(buf, strval, len);
+		break;
+
+	default:
+		abort();
+	}
+
+	if (srctype)
+		*srctype = src;
+
+	return (0);
+}
+
+/*
+ * Check if the bootfs name has the same pool name as it is set to.
+ * Assuming bootfs is a valid dataset name.
+ */
+static boolean_t
+bootfs_name_valid(const char *pool, char *bootfs)
+{
+	int len = strlen(pool);
+
+	if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
+		return (B_FALSE);
+
+	if (strncmp(pool, bootfs, len) == 0 &&
+	    (bootfs[len] == '/' || bootfs[len] == '\0'))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+#if defined(__sun__) || defined(__sun)
+/*
+ * Inspect the configuration to determine if any of the devices contain
+ * an EFI label.
+ */
+static boolean_t
+pool_uses_efi(nvlist_t *config)
+{
+	nvlist_t **child;
+	uint_t c, children;
+
+	if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return (read_efi_label(config, NULL) >= 0);
+
+	for (c = 0; c < children; c++) {
+		if (pool_uses_efi(child[c]))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+#endif
+
+boolean_t
+zpool_is_bootable(zpool_handle_t *zhp)
+{
+	char bootfs[ZPOOL_MAXNAMELEN];
+
+	return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
+	    sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-",
+	    sizeof (bootfs)) != 0);
+}
+
+
+/*
+ * Given an nvlist of zpool properties to be set, validate that they are
+ * correct, and parse any numeric properties (index, boolean, etc) if they are
+ * specified as strings.
+ */
+static nvlist_t *
+zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
+    nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
+{
+	nvpair_t *elem;
+	nvlist_t *retprops;
+	zpool_prop_t prop;
+	char *strval;
+	uint64_t intval;
+	char *slash, *check;
+	struct stat64 statbuf;
+	zpool_handle_t *zhp;
+	nvlist_t *nvroot;
+
+	if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
+		(void) no_memory(hdl);
+		return (NULL);
+	}
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+		const char *propname = nvpair_name(elem);
+
+		prop = zpool_name_to_prop(propname);
+		if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) {
+			int err;
+			char *fname = strchr(propname, '@') + 1;
+
+			err = zfeature_lookup_name(fname, NULL);
+			if (err != 0) {
+				ASSERT3U(err, ==, ENOENT);
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "invalid feature '%s'"), fname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (nvpair_type(elem) != DATA_TYPE_STRING) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "'%s' must be a string"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			(void) nvpair_value_string(elem, &strval);
+			if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set to "
+				    "'enabled'"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (nvlist_add_uint64(retprops, propname, 0) != 0) {
+				(void) no_memory(hdl);
+				goto error;
+			}
+			continue;
+		}
+
+		/*
+		 * Make sure this property is valid and applies to this type.
+		 */
+		if (prop == ZPROP_INVAL) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "invalid property '%s'"), propname);
+			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+
+		if (zpool_prop_readonly(prop)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
+			    "is readonly"), propname);
+			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
+			goto error;
+		}
+
+		if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
+		    &strval, &intval, errbuf) != 0)
+			goto error;
+
+		/*
+		 * Perform additional checking for specific properties.
+		 */
+		switch (prop) {
+		default:
+			break;
+		case ZPOOL_PROP_VERSION:
+			if (intval < version ||
+			    !SPA_VERSION_IS_SUPPORTED(intval)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' number %d is invalid."),
+				    propname, intval);
+				(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+				goto error;
+			}
+			break;
+
+		case ZPOOL_PROP_ASHIFT:
+			if (!flags.create) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set at "
+				    "creation time"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (intval != 0 && (intval < 9 || intval > 13)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' number %d is invalid."),
+				    propname, intval);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+
+		case ZPOOL_PROP_BOOTFS:
+			if (flags.create || flags.import) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' cannot be set at creation "
+				    "or import time"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (version < SPA_VERSION_BOOTFS) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "pool must be upgraded to support "
+				    "'%s' property"), propname);
+				(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+				goto error;
+			}
+
+			/*
+			 * bootfs property value has to be a dataset name and
+			 * the dataset has to be in the same pool as it sets to.
+			 */
+			if (strval[0] != '\0' && !bootfs_name_valid(poolname,
+			    strval)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
+				    "is an invalid name"), strval);
+				(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
+				goto error;
+			}
+
+			if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "could not open pool '%s'"), poolname);
+				(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
+				goto error;
+			}
+			verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+
+#if defined(__sun__) || defined(__sun)
+			/*
+			 * bootfs property cannot be set on a disk which has
+			 * been EFI labeled.
+			 */
+			if (pool_uses_efi(nvroot)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' not supported on "
+				    "EFI labeled devices"), propname);
+				(void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf);
+				zpool_close(zhp);
+				goto error;
+			}
+#endif
+			zpool_close(zhp);
+			break;
+
+		case ZPOOL_PROP_ALTROOT:
+			if (!flags.create && !flags.import) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set during pool "
+				    "creation or import"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (strval[0] != '/') {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "bad alternate root '%s'"), strval);
+				(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
+				goto error;
+			}
+			break;
+
+		case ZPOOL_PROP_CACHEFILE:
+			if (strval[0] == '\0')
+				break;
+
+			if (strcmp(strval, "none") == 0)
+				break;
+
+			if (strval[0] != '/') {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' must be empty, an "
+				    "absolute path, or 'none'"), propname);
+				(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
+				goto error;
+			}
+
+			slash = strrchr(strval, '/');
+
+			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+			    strcmp(slash, "/..") == 0) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "'%s' is not a valid file"), strval);
+				(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
+				goto error;
+			}
+
+			*slash = '\0';
+
+			if (strval[0] != '\0' &&
+			    (stat64(strval, &statbuf) != 0 ||
+			    !S_ISDIR(statbuf.st_mode))) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "'%s' is not a valid directory"),
+				    strval);
+				(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
+				goto error;
+			}
+
+			*slash = '/';
+			break;
+
+		case ZPOOL_PROP_COMMENT:
+			for (check = strval; *check != '\0'; check++) {
+				if (!isprint(*check)) {
+					zfs_error_aux(hdl,
+					    dgettext(TEXT_DOMAIN,
+					    "comment may only have printable "
+					    "characters"));
+					(void) zfs_error(hdl, EZFS_BADPROP,
+					    errbuf);
+					goto error;
+				}
+			}
+			if (strlen(strval) > ZPROP_MAX_COMMENT) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "comment must not exceed %d characters"),
+				    ZPROP_MAX_COMMENT);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+		case ZPOOL_PROP_READONLY:
+			if (!flags.import) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set at "
+				    "import time"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+		case ZPOOL_PROP_TNAME:
+			if (!flags.create) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set at "
+				    "creation time"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+		}
+	}
+
+	return (retprops);
+error:
+	nvlist_free(retprops);
+	return (NULL);
+}
+
+/*
+ * Set zpool property : propname=propval.
+ */
+int
+zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
+{
+	zfs_cmd_t zc = {"\0"};
+	int ret = -1;
+	char errbuf[1024];
+	nvlist_t *nvl = NULL;
+	nvlist_t *realprops;
+	uint64_t version;
+	prop_flags_t flags = { 0 };
+
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
+	    zhp->zpool_name);
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+		return (no_memory(zhp->zpool_hdl));
+
+	if (nvlist_add_string(nvl, propname, propval) != 0) {
+		nvlist_free(nvl);
+		return (no_memory(zhp->zpool_hdl));
+	}
+
+	version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+	if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
+	    zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
+		nvlist_free(nvl);
+		return (-1);
+	}
+
+	nvlist_free(nvl);
+	nvl = realprops;
+
+	/*
+	 * Execute the corresponding ioctl() to set this property.
+	 */
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
+		nvlist_free(nvl);
+		return (-1);
+	}
+
+	ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);
+
+	zcmd_free_nvlists(&zc);
+	nvlist_free(nvl);
+
+	if (ret)
+		(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
+	else
+		(void) zpool_props_refresh(zhp);
+
+	return (ret);
+}
+
+int
+zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
+{
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	zprop_list_t *entry;
+	char buf[ZFS_MAXPROPLEN];
+	nvlist_t *features = NULL;
+	nvpair_t *nvp;
+	zprop_list_t **last;
+	boolean_t firstexpand = (NULL == *plp);
+	int i;
+
+	if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
+		return (-1);
+
+	last = plp;
+	while (*last != NULL)
+		last = &(*last)->pl_next;
+
+	if ((*plp)->pl_all)
+		features = zpool_get_features(zhp);
+
+	if ((*plp)->pl_all && firstexpand) {
+		for (i = 0; i < SPA_FEATURES; i++) {
+			zprop_list_t *entry = zfs_alloc(hdl,
+			    sizeof (zprop_list_t));
+			entry->pl_prop = ZPROP_INVAL;
+			entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
+			    spa_feature_table[i].fi_uname);
+			entry->pl_width = strlen(entry->pl_user_prop);
+			entry->pl_all = B_TRUE;
+
+			*last = entry;
+			last = &entry->pl_next;
+		}
+	}
+
+	/* add any unsupported features */
+	for (nvp = nvlist_next_nvpair(features, NULL);
+	    nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
+		char *propname;
+		boolean_t found;
+		zprop_list_t *entry;
+
+		if (zfeature_is_supported(nvpair_name(nvp)))
+			continue;
+
+		propname = zfs_asprintf(hdl, "unsupported@%s",
+		    nvpair_name(nvp));
+
+		/*
+		 * Before adding the property to the list make sure that no
+		 * other pool already added the same property.
+		 */
+		found = B_FALSE;
+		entry = *plp;
+		while (entry != NULL) {
+			if (entry->pl_user_prop != NULL &&
+			    strcmp(propname, entry->pl_user_prop) == 0) {
+				found = B_TRUE;
+				break;
+			}
+			entry = entry->pl_next;
+		}
+		if (found) {
+			free(propname);
+			continue;
+		}
+
+		entry = zfs_alloc(hdl, sizeof (zprop_list_t));
+		entry->pl_prop = ZPROP_INVAL;
+		entry->pl_user_prop = propname;
+		entry->pl_width = strlen(entry->pl_user_prop);
+		entry->pl_all = B_TRUE;
+
+		*last = entry;
+		last = &entry->pl_next;
+	}
+
+	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
+
+		if (entry->pl_fixed)
+			continue;
+
+		if (entry->pl_prop != ZPROP_INVAL &&
+		    zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
+		    NULL) == 0) {
+			if (strlen(buf) > entry->pl_width)
+				entry->pl_width = strlen(buf);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Get the state for the given feature on the given ZFS pool.
+ */
+int
+zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
+    size_t len)
+{
+	uint64_t refcount;
+	boolean_t found = B_FALSE;
+	nvlist_t *features = zpool_get_features(zhp);
+	boolean_t supported;
+	const char *feature = strchr(propname, '@') + 1;
+
+	supported = zpool_prop_feature(propname);
+	ASSERT(supported || zpool_prop_unsupported(propname));
+
+	/*
+	 * Convert from feature name to feature guid. This conversion is
+	 * unecessary for unsupported@... properties because they already
+	 * use guids.
+	 */
+	if (supported) {
+		int ret;
+		spa_feature_t fid;
+
+		ret = zfeature_lookup_name(feature, &fid);
+		if (ret != 0) {
+			(void) strlcpy(buf, "-", len);
+			return (ENOTSUP);
+		}
+		feature = spa_feature_table[fid].fi_guid;
+	}
+
+	if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
+		found = B_TRUE;
+
+	if (supported) {
+		if (!found) {
+			(void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
+		} else  {
+			if (refcount == 0)
+				(void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
+			else
+				(void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
+		}
+	} else {
+		if (found) {
+			if (refcount == 0) {
+				(void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE);
+			} else {
+				(void) strcpy(buf, ZFS_UNSUPPORTED_READONLY);
+			}
+		} else {
+			(void) strlcpy(buf, "-", len);
+			return (ENOTSUP);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Don't start the slice at the default block of 34; many storage
+ * devices will use a stripe width of 128k, other vendors prefer a 1m
+ * alignment.  It is best to play it safe and ensure a 1m alignment
+ * given 512B blocks.  When the block size is larger by a power of 2
+ * we will still be 1m aligned.  Some devices are sensitive to the
+ * partition ending alignment as well.
+ */
+#define	NEW_START_BLOCK		2048
+#define	PARTITION_END_ALIGNMENT	2048
+
+/*
+ * Validate the given pool name, optionally putting an extended error message in
+ * 'buf'.
+ */
+boolean_t
+zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
+{
+	namecheck_err_t why;
+	char what;
+	int ret;
+
+	ret = pool_namecheck(pool, &why, &what);
+
+	/*
+	 * The rules for reserved pool names were extended at a later point.
+	 * But we need to support users with existing pools that may now be
+	 * invalid.  So we only check for this expanded set of names during a
+	 * create (or import), and only in userland.
+	 */
+	if (ret == 0 && !isopen &&
+	    (strncmp(pool, "mirror", 6) == 0 ||
+	    strncmp(pool, "raidz", 5) == 0 ||
+	    strncmp(pool, "spare", 5) == 0 ||
+	    strcmp(pool, "log") == 0)) {
+		if (hdl != NULL)
+			zfs_error_aux(hdl,
+			    dgettext(TEXT_DOMAIN, "name is reserved"));
+		return (B_FALSE);
+	}
+
+
+	if (ret != 0) {
+		if (hdl != NULL) {
+			switch (why) {
+			case NAME_ERR_TOOLONG:
+				zfs_error_aux(hdl,
+				    dgettext(TEXT_DOMAIN, "name is too long"));
+				break;
+
+			case NAME_ERR_INVALCHAR:
+				zfs_error_aux(hdl,
+				    dgettext(TEXT_DOMAIN, "invalid character "
+				    "'%c' in pool name"), what);
+				break;
+
+			case NAME_ERR_NOLETTER:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "name must begin with a letter"));
+				break;
+
+			case NAME_ERR_RESERVED:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "name is reserved"));
+				break;
+
+			case NAME_ERR_DISKLIKE:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "pool name is reserved"));
+				break;
+
+			case NAME_ERR_LEADING_SLASH:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "leading slash in name"));
+				break;
+
+			case NAME_ERR_EMPTY_COMPONENT:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "empty component in name"));
+				break;
+
+			case NAME_ERR_TRAILING_SLASH:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "trailing slash in name"));
+				break;
+
+			case NAME_ERR_MULTIPLE_AT:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "multiple '@' delimiters in name"));
+				break;
+			case NAME_ERR_NO_AT:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "permission set is missing '@'"));
+				break;
+			}
+		}
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Open a handle to the given pool, even if the pool is currently in the FAULTED
+ * state.
+ */
+zpool_handle_t *
+zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
+{
+	zpool_handle_t *zhp;
+	boolean_t missing;
+
+	/*
+	 * Make sure the pool name is valid.
+	 */
+	if (!zpool_name_valid(hdl, B_TRUE, pool)) {
+		(void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"),
+		    pool);
+		return (NULL);
+	}
+
+	if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
+		return (NULL);
+
+	zhp->zpool_hdl = hdl;
+	(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
+
+	if (zpool_refresh_stats(zhp, &missing) != 0) {
+		zpool_close(zhp);
+		return (NULL);
+	}
+
+	if (missing) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
+		(void) zfs_error_fmt(hdl, EZFS_NOENT,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
+		zpool_close(zhp);
+		return (NULL);
+	}
+
+	return (zhp);
+}
+
+/*
+ * Like the above, but silent on error.  Used when iterating over pools (because
+ * the configuration cache may be out of date).
+ */
+int
+zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
+{
+	zpool_handle_t *zhp;
+	boolean_t missing;
+
+	if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
+		return (-1);
+
+	zhp->zpool_hdl = hdl;
+	(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
+
+	if (zpool_refresh_stats(zhp, &missing) != 0) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	if (missing) {
+		zpool_close(zhp);
+		*ret = NULL;
+		return (0);
+	}
+
+	*ret = zhp;
+	return (0);
+}
+
+/*
+ * Similar to zpool_open_canfail(), but refuses to open pools in the faulted
+ * state.
+ */
+zpool_handle_t *
+zpool_open(libzfs_handle_t *hdl, const char *pool)
+{
+	zpool_handle_t *zhp;
+
+	if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
+		return (NULL);
+
+	if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
+		(void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
+		    dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
+		zpool_close(zhp);
+		return (NULL);
+	}
+
+	return (zhp);
+}
+
+/*
+ * Close the handle.  Simply frees the memory associated with the handle.
+ */
+void
+zpool_close(zpool_handle_t *zhp)
+{
+	if (zhp->zpool_config)
+		nvlist_free(zhp->zpool_config);
+	if (zhp->zpool_old_config)
+		nvlist_free(zhp->zpool_old_config);
+	if (zhp->zpool_props)
+		nvlist_free(zhp->zpool_props);
+	free(zhp);
+}
+
+/*
+ * Return the name of the pool.
+ */
+const char *
+zpool_get_name(zpool_handle_t *zhp)
+{
+	return (zhp->zpool_name);
+}
+
+
+/*
+ * Return the state of the pool (ACTIVE or UNAVAILABLE)
+ */
+int
+zpool_get_state(zpool_handle_t *zhp)
+{
+	return (zhp->zpool_state);
+}
+
+/*
+ * Create the named pool, using the provided vdev list.  It is assumed
+ * that the consumer has already validated the contents of the nvlist, so we
+ * don't have to worry about error semantics.
+ */
+int
+zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
+    nvlist_t *props, nvlist_t *fsprops)
+{
+	zfs_cmd_t zc = {"\0"};
+	nvlist_t *zc_fsprops = NULL;
+	nvlist_t *zc_props = NULL;
+	char msg[1024];
+	int ret = -1;
+
+	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+	    "cannot create '%s'"), pool);
+
+	if (!zpool_name_valid(hdl, B_FALSE, pool))
+		return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
+
+	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
+		return (-1);
+
+	if (props) {
+		prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
+
+		if ((zc_props = zpool_valid_proplist(hdl, pool, props,
+		    SPA_VERSION_1, flags, msg)) == NULL) {
+			goto create_failed;
+		}
+	}
+
+	if (fsprops) {
+		uint64_t zoned;
+		char *zonestr;
+
+		zoned = ((nvlist_lookup_string(fsprops,
+		    zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
+		    strcmp(zonestr, "on") == 0);
+
+		if ((zc_fsprops = zfs_valid_proplist(hdl,
+		    ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, msg)) == NULL) {
+			goto create_failed;
+		}
+		if (!zc_props &&
+		    (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
+			goto create_failed;
+		}
+		if (nvlist_add_nvlist(zc_props,
+		    ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
+			goto create_failed;
+		}
+	}
+
+	if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
+		goto create_failed;
+
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+
+	if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
+
+		zcmd_free_nvlists(&zc);
+		nvlist_free(zc_props);
+		nvlist_free(zc_fsprops);
+
+		switch (errno) {
+		case EBUSY:
+			/*
+			 * This can happen if the user has specified the same
+			 * device multiple times.  We can't reliably detect this
+			 * until we try to add it and see we already have a
+			 * label.  This can also happen under if the device is
+			 * part of an active md or lvm device.
+			 */
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more vdevs refer to the same device, or "
+			    "one of\nthe devices is part of an active md or "
+			    "lvm device"));
+			return (zfs_error(hdl, EZFS_BADDEV, msg));
+
+		case EOVERFLOW:
+			/*
+			 * This occurs when one of the devices is below
+			 * SPA_MINDEVSIZE.  Unfortunately, we can't detect which
+			 * device was the problem device since there's no
+			 * reliable way to determine device size from userland.
+			 */
+			{
+				char buf[64];
+
+				zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
+
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "one or more devices is less than the "
+				    "minimum size (%s)"), buf);
+			}
+			return (zfs_error(hdl, EZFS_BADDEV, msg));
+
+		case ENOSPC:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices is out of space"));
+			return (zfs_error(hdl, EZFS_BADDEV, msg));
+
+		case ENOTBLK:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cache device must be a disk or disk slice"));
+			return (zfs_error(hdl, EZFS_BADDEV, msg));
+
+		default:
+			return (zpool_standard_error(hdl, errno, msg));
+		}
+	}
+
+create_failed:
+	zcmd_free_nvlists(&zc);
+	nvlist_free(zc_props);
+	nvlist_free(zc_fsprops);
+	return (ret);
+}
+
+/*
+ * Destroy the given pool.  It is up to the caller to ensure that there are no
+ * datasets left in the pool.
+ */
+int
+zpool_destroy(zpool_handle_t *zhp, const char *log_str)
+{
+	zfs_cmd_t zc = {"\0"};
+	zfs_handle_t *zfp = NULL;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	char msg[1024];
+
+	if (zhp->zpool_state == POOL_STATE_ACTIVE &&
+	    (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (-1);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_history = (uint64_t)(uintptr_t)log_str;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot destroy '%s'"), zhp->zpool_name);
+
+		if (errno == EROFS) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices is read only"));
+			(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		} else {
+			(void) zpool_standard_error(hdl, errno, msg);
+		}
+
+		if (zfp)
+			zfs_close(zfp);
+		return (-1);
+	}
+
+	if (zfp) {
+		remove_mountpoint(zfp);
+		zfs_close(zfp);
+	}
+
+	return (0);
+}
+
+/*
+ * Add the given vdevs to the pool.  The caller must have already performed the
+ * necessary verification to ensure that the vdev specification is well-formed.
+ */
+int
+zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+	zfs_cmd_t zc = {"\0"};
+	int ret;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	char msg[1024];
+	nvlist_t **spares, **l2cache;
+	uint_t nspares, nl2cache;
+
+	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+	    "cannot add to '%s'"), zhp->zpool_name);
+
+	if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
+	    SPA_VERSION_SPARES &&
+	    nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) == 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
+		    "upgraded to add hot spares"));
+		return (zfs_error(hdl, EZFS_BADVERSION, msg));
+	}
+
+#if defined(__sun__) || defined(__sun)
+	if (zpool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot,
+	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
+		uint64_t s;
+
+		for (s = 0; s < nspares; s++) {
+			char *path;
+
+			if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
+			    &path) == 0 && pool_uses_efi(spares[s])) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "device '%s' contains an EFI label and "
+				    "cannot be used on root pools."),
+				    zpool_vdev_name(hdl, NULL, spares[s],
+				    B_FALSE));
+				return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
+			}
+		}
+	}
+#endif
+
+	if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
+	    SPA_VERSION_L2CACHE &&
+	    nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+	    &l2cache, &nl2cache) == 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
+		    "upgraded to add cache devices"));
+		return (zfs_error(hdl, EZFS_BADVERSION, msg));
+	}
+
+	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
+		return (-1);
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+		switch (errno) {
+		case EBUSY:
+			/*
+			 * This can happen if the user has specified the same
+			 * device multiple times.  We can't reliably detect this
+			 * until we try to add it and see we already have a
+			 * label.
+			 */
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more vdevs refer to the same device"));
+			(void) zfs_error(hdl, EZFS_BADDEV, msg);
+			break;
+
+		case EOVERFLOW:
+			/*
+			 * This occurrs when one of the devices is below
+			 * SPA_MINDEVSIZE.  Unfortunately, we can't detect which
+			 * device was the problem device since there's no
+			 * reliable way to determine device size from userland.
+			 */
+			{
+				char buf[64];
+
+				zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
+
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "device is less than the minimum "
+				    "size (%s)"), buf);
+			}
+			(void) zfs_error(hdl, EZFS_BADDEV, msg);
+			break;
+
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded to add these vdevs"));
+			(void) zfs_error(hdl, EZFS_BADVERSION, msg);
+			break;
+
+		case ENOTBLK:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cache device must be a disk or disk slice"));
+			(void) zfs_error(hdl, EZFS_BADDEV, msg);
+			break;
+
+		default:
+			(void) zpool_standard_error(hdl, errno, msg);
+		}
+
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+
+	zcmd_free_nvlists(&zc);
+
+	return (ret);
+}
+
+/*
+ * Exports the pool from the system.  The caller must ensure that there are no
+ * mounted datasets in the pool.
+ */
+static int
+zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
+    const char *log_str)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+
+	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+	    "cannot export '%s'"), zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_cookie = force;
+	zc.zc_guid = hardforce;
+	zc.zc_history = (uint64_t)(uintptr_t)log_str;
+
+	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
+		switch (errno) {
+		case EXDEV:
+			zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
+			    "use '-f' to override the following errors:\n"
+			    "'%s' has an active shared spare which could be"
+			    " used by other pools once '%s' is exported."),
+			    zhp->zpool_name, zhp->zpool_name);
+			return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
+			    msg));
+		default:
+			return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
+			    msg));
+		}
+	}
+
+	return (0);
+}
+
+int
+zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
+{
+	return (zpool_export_common(zhp, force, B_FALSE, log_str));
+}
+
+int
+zpool_export_force(zpool_handle_t *zhp, const char *log_str)
+{
+	return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
+}
+
+static void
+zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
+    nvlist_t *config)
+{
+	nvlist_t *nv = NULL;
+	uint64_t rewindto;
+	int64_t loss = -1;
+	struct tm t;
+	char timestr[128];
+
+	if (!hdl->libzfs_printerr || config == NULL)
+		return;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+	    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
+		return;
+	}
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+		return;
+	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+
+	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+	    strftime(timestr, 128, "%c", &t) != 0) {
+		if (dryrun) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "Would be able to return %s "
+			    "to its state as of %s.\n"),
+			    name, timestr);
+		} else {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "Pool %s returned to its state as of %s.\n"),
+			    name, timestr);
+		}
+		if (loss > 120) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "%s approximately %lld "),
+			    dryrun ? "Would discard" : "Discarded",
+			    ((longlong_t)loss + 30) / 60);
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "minutes of transactions.\n"));
+		} else if (loss > 0) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "%s approximately %lld "),
+			    dryrun ? "Would discard" : "Discarded",
+			    (longlong_t)loss);
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "seconds of transactions.\n"));
+		}
+	}
+}
+
+void
+zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
+    nvlist_t *config)
+{
+	nvlist_t *nv = NULL;
+	int64_t loss = -1;
+	uint64_t edata = UINT64_MAX;
+	uint64_t rewindto;
+	struct tm t;
+	char timestr[128];
+
+	if (!hdl->libzfs_printerr)
+		return;
+
+	if (reason >= 0)
+		(void) printf(dgettext(TEXT_DOMAIN, "action: "));
+	else
+		(void) printf(dgettext(TEXT_DOMAIN, "\t"));
+
+	/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+	    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+		goto no_info;
+
+	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+	    &edata);
+
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Recovery is possible, but will result in some data loss.\n"));
+
+	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
+	    strftime(timestr, 128, "%c", &t) != 0) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "\tReturning the pool to its state as of %s\n"
+		    "\tshould correct the problem.  "),
+		    timestr);
+	} else {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "\tReverting the pool to an earlier state "
+		    "should correct the problem.\n\t"));
+	}
+
+	if (loss > 120) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "Approximately %lld minutes of data\n"
+		    "\tmust be discarded, irreversibly.  "),
+		    ((longlong_t)loss + 30) / 60);
+	} else if (loss > 0) {
+		(void) printf(dgettext(TEXT_DOMAIN,
+		    "Approximately %lld seconds of data\n"
+		    "\tmust be discarded, irreversibly.  "),
+		    (longlong_t)loss);
+	}
+	if (edata != 0 && edata != UINT64_MAX) {
+		if (edata == 1) {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "After rewind, at least\n"
+			    "\tone persistent user-data error will remain.  "));
+		} else {
+			(void) printf(dgettext(TEXT_DOMAIN,
+			    "After rewind, several\n"
+			    "\tpersistent user-data errors will remain.  "));
+		}
+	}
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Recovery can be attempted\n\tby executing 'zpool %s -F %s'.  "),
+	    reason >= 0 ? "clear" : "import", name);
+
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "A scrub of the pool\n"
+	    "\tis strongly recommended after recovery.\n"));
+	return;
+
+no_info:
+	(void) printf(dgettext(TEXT_DOMAIN,
+	    "Destroy and re-create the pool from\n\ta backup source.\n"));
+}
+
+/*
+ * zpool_import() is a contracted interface. Should be kept the same
+ * if possible.
+ *
+ * Applications should use zpool_import_props() to import a pool with
+ * new properties value to be set.
+ */
+int
+zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
+    char *altroot)
+{
+	nvlist_t *props = NULL;
+	int ret;
+
+	if (altroot != NULL) {
+		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
+			return (zfs_error_fmt(hdl, EZFS_NOMEM,
+			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
+			    newname));
+		}
+
+		if (nvlist_add_string(props,
+		    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
+		    nvlist_add_string(props,
+		    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
+			nvlist_free(props);
+			return (zfs_error_fmt(hdl, EZFS_NOMEM,
+			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
+			    newname));
+		}
+	}
+
+	ret = zpool_import_props(hdl, config, newname, props,
+	    ZFS_IMPORT_NORMAL);
+	if (props)
+		nvlist_free(props);
+	return (ret);
+}
+
+static void
+print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
+    int indent)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *vname;
+	uint64_t is_log = 0;
+
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
+	    &is_log);
+
+	if (name != NULL)
+		(void) printf("\t%*s%s%s\n", indent, "", name,
+		    is_log ? " [log]" : "");
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
+		print_vdev_tree(hdl, vname, child[c], indent + 2);
+		free(vname);
+	}
+}
+
+void
+zpool_print_unsup_feat(nvlist_t *config)
+{
+	nvlist_t *nvinfo, *unsup_feat;
+	nvpair_t *nvp;
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
+	    0);
+	verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
+	    &unsup_feat) == 0);
+
+	for (nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
+	    nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
+		char *desc;
+
+		verify(nvpair_type(nvp) == DATA_TYPE_STRING);
+		verify(nvpair_value_string(nvp, &desc) == 0);
+
+		if (strlen(desc) > 0)
+			(void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
+		else
+			(void) printf("\t%s\n", nvpair_name(nvp));
+	}
+}
+
+/*
+ * Import the given pool using the known configuration and a list of
+ * properties to be set. The configuration should have come from
+ * zpool_find_import(). The 'newname' parameters control whether the pool
+ * is imported with a different name.
+ */
+int
+zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
+    nvlist_t *props, int flags)
+{
+	zfs_cmd_t zc = {"\0"};
+	zpool_rewind_policy_t policy;
+	nvlist_t *nv = NULL;
+	nvlist_t *nvinfo = NULL;
+	nvlist_t *missing = NULL;
+	char *thename;
+	char *origname;
+	int ret;
+	int error = 0;
+	char errbuf[1024];
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &origname) == 0);
+
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "cannot import pool '%s'"), origname);
+
+	if (newname != NULL) {
+		if (!zpool_name_valid(hdl, B_FALSE, newname))
+			return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
+			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
+			    newname));
+		thename = (char *)newname;
+	} else {
+		thename = origname;
+	}
+
+	if (props) {
+		uint64_t version;
+		prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &version) == 0);
+
+		if ((props = zpool_valid_proplist(hdl, origname,
+		    props, version, flags, errbuf)) == NULL) {
+			return (-1);
+		} else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
+			nvlist_free(props);
+			return (-1);
+		}
+	}
+
+	(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
+
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &zc.zc_guid) == 0);
+
+	if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
+		nvlist_free(props);
+		return (-1);
+	}
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
+		nvlist_free(props);
+		return (-1);
+	}
+
+	zc.zc_cookie = flags;
+	while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
+	    errno == ENOMEM) {
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+	if (ret != 0)
+		error = errno;
+
+	(void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
+	zpool_get_rewind_policy(config, &policy);
+
+	if (error) {
+		char desc[1024];
+
+		/*
+		 * Dry-run failed, but we print out what success
+		 * looks like if we found a best txg
+		 */
+		if (policy.zrp_request & ZPOOL_TRY_REWIND) {
+			zpool_rewind_exclaim(hdl, newname ? origname : thename,
+			    B_TRUE, nv);
+			nvlist_free(nv);
+			return (-1);
+		}
+
+		if (newname == NULL)
+			(void) snprintf(desc, sizeof (desc),
+			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
+			    thename);
+		else
+			(void) snprintf(desc, sizeof (desc),
+			    dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
+			    origname, thename);
+
+		switch (error) {
+		case ENOTSUP:
+			if (nv != NULL && nvlist_lookup_nvlist(nv,
+			    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+			    nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
+				(void) printf(dgettext(TEXT_DOMAIN, "This "
+				    "pool uses the following feature(s) not "
+				    "supported by this system:\n"));
+				zpool_print_unsup_feat(nv);
+				if (nvlist_exists(nvinfo,
+				    ZPOOL_CONFIG_CAN_RDONLY)) {
+					(void) printf(dgettext(TEXT_DOMAIN,
+					    "All unsupported features are only "
+					    "required for writing to the pool."
+					    "\nThe pool can be imported using "
+					    "'-o readonly=on'.\n"));
+				}
+			}
+			/*
+			 * Unsupported version.
+			 */
+			(void) zfs_error(hdl, EZFS_BADVERSION, desc);
+			break;
+
+		case EINVAL:
+			(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
+			break;
+
+		case EROFS:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices is read only"));
+			(void) zfs_error(hdl, EZFS_BADDEV, desc);
+			break;
+
+		case ENXIO:
+			if (nv && nvlist_lookup_nvlist(nv,
+			    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+			    nvlist_lookup_nvlist(nvinfo,
+			    ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
+				(void) printf(dgettext(TEXT_DOMAIN,
+				    "The devices below are missing, use "
+				    "'-m' to import the pool anyway:\n"));
+				print_vdev_tree(hdl, NULL, missing, 2);
+				(void) printf("\n");
+			}
+			(void) zpool_standard_error(hdl, error, desc);
+			break;
+
+		case EEXIST:
+			(void) zpool_standard_error(hdl, error, desc);
+			break;
+
+		case EBUSY:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices are already in use\n"));
+			(void) zfs_error(hdl, EZFS_BADDEV, desc);
+			break;
+
+		default:
+			(void) zpool_standard_error(hdl, error, desc);
+			zpool_explain_recover(hdl,
+			    newname ? origname : thename, -error, nv);
+			break;
+		}
+
+		nvlist_free(nv);
+		ret = -1;
+	} else {
+		zpool_handle_t *zhp;
+
+		/*
+		 * This should never fail, but play it safe anyway.
+		 */
+		if (zpool_open_silent(hdl, thename, &zhp) != 0)
+			ret = -1;
+		else if (zhp != NULL)
+			zpool_close(zhp);
+		if (policy.zrp_request &
+		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+			zpool_rewind_exclaim(hdl, newname ? origname : thename,
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
+		}
+		nvlist_free(nv);
+		return (0);
+	}
+
+	zcmd_free_nvlists(&zc);
+	nvlist_free(props);
+
+	return (ret);
+}
+
+/*
+ * Scan the pool.
+ */
+int
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_cookie = func;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+	    (errno == ENOENT && func != POOL_SCAN_NONE))
+		return (0);
+
+	if (func == POOL_SCAN_SCRUB) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+	} else if (func == POOL_SCAN_NONE) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
+		    zc.zc_name);
+	} else {
+		assert(!"unexpected result");
+	}
+
+	if (errno == EBUSY) {
+		nvlist_t *nvroot;
+		pool_scan_stat_t *ps = NULL;
+		uint_t psc;
+
+		verify(nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
+		if (ps && ps->pss_func == POOL_SCAN_SCRUB)
+			return (zfs_error(hdl, EZFS_SCRUBBING, msg));
+		else
+			return (zfs_error(hdl, EZFS_RESILVERING, msg));
+	} else if (errno == ENOENT) {
+		return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+	} else {
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+}
+
+/*
+ * Find a vdev that matches the search criteria specified. We use the
+ * the nvpair name to determine how we should look for the device.
+ * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
+ * spare; but FALSE if its an INUSE spare.
+ */
+static nvlist_t *
+vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
+    boolean_t *l2cache, boolean_t *log)
+{
+	uint_t c, children;
+	nvlist_t **child;
+	nvlist_t *ret;
+	uint64_t is_log;
+	char *srchkey;
+	nvpair_t *pair = nvlist_next_nvpair(search, NULL);
+
+	/* Nothing to look for */
+	if (search == NULL || pair == NULL)
+		return (NULL);
+
+	/* Obtain the key we will use to search */
+	srchkey = nvpair_name(pair);
+
+	switch (nvpair_type(pair)) {
+	case DATA_TYPE_UINT64:
+		if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
+			uint64_t srchval, theguid;
+
+			verify(nvpair_value_uint64(pair, &srchval) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &theguid) == 0);
+			if (theguid == srchval)
+				return (nv);
+		}
+		break;
+
+	case DATA_TYPE_STRING: {
+		char *srchval, *val;
+
+		verify(nvpair_value_string(pair, &srchval) == 0);
+		if (nvlist_lookup_string(nv, srchkey, &val) != 0)
+			break;
+
+		/*
+		 * Search for the requested value. Special cases:
+		 *
+		 * - ZPOOL_CONFIG_PATH for whole disk entries.  These end in
+		 *   "-part1", or "p1".  The suffix is hidden from the user,
+		 *   but included in the string, so this matches around it.
+		 * - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname()
+		 *   is used to check all possible expanded paths.
+		 * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+		 *
+		 * Otherwise, all other searches are simple string compares.
+		 */
+		if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) {
+			uint64_t wholedisk = 0;
+
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+			    &wholedisk);
+			if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0)
+				return (nv);
+
+		} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+			char *type, *idx, *end, *p;
+			uint64_t id, vdev_id;
+
+			/*
+			 * Determine our vdev type, keeping in mind
+			 * that the srchval is composed of a type and
+			 * vdev id pair (i.e. mirror-4).
+			 */
+			if ((type = strdup(srchval)) == NULL)
+				return (NULL);
+
+			if ((p = strrchr(type, '-')) == NULL) {
+				free(type);
+				break;
+			}
+			idx = p + 1;
+			*p = '\0';
+
+			/*
+			 * If the types don't match then keep looking.
+			 */
+			if (strncmp(val, type, strlen(val)) != 0) {
+				free(type);
+				break;
+			}
+
+			verify(strncmp(type, VDEV_TYPE_RAIDZ,
+			    strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+			    strncmp(type, VDEV_TYPE_MIRROR,
+			    strlen(VDEV_TYPE_MIRROR)) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+
+			errno = 0;
+			vdev_id = strtoull(idx, &end, 10);
+
+			free(type);
+			if (errno != 0)
+				return (NULL);
+
+			/*
+			 * Now verify that we have the correct vdev id.
+			 */
+			if (vdev_id == id)
+				return (nv);
+		}
+
+		/*
+		 * Common case
+		 */
+		if (strcmp(srchval, val) == 0)
+			return (nv);
+		break;
+	}
+
+	default:
+		break;
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return (NULL);
+
+	for (c = 0; c < children; c++) {
+		if ((ret = vdev_to_nvlist_iter(child[c], search,
+		    avail_spare, l2cache, NULL)) != NULL) {
+			/*
+			 * The 'is_log' value is only set for the toplevel
+			 * vdev, not the leaf vdevs.  So we always lookup the
+			 * log device from the root of the vdev tree (where
+			 * 'log' is non-NULL).
+			 */
+			if (log != NULL &&
+			    nvlist_lookup_uint64(child[c],
+			    ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
+			    is_log) {
+				*log = B_TRUE;
+			}
+			return (ret);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = vdev_to_nvlist_iter(child[c], search,
+			    avail_spare, l2cache, NULL)) != NULL) {
+				*avail_spare = B_TRUE;
+				return (ret);
+			}
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if ((ret = vdev_to_nvlist_iter(child[c], search,
+			    avail_spare, l2cache, NULL)) != NULL) {
+				*l2cache = B_TRUE;
+				return (ret);
+			}
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a physical path (minus the "/devices" prefix), find the
+ * associated vdev.
+ */
+nvlist_t *
+zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
+    boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
+{
+	nvlist_t *search, *nvroot, *ret;
+
+	verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	if (log != NULL)
+		*log = B_FALSE;
+	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+	nvlist_free(search);
+
+	return (ret);
+}
+
+/*
+ * Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
+ */
+boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+	if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+	    strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+nvlist_t *
+zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
+    boolean_t *l2cache, boolean_t *log)
+{
+	char *end;
+	nvlist_t *nvroot, *search, *ret;
+	uint64_t guid;
+
+	verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	guid = strtoull(path, &end, 0);
+	if (guid != 0 && *end == '\0') {
+		verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+	} else if (zpool_vdev_is_interior(path)) {
+		verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
+	} else {
+		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
+	}
+
+	verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	if (log != NULL)
+		*log = B_FALSE;
+	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+	nvlist_free(search);
+
+	return (ret);
+}
+
+static int
+vdev_online(nvlist_t *nv)
+{
+	uint64_t ival;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * Helper function for zpool_get_physpaths().
+ */
+static int
+vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
+    size_t *bytes_written)
+{
+	size_t bytes_left, pos, rsz;
+	char *tmppath;
+	const char *format;
+
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
+	    &tmppath) != 0)
+		return (EZFS_NODEVICE);
+
+	pos = *bytes_written;
+	bytes_left = physpath_size - pos;
+	format = (pos == 0) ? "%s" : " %s";
+
+	rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
+	*bytes_written += rsz;
+
+	if (rsz >= bytes_left) {
+		/* if physpath was not copied properly, clear it */
+		if (bytes_left != 0) {
+			physpath[pos] = 0;
+		}
+		return (EZFS_NOSPC);
+	}
+	return (0);
+}
+
+static int
+vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
+    size_t *rsz, boolean_t is_spare)
+{
+	char *type;
+	int ret;
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+		return (EZFS_INVALCONFIG);
+
+	if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+		/*
+		 * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
+		 * For a spare vdev, we only want to boot from the active
+		 * spare device.
+		 */
+		if (is_spare) {
+			uint64_t spare = 0;
+			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+			    &spare);
+			if (!spare)
+				return (EZFS_INVALCONFIG);
+		}
+
+		if (vdev_online(nv)) {
+			if ((ret = vdev_get_one_physpath(nv, physpath,
+			    phypath_size, rsz)) != 0)
+				return (ret);
+		}
+	} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
+	    strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
+	    (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
+		nvlist_t **child;
+		uint_t count;
+		int i, ret;
+
+		if (nvlist_lookup_nvlist_array(nv,
+		    ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
+			return (EZFS_INVALCONFIG);
+
+		for (i = 0; i < count; i++) {
+			ret = vdev_get_physpaths(child[i], physpath,
+			    phypath_size, rsz, is_spare);
+			if (ret == EZFS_NOSPC)
+				return (ret);
+		}
+	}
+
+	return (EZFS_POOL_INVALARG);
+}
+
+/*
+ * Get phys_path for a root pool config.
+ * Return 0 on success; non-zero on failure.
+ */
+static int
+zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
+{
+	size_t rsz;
+	nvlist_t *vdev_root;
+	nvlist_t **child;
+	uint_t count;
+	char *type;
+
+	rsz = 0;
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &vdev_root) != 0)
+		return (EZFS_INVALCONFIG);
+
+	if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
+	    nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
+	    &child, &count) != 0)
+		return (EZFS_INVALCONFIG);
+
+#if defined(__sun__) || defined(__sun)
+	/*
+	 * root pool can not have EFI labeled disks and can only have
+	 * a single top-level vdev.
+	 */
+	if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
+	    pool_uses_efi(vdev_root))
+		return (EZFS_POOL_INVALARG);
+#endif
+
+	(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
+	    B_FALSE);
+
+	/* No online devices */
+	if (rsz == 0)
+		return (EZFS_NODEVICE);
+
+	return (0);
+}
+
+/*
+ * Get phys_path for a root pool
+ * Return 0 on success; non-zero on failure.
+ */
+int
+zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
+{
+	return (zpool_get_config_physpath(zhp->zpool_config, physpath,
+	    phypath_size));
+}
+
+/*
+ * If the device has being dynamically expanded then we need to relabel
+ * the disk to use the new unallocated space.
+ */
+static int
+zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg)
+{
+	int fd, error;
+
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "relabel '%s': unable to open device: %d"), path, errno);
+		return (zfs_error(hdl, EZFS_OPENFAILED, msg));
+	}
+
+	/*
+	 * It's possible that we might encounter an error if the device
+	 * does not have any unallocated space left. If so, we simply
+	 * ignore that error and continue on.
+	 *
+	 * Also, we don't call efi_rescan() - that would just return EBUSY.
+	 * The module will do it for us in vdev_disk_open().
+	 */
+	error = efi_use_whole_disk(fd);
+	(void) close(fd);
+	if (error && error != VT_ENOSPC) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "relabel '%s': unable to read disk capacity"), path);
+		return (zfs_error(hdl, EZFS_NOCAP, msg));
+	}
+	return (0);
+}
+
+/*
+ * Bring the specified vdev online.   The 'flags' parameter is a set of the
+ * ZFS_ONLINE_* flags.
+ */
+int
+zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
+    vdev_state_t *newstate)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	int error;
+
+	if (flags & ZFS_ONLINE_EXPAND) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
+	} else {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot online %s"), path);
+	}
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    &islog)) == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	if (flags & ZFS_ONLINE_EXPAND ||
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+		uint64_t wholedisk = 0;
+
+		(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk);
+
+		/*
+		 * XXX - L2ARC 1.0 devices can't support expansion.
+		 */
+		if (l2cache) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot expand cache devices"));
+			return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
+		}
+
+		if (wholedisk) {
+			const char *fullpath = path;
+			char buf[MAXPATHLEN];
+
+			if (path[0] != '/') {
+				error = zfs_resolve_shortname(path, buf,
+				    sizeof (buf));
+				if (error != 0)
+					return (zfs_error(hdl, EZFS_NODEVICE,
+					    msg));
+
+				fullpath = buf;
+			}
+
+			error = zpool_relabel_disk(hdl, fullpath, msg);
+			if (error != 0)
+				return (error);
+		}
+	}
+
+	zc.zc_cookie = VDEV_STATE_ONLINE;
+	zc.zc_obj = flags;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+		if (errno == EINVAL) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
+			    "from this pool into a new one.  Use '%s' "
+			    "instead"), "zpool detach");
+			return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
+		}
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+
+	*newstate = zc.zc_cookie;
+	return (0);
+}
+
+/*
+ * Take the specified vdev offline
+ */
+int
+zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    NULL)) == NULL)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	zc.zc_cookie = VDEV_STATE_OFFLINE;
+	zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+	case EBUSY:
+
+		/*
+		 * There are no other replicas of this device.
+		 */
+		return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
+
+	case EEXIST:
+		/*
+		 * The log device has unplayed logs
+		 */
+		return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
+
+	default:
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+}
+
+/*
+ * Mark the given vdev faulted.
+ */
+int
+zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = VDEV_STATE_FAULTED;
+	zc.zc_obj = aux;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+	case EBUSY:
+
+		/*
+		 * There are no other replicas of this device.
+		 */
+		return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
+
+	default:
+		return (zpool_standard_error(hdl, errno, msg));
+	}
+
+}
+
+/*
+ * Mark the given vdev degraded.
+ */
+int
+zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = VDEV_STATE_DEGRADED;
+	zc.zc_obj = aux;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Returns TRUE if the given nvlist is a vdev that was originally swapped in as
+ * a hot spare.
+ */
+static boolean_t
+is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type;
+
+	if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
+		    &type) == 0);
+
+		if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
+		    children == 2 && child[which] == tgt)
+			return (B_TRUE);
+
+		for (c = 0; c < children; c++)
+			if (is_replacing_spare(child[c], tgt, which))
+				return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Attach new_disk (fully described by nvroot) to old_disk.
+ * If 'replacing' is specified, the new disk will replace the old one.
+ */
+int
+zpool_vdev_attach(zpool_handle_t *zhp,
+    const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	int ret;
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	uint64_t val;
+	char *newname;
+	nvlist_t **child;
+	uint_t children;
+	nvlist_t *config_root;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	boolean_t rootpool = zpool_is_bootable(zhp);
+
+	if (replacing)
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot replace %s with %s"), old_disk, new_disk);
+	else
+		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+		    "cannot attach %s to %s"), new_disk, old_disk);
+
+#if defined(__sun__) || defined(__sun)
+	/*
+	 * If this is a root pool, make sure that we're not attaching an
+	 * EFI labeled device.
+	 */
+	if (rootpool && pool_uses_efi(nvroot)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "EFI labeled devices are not supported on root pools."));
+		return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
+	}
+#endif
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
+	    &islog)) == 0)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	if (l2cache)
+		return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+	zc.zc_cookie = replacing;
+
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0 || children != 1) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "new device must be a single disk"));
+		return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
+	}
+
+	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
+
+	if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
+		return (-1);
+
+	/*
+	 * If the target is a hot spare that has been swapped in, we can only
+	 * replace it with another hot spare.
+	 */
+	if (replacing &&
+	    nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
+	    (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
+	    NULL) == NULL || !avail_spare) &&
+	    is_replacing_spare(config_root, tgt, 1)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "can only be replaced by another hot spare"));
+		free(newname);
+		return (zfs_error(hdl, EZFS_BADTARGET, msg));
+	}
+
+	free(newname);
+
+	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
+		return (-1);
+
+	ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+
+	zcmd_free_nvlists(&zc);
+
+	if (ret == 0) {
+		if (rootpool) {
+			/*
+			 * XXX need a better way to prevent user from
+			 * booting up a half-baked vdev.
+			 */
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
+			    "sure to wait until resilver is done "
+			    "before rebooting.\n"));
+		}
+		return (0);
+	}
+
+	switch (errno) {
+	case ENOTSUP:
+		/*
+		 * Can't attach to or replace this type of vdev.
+		 */
+		if (replacing) {
+			uint64_t version = zpool_get_prop_int(zhp,
+			    ZPOOL_PROP_VERSION, NULL);
+
+			if (islog)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "cannot replace a log with a spare"));
+			else if (version >= SPA_VERSION_MULTI_REPLACE)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "already in replacing/spare config; wait "
+				    "for completion or use 'zpool detach'"));
+			else
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "cannot replace a replacing device"));
+		} else {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "can only attach to mirrors and top-level "
+			    "disks"));
+		}
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
+		break;
+
+	case EINVAL:
+		/*
+		 * The new device must be a single disk.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "new device must be a single disk"));
+		(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
+		break;
+
+	case EBUSY:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"),
+		    new_disk);
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case EOVERFLOW:
+		/*
+		 * The new device is too small.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "device is too small"));
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case EDOM:
+		/*
+		 * The new device has a different optimal sector size.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "new device has a different optimal sector size; use the "
+		    "option '-o ashift=N' to override the optimal size"));
+		(void) zfs_error(hdl, EZFS_BADDEV, msg);
+		break;
+
+	case ENAMETOOLONG:
+		/*
+		 * The resulting top-level vdev spec won't fit in the label.
+		 */
+		(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
+		break;
+
+	default:
+		(void) zpool_standard_error(hdl, errno, msg);
+	}
+
+	return (-1);
+}
+
+/*
+ * Detach the specified device.
+ */
+int
+zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    NULL)) == 0)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+	if (avail_spare)
+		return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+	if (l2cache)
+		return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
+		return (0);
+
+	switch (errno) {
+
+	case ENOTSUP:
+		/*
+		 * Can't detach from this type of vdev.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
+		    "applicable to mirror and replacing vdevs"));
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
+		break;
+
+	case EBUSY:
+		/*
+		 * There are no other replicas of this device.
+		 */
+		(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
+		break;
+
+	default:
+		(void) zpool_standard_error(hdl, errno, msg);
+	}
+
+	return (-1);
+}
+
+/*
+ * Find a mirror vdev in the source nvlist.
+ *
+ * The mchild array contains a list of disks in one of the top-level mirrors
+ * of the source pool.  The schild array contains a list of disks that the
+ * user specified on the command line.  We loop over the mchild array to
+ * see if any entry in the schild array matches.
+ *
+ * If a disk in the mchild array is found in the schild array, we return
+ * the index of that entry.  Otherwise we return -1.
+ */
+static int
+find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
+    nvlist_t **schild, uint_t schildren)
+{
+	uint_t mc;
+
+	for (mc = 0; mc < mchildren; mc++) {
+		uint_t sc;
+		char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+		    mchild[mc], B_FALSE);
+
+		for (sc = 0; sc < schildren; sc++) {
+			char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
+			    schild[sc], B_FALSE);
+			boolean_t result = (strcmp(mpath, spath) == 0);
+
+			free(spath);
+			if (result) {
+				free(mpath);
+				return (mc);
+			}
+		}
+
+		free(mpath);
+	}
+
+	return (-1);
+}
+
+/*
+ * Split a mirror pool.  If newroot points to null, then a new nvlist
+ * is generated and it is the responsibility of the caller to free it.
+ */
+int
+zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
+    nvlist_t *props, splitflags_t flags)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
+	nvlist_t **varray = NULL, *zc_props = NULL;
+	uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	uint64_t vers;
+	boolean_t freelist = B_FALSE, memory_err = B_TRUE;
+	int retval = 0;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
+
+	if (!zpool_name_valid(hdl, B_FALSE, newname))
+		return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		(void) fprintf(stderr, gettext("Internal error: unable to "
+		    "retrieve pool configuration\n"));
+		return (-1);
+	}
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
+	    == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
+
+	if (props) {
+		prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
+		if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
+		    props, vers, flags, msg)) == NULL)
+			return (-1);
+	}
+
+	if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) != 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "Source pool is missing vdev tree"));
+		if (zc_props)
+			nvlist_free(zc_props);
+		return (-1);
+	}
+
+	varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
+	vcount = 0;
+
+	if (*newroot == NULL ||
+	    nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
+	    &newchild, &newchildren) != 0)
+		newchildren = 0;
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE, is_hole = B_FALSE;
+		char *type;
+		nvlist_t **mchild, *vdev;
+		uint_t mchildren;
+		int entry;
+
+		/*
+		 * Unlike cache & spares, slogs are stored in the
+		 * ZPOOL_CONFIG_CHILDREN array.  We filter them out here.
+		 */
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+		    &is_hole);
+		if (is_log || is_hole) {
+			/*
+			 * Create a hole vdev and put it in the config.
+			 */
+			if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
+				goto out;
+			if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
+			    VDEV_TYPE_HOLE) != 0)
+				goto out;
+			if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
+			    1) != 0)
+				goto out;
+			if (lastlog == 0)
+				lastlog = vcount;
+			varray[vcount++] = vdev;
+			continue;
+		}
+		lastlog = 0;
+		verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
+		    == 0);
+		if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "Source pool must be composed only of mirrors\n"));
+			retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+			goto out;
+		}
+
+		verify(nvlist_lookup_nvlist_array(child[c],
+		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+
+		/* find or add an entry for this top-level vdev */
+		if (newchildren > 0 &&
+		    (entry = find_vdev_entry(zhp, mchild, mchildren,
+		    newchild, newchildren)) >= 0) {
+			/* We found a disk that the user specified. */
+			vdev = mchild[entry];
+			++found;
+		} else {
+			/* User didn't specify a disk for this vdev. */
+			vdev = mchild[mchildren - 1];
+		}
+
+		if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
+			goto out;
+	}
+
+	/* did we find every disk the user specified? */
+	if (found != newchildren) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
+		    "include at most one disk from each mirror"));
+		retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
+		goto out;
+	}
+
+	/* Prepare the nvlist for populating. */
+	if (*newroot == NULL) {
+		if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
+			goto out;
+		freelist = B_TRUE;
+		if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_ROOT) != 0)
+			goto out;
+	} else {
+		verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
+	}
+
+	/* Add all the children we found */
+	if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
+	    lastlog == 0 ? vcount : lastlog) != 0)
+		goto out;
+
+	/*
+	 * If we're just doing a dry run, exit now with success.
+	 */
+	if (flags.dryrun) {
+		memory_err = B_FALSE;
+		freelist = B_FALSE;
+		goto out;
+	}
+
+	/* now build up the config list & call the ioctl */
+	if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
+		goto out;
+
+	if (nvlist_add_nvlist(newconfig,
+	    ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
+	    nvlist_add_string(newconfig,
+	    ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
+	    nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
+		goto out;
+
+	/*
+	 * The new pool is automatically part of the namespace unless we
+	 * explicitly export it.
+	 */
+	if (!flags.import)
+		zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
+	if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
+		goto out;
+	if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
+		goto out;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
+		retval = zpool_standard_error(hdl, errno, msg);
+		goto out;
+	}
+
+	freelist = B_FALSE;
+	memory_err = B_FALSE;
+
+out:
+	if (varray != NULL) {
+		int v;
+
+		for (v = 0; v < vcount; v++)
+			nvlist_free(varray[v]);
+		free(varray);
+	}
+	zcmd_free_nvlists(&zc);
+	if (zc_props)
+		nvlist_free(zc_props);
+	if (newconfig)
+		nvlist_free(newconfig);
+	if (freelist) {
+		nvlist_free(*newroot);
+		*newroot = NULL;
+	}
+
+	if (retval != 0)
+		return (retval);
+
+	if (memory_err)
+		return (no_memory(hdl));
+
+	return (0);
+}
+
+/*
+ * Remove the given device.  Currently, this is supported only for hot spares
+ * and level 2 cache devices.
+ */
+int
+zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tgt;
+	boolean_t avail_spare, l2cache, islog;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	uint64_t version;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
+	    &islog)) == 0)
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+	/*
+	 * XXX - this should just go away.
+	 */
+	if (!avail_spare && !l2cache && !islog) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "only inactive hot spares, cache, top-level, "
+		    "or log devices can be removed"));
+		return (zfs_error(hdl, EZFS_NODEVICE, msg));
+	}
+
+	version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+	if (islog && version < SPA_VERSION_HOLES) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "pool must be upgrade to support log removal"));
+		return (zfs_error(hdl, EZFS_BADVERSION, msg));
+	}
+
+	verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
+
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Clear the errors for the pool, or the particular device if specified.
+ */
+int
+zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	nvlist_t *tgt;
+	zpool_rewind_policy_t policy;
+	boolean_t avail_spare, l2cache;
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	nvlist_t *nvi = NULL;
+	int error;
+
+	if (path)
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+		    path);
+	else
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
+		    zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (path) {
+		if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
+		    &l2cache, NULL)) == 0)
+			return (zfs_error(hdl, EZFS_NODEVICE, msg));
+
+		/*
+		 * Don't allow error clearing for hot spares.  Do allow
+		 * error clearing for l2cache devices.
+		 */
+		if (avail_spare)
+			return (zfs_error(hdl, EZFS_ISSPARE, msg));
+
+		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
+		    &zc.zc_guid) == 0);
+	}
+
+	zpool_get_rewind_policy(rewindnvl, &policy);
+	zc.zc_cookie = policy.zrp_request;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
+		return (-1);
+
+	if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
+		return (-1);
+
+	while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
+	    errno == ENOMEM) {
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+
+	if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+	    errno != EPERM && errno != EACCES)) {
+		if (policy.zrp_request &
+		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
+			(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
+			zpool_rewind_exclaim(hdl, zc.zc_name,
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
+			    nvi);
+			nvlist_free(nvi);
+		}
+		zcmd_free_nvlists(&zc);
+		return (0);
+	}
+
+	zcmd_free_nvlists(&zc);
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Similar to zpool_clear(), but takes a GUID (used by fmd).
+ */
+int
+zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
+	    (u_longlong_t)guid);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_guid = guid;
+	zc.zc_cookie = ZPOOL_NO_REWIND;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Change the GUID for a pool.
+ */
+int
+zpool_reguid(zpool_handle_t *zhp)
+{
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	zfs_cmd_t zc = {"\0"};
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Reopen the pool.
+ */
+int
+zpool_reopen(zpool_handle_t *zhp)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot reopen '%s'"),
+	    zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0)
+		return (0);
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Convert from a devid string to a path.
+ */
+static char *
+devid_to_path(char *devid_str)
+{
+	ddi_devid_t devid;
+	char *minor;
+	char *path;
+	devid_nmlist_t *list = NULL;
+	int ret;
+
+	if (devid_str_decode(devid_str, &devid, &minor) != 0)
+		return (NULL);
+
+	ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list);
+
+	devid_str_free(minor);
+	devid_free(devid);
+
+	if (ret != 0)
+		return (NULL);
+
+	if ((path = strdup(list[0].devname)) == NULL)
+		return (NULL);
+
+	devid_free_nmlist(list);
+
+	return (path);
+}
+
+/*
+ * Convert from a path to a devid string.
+ */
+static char *
+path_to_devid(const char *path)
+{
+	int fd;
+	ddi_devid_t devid;
+	char *minor, *ret;
+
+	if ((fd = open(path, O_RDONLY)) < 0)
+		return (NULL);
+
+	minor = NULL;
+	ret = NULL;
+	if (devid_get(fd, &devid) == 0) {
+		if (devid_get_minor_name(fd, &minor) == 0)
+			ret = devid_str_encode(devid, minor);
+		if (minor != NULL)
+			devid_str_free(minor);
+		devid_free(devid);
+	}
+	(void) close(fd);
+
+	return (ret);
+}
+
+/*
+ * Issue the necessary ioctl() to update the stored path value for the vdev.  We
+ * ignore any failure here, since a common case is for an unprivileged user to
+ * type 'zpool status', and we'll display the correct information anyway.
+ */
+static void
+set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
+{
+	zfs_cmd_t zc = {"\0"};
+
+	(void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strncpy(zc.zc_value, path, sizeof (zc.zc_value));
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+	    &zc.zc_guid) == 0);
+
+	(void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
+}
+
+/*
+ * Remove partition suffix from a vdev path.  Partition suffixes may take three
+ * forms: "-partX", "pX", or "X", where X is a string of digits.  The second
+ * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The
+ * third case only occurs when preceded by a string matching the regular
+ * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk.
+ */
+static char *
+strip_partition(libzfs_handle_t *hdl, char *path)
+{
+	char *tmp = zfs_strdup(hdl, path);
+	char *part = NULL, *d = NULL;
+
+	if ((part = strstr(tmp, "-part")) && part != tmp) {
+		d = part + 5;
+	} else if ((part = strrchr(tmp, 'p')) &&
+	    part > tmp + 1 && isdigit(*(part-1))) {
+		d = part + 1;
+	} else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') &&
+	    tmp[1] == 'd') {
+		for (d = &tmp[2]; isalpha(*d); part = ++d);
+	} else if (strncmp("xvd", tmp, 3) == 0) {
+		for (d = &tmp[3]; isalpha(*d); part = ++d);
+	}
+	if (part && d && *d != '\0') {
+		for (; isdigit(*d); d++);
+		if (*d == '\0')
+			*part = '\0';
+	}
+	return (tmp);
+}
+
+#define	PATH_BUF_LEN	64
+
+/*
+ * Given a vdev, return the name to display in iostat.  If the vdev has a path,
+ * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
+ * We also check if this is a whole disk, in which case we strip off the
+ * trailing 's0' slice name.
+ *
+ * This routine is also responsible for identifying when disks have been
+ * reconfigured in a new location.  The kernel will have opened the device by
+ * devid, but the path will still refer to the old location.  To catch this, we
+ * first do a path -> devid translation (which is fast for the common case).  If
+ * the devid matches, we're done.  If not, we do a reverse devid -> path
+ * translation and issue the appropriate ioctl() to update the path of the vdev.
+ * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
+ * of these checks.
+ */
+char *
+zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
+    boolean_t verbose)
+{
+	char *path, *devid, *type;
+	uint64_t value;
+	char buf[PATH_BUF_LEN];
+	char tmpbuf[PATH_BUF_LEN];
+	vdev_stat_t *vs;
+	uint_t vsc;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &value) == 0) {
+		verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+		    &value) == 0);
+		(void) snprintf(buf, sizeof (buf), "%llu",
+		    (u_longlong_t)value);
+		path = buf;
+	} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+		/*
+		 * If the device is dead (faulted, offline, etc) then don't
+		 * bother opening it.  Otherwise we may be forcing the user to
+		 * open a misbehaving device, which can have undesirable
+		 * effects.
+		 */
+		if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+		    (uint64_t **)&vs, &vsc) != 0 ||
+		    vs->vs_state >= VDEV_STATE_DEGRADED) &&
+		    zhp != NULL &&
+		    nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
+			/*
+			 * Determine if the current path is correct.
+			 */
+			char *newdevid = path_to_devid(path);
+
+			if (newdevid == NULL ||
+			    strcmp(devid, newdevid) != 0) {
+				char *newpath;
+
+				if ((newpath = devid_to_path(devid)) != NULL) {
+					/*
+					 * Update the path appropriately.
+					 */
+					set_path(zhp, nv, newpath);
+					if (nvlist_add_string(nv,
+					    ZPOOL_CONFIG_PATH, newpath) == 0)
+						verify(nvlist_lookup_string(nv,
+						    ZPOOL_CONFIG_PATH,
+						    &path) == 0);
+					free(newpath);
+				}
+			}
+
+			if (newdevid)
+				devid_str_free(newdevid);
+		}
+
+		/*
+		 * For a block device only use the name.
+		 */
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+		if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+			path = strrchr(path, '/');
+			path++;
+		}
+
+		/*
+		 * Remove the partition from the path it this is a whole disk.
+		 */
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    &value) == 0 && value) {
+			return (strip_partition(hdl, path));
+		}
+	} else {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
+
+		/*
+		 * If it's a raidz device, we need to stick in the parity level.
+		 */
+		if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
+
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+			    &value) == 0);
+			(void) snprintf(buf, sizeof (buf), "%s%llu", path,
+			    (u_longlong_t)value);
+			path = buf;
+		}
+
+		/*
+		 * We identify each top-level vdev by using a <type-id>
+		 * naming convention.
+		 */
+		if (verbose) {
+			uint64_t id;
+
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+			    &id) == 0);
+			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu",
+			    path, (u_longlong_t)id);
+			path = tmpbuf;
+		}
+	}
+
+	return (zfs_strdup(hdl, path));
+}
+
+static int
+zbookmark_compare(const void *a, const void *b)
+{
+	return (memcmp(a, b, sizeof (zbookmark_phys_t)));
+}
+
+/*
+ * Retrieve the persistent error log, uniquify the members, and return to the
+ * caller.
+ */
+int
+zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
+{
+	zfs_cmd_t zc = {"\0"};
+	uint64_t count;
+	zbookmark_phys_t *zb = NULL;
+	int i;
+
+	/*
+	 * Retrieve the raw error list from the kernel.  If the number of errors
+	 * has increased, allocate more space and continue until we get the
+	 * entire list.
+	 */
+	verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
+	    &count) == 0);
+	if (count == 0)
+		return (0);
+	if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
+	    count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL)
+		return (-1);
+	zc.zc_nvlist_dst_size = count;
+	(void) strcpy(zc.zc_name, zhp->zpool_name);
+	for (;;) {
+		if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
+		    &zc) != 0) {
+			free((void *)(uintptr_t)zc.zc_nvlist_dst);
+			if (errno == ENOMEM) {
+				void *dst;
+
+				count = zc.zc_nvlist_dst_size;
+				dst = zfs_alloc(zhp->zpool_hdl, count *
+				    sizeof (zbookmark_phys_t));
+				if (dst == NULL)
+					return (-1);
+				zc.zc_nvlist_dst = (uintptr_t)dst;
+			} else {
+				return (-1);
+			}
+		} else {
+			break;
+		}
+	}
+
+	/*
+	 * Sort the resulting bookmarks.  This is a little confusing due to the
+	 * implementation of ZFS_IOC_ERROR_LOG.  The bookmarks are copied last
+	 * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks
+	 * _not_ copied as part of the process.  So we point the start of our
+	 * array appropriate and decrement the total number of elements.
+	 */
+	zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
+	    zc.zc_nvlist_dst_size;
+	count -= zc.zc_nvlist_dst_size;
+
+	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_compare);
+
+	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
+
+	/*
+	 * Fill in the nverrlistp with nvlist's of dataset and object numbers.
+	 */
+	for (i = 0; i < count; i++) {
+		nvlist_t *nv;
+
+		/* ignoring zb_blkid and zb_level for now */
+		if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
+		    zb[i-1].zb_object == zb[i].zb_object)
+			continue;
+
+		if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
+			goto nomem;
+		if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
+		    zb[i].zb_objset) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
+		    zb[i].zb_object) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
+			nvlist_free(nv);
+			goto nomem;
+		}
+		nvlist_free(nv);
+	}
+
+	free((void *)(uintptr_t)zc.zc_nvlist_dst);
+	return (0);
+
+nomem:
+	free((void *)(uintptr_t)zc.zc_nvlist_dst);
+	return (no_memory(zhp->zpool_hdl));
+}
+
+/*
+ * Upgrade a ZFS pool to the latest on-disk version.
+ */
+int
+zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
+{
+	zfs_cmd_t zc = {"\0"};
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strcpy(zc.zc_name, zhp->zpool_name);
+	zc.zc_cookie = new_version;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
+		return (zpool_standard_error_fmt(hdl, errno,
+		    dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
+		    zhp->zpool_name));
+	return (0);
+}
+
+void
+zfs_save_arguments(int argc, char **argv, char *string, int len)
+{
+	int i;
+
+	(void) strlcpy(string, basename(argv[0]), len);
+	for (i = 1; i < argc; i++) {
+		(void) strlcat(string, " ", len);
+		(void) strlcat(string, argv[i], len);
+	}
+}
+
+int
+zpool_log_history(libzfs_handle_t *hdl, const char *message)
+{
+	zfs_cmd_t zc = {"\0"};
+	nvlist_t *args;
+	int err;
+
+	args = fnvlist_alloc();
+	fnvlist_add_string(args, "message", message);
+	err = zcmd_write_src_nvlist(hdl, &zc, args);
+	if (err == 0)
+		err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc);
+	nvlist_free(args);
+	zcmd_free_nvlists(&zc);
+	return (err);
+}
+
+/*
+ * Perform ioctl to get some command history of a pool.
+ *
+ * 'buf' is the buffer to fill up to 'len' bytes.  'off' is the
+ * logical offset of the history buffer to start reading from.
+ *
+ * Upon return, 'off' is the next logical offset to read from and
+ * 'len' is the actual amount of bytes read into 'buf'.
+ */
+static int
+get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
+{
+	zfs_cmd_t zc = {"\0"};
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+
+	zc.zc_history = (uint64_t)(uintptr_t)buf;
+	zc.zc_history_len = *len;
+	zc.zc_history_offset = *off;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
+		switch (errno) {
+		case EPERM:
+			return (zfs_error_fmt(hdl, EZFS_PERM,
+			    dgettext(TEXT_DOMAIN,
+			    "cannot show history for pool '%s'"),
+			    zhp->zpool_name));
+		case ENOENT:
+			return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
+			    dgettext(TEXT_DOMAIN, "cannot get history for pool "
+			    "'%s'"), zhp->zpool_name));
+		case ENOTSUP:
+			return (zfs_error_fmt(hdl, EZFS_BADVERSION,
+			    dgettext(TEXT_DOMAIN, "cannot get history for pool "
+			    "'%s', pool must be upgraded"), zhp->zpool_name));
+		default:
+			return (zpool_standard_error_fmt(hdl, errno,
+			    dgettext(TEXT_DOMAIN,
+			    "cannot get history for '%s'"), zhp->zpool_name));
+		}
+	}
+
+	*len = zc.zc_history_len;
+	*off = zc.zc_history_offset;
+
+	return (0);
+}
+
+/*
+ * Process the buffer of nvlists, unpacking and storing each nvlist record
+ * into 'records'.  'leftover' is set to the number of bytes that weren't
+ * processed as there wasn't a complete record.
+ */
+int
+zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
+    nvlist_t ***records, uint_t *numrecords)
+{
+	uint64_t reclen;
+	nvlist_t *nv;
+	int i;
+
+	while (bytes_read > sizeof (reclen)) {
+
+		/* get length of packed record (stored as little endian) */
+		for (i = 0, reclen = 0; i < sizeof (reclen); i++)
+			reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
+
+		if (bytes_read < sizeof (reclen) + reclen)
+			break;
+
+		/* unpack record */
+		if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
+			return (ENOMEM);
+		bytes_read -= sizeof (reclen) + reclen;
+		buf += sizeof (reclen) + reclen;
+
+		/* add record to nvlist array */
+		(*numrecords)++;
+		if (ISP2(*numrecords + 1)) {
+			*records = realloc(*records,
+			    *numrecords * 2 * sizeof (nvlist_t *));
+		}
+		(*records)[*numrecords - 1] = nv;
+	}
+
+	*leftover = bytes_read;
+	return (0);
+}
+
+/*
+ * Retrieve the command history of a pool.
+ */
+int
+zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
+{
+	char *buf;
+	int buflen = 128 * 1024;
+	uint64_t off = 0;
+	nvlist_t **records = NULL;
+	uint_t numrecords = 0;
+	int err, i;
+
+	buf = malloc(buflen);
+	if (buf == NULL)
+		return (ENOMEM);
+	do {
+		uint64_t bytes_read = buflen;
+		uint64_t leftover;
+
+		if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
+			break;
+
+		/* if nothing else was read in, we're at EOF, just return */
+		if (!bytes_read)
+			break;
+
+		if ((err = zpool_history_unpack(buf, bytes_read,
+		    &leftover, &records, &numrecords)) != 0)
+			break;
+		off -= leftover;
+		if (leftover == bytes_read) {
+			/*
+			 * no progress made, because buffer is not big enough
+			 * to hold this record; resize and retry.
+			 */
+			buflen *= 2;
+			free(buf);
+			buf = malloc(buflen);
+			if (buf == NULL)
+				return (ENOMEM);
+		}
+
+		/* CONSTCOND */
+	} while (1);
+
+	free(buf);
+
+	if (!err) {
+		verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
+		verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
+		    records, numrecords) == 0);
+	}
+	for (i = 0; i < numrecords; i++)
+		nvlist_free(records[i]);
+	free(records);
+
+	return (err);
+}
+
+/*
+ * Retrieve the next event given the passed 'zevent_fd' file descriptor.
+ * If there is a new event available 'nvp' will contain a newly allocated
+ * nvlist and 'dropped' will be set to the number of missed events since
+ * the last call to this function.  When 'nvp' is set to NULL it indicates
+ * no new events are available.  In either case the function returns 0 and
+ * it is up to the caller to free 'nvp'.  In the case of a fatal error the
+ * function will return a non-zero value.  When the function is called in
+ * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed),
+ * it will not return until a new event is available.
+ */
+int
+zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp,
+    int *dropped, unsigned flags, int zevent_fd)
+{
+	zfs_cmd_t zc = {"\0"};
+	int error = 0;
+
+	*nvp = NULL;
+	*dropped = 0;
+	zc.zc_cleanup_fd = zevent_fd;
+
+	if (flags & ZEVENT_NONBLOCK)
+		zc.zc_guid = ZEVENT_NONBLOCK;
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE) != 0)
+		return (-1);
+
+retry:
+	if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) {
+		switch (errno) {
+		case ESHUTDOWN:
+			error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
+			    dgettext(TEXT_DOMAIN, "zfs shutdown"));
+			goto out;
+		case ENOENT:
+			/* Blocking error case should not occur */
+			if (!(flags & ZEVENT_NONBLOCK))
+				error = zpool_standard_error_fmt(hdl, errno,
+				    dgettext(TEXT_DOMAIN, "cannot get event"));
+
+			goto out;
+		case ENOMEM:
+			if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+				error = zfs_error_fmt(hdl, EZFS_NOMEM,
+				    dgettext(TEXT_DOMAIN, "cannot get event"));
+				goto out;
+			} else {
+				goto retry;
+			}
+		default:
+			error = zpool_standard_error_fmt(hdl, errno,
+			    dgettext(TEXT_DOMAIN, "cannot get event"));
+			goto out;
+		}
+	}
+
+	error = zcmd_read_dst_nvlist(hdl, &zc, nvp);
+	if (error != 0)
+		goto out;
+
+	*dropped = (int)zc.zc_cookie;
+out:
+	zcmd_free_nvlists(&zc);
+
+	return (error);
+}
+
+/*
+ * Clear all events.
+ */
+int
+zpool_events_clear(libzfs_handle_t *hdl, int *count)
+{
+	zfs_cmd_t zc = {"\0"};
+	char msg[1024];
+
+	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+	    "cannot clear events"));
+
+	if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0)
+		return (zpool_standard_error_fmt(hdl, errno, msg));
+
+	if (count != NULL)
+		*count = (int)zc.zc_cookie; /* # of events cleared */
+
+	return (0);
+}
+
+/*
+ * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for
+ * the passed zevent_fd file handle.  On success zero is returned,
+ * otherwise -1 is returned and hdl->libzfs_error is set to the errno.
+ */
+int
+zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd)
+{
+	zfs_cmd_t zc = {"\0"};
+	int error = 0;
+
+	zc.zc_guid = eid;
+	zc.zc_cleanup_fd = zevent_fd;
+
+	if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) {
+		switch (errno) {
+		case ENOENT:
+			error = zfs_error_fmt(hdl, EZFS_NOENT,
+			    dgettext(TEXT_DOMAIN, "cannot get event"));
+			break;
+
+		case ENOMEM:
+			error = zfs_error_fmt(hdl, EZFS_NOMEM,
+			    dgettext(TEXT_DOMAIN, "cannot get event"));
+			break;
+
+		default:
+			error = zpool_standard_error_fmt(hdl, errno,
+			    dgettext(TEXT_DOMAIN, "cannot get event"));
+			break;
+		}
+	}
+
+	return (error);
+}
+
+void
+zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
+    char *pathname, size_t len)
+{
+	zfs_cmd_t zc = {"\0"};
+	boolean_t mounted = B_FALSE;
+	char *mntpnt = NULL;
+	char dsname[MAXNAMELEN];
+
+	if (dsobj == 0) {
+		/* special case for the MOS */
+		(void) snprintf(pathname, len, "<metadata>:<0x%llx>",
+		    (longlong_t)obj);
+		return;
+	}
+
+	/* get the dataset's name */
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	zc.zc_obj = dsobj;
+	if (ioctl(zhp->zpool_hdl->libzfs_fd,
+	    ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
+		/* just write out a path of two object numbers */
+		(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
+		    (longlong_t)dsobj, (longlong_t)obj);
+		return;
+	}
+	(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
+
+	/* find out if the dataset is mounted */
+	mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);
+
+	/* get the corrupted object's path */
+	(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
+	zc.zc_obj = obj;
+	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
+	    &zc) == 0) {
+		if (mounted) {
+			(void) snprintf(pathname, len, "%s%s", mntpnt,
+			    zc.zc_value);
+		} else {
+			(void) snprintf(pathname, len, "%s:%s",
+			    dsname, zc.zc_value);
+		}
+	} else {
+		(void) snprintf(pathname, len, "%s:<0x%llx>", dsname,
+		    (longlong_t)obj);
+	}
+	free(mntpnt);
+}
+
+/*
+ * Read the EFI label from the config, if a label does not exist then
+ * pass back the error to the caller. If the caller has passed a non-NULL
+ * diskaddr argument then we set it to the starting address of the EFI
+ * partition.
+ */
+static int
+read_efi_label(nvlist_t *config, diskaddr_t *sb)
+{
+	char *path;
+	int fd;
+	char diskname[MAXPATHLEN];
+	int err = -1;
+
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
+		return (err);
+
+	(void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT,
+	    strrchr(path, '/'));
+	if ((fd = open(diskname, O_RDWR|O_DIRECT)) >= 0) {
+		struct dk_gpt *vtoc;
+
+		if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
+			if (sb != NULL)
+				*sb = vtoc->efi_parts[0].p_start;
+			efi_free(vtoc);
+		}
+		(void) close(fd);
+	}
+	return (err);
+}
+
+/*
+ * determine where a partition starts on a disk in the current
+ * configuration
+ */
+static diskaddr_t
+find_start_block(nvlist_t *config)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	diskaddr_t sb = MAXOFFSET_T;
+	uint64_t wholedisk;
+
+	if (nvlist_lookup_nvlist_array(config,
+	    ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
+		if (nvlist_lookup_uint64(config,
+		    ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk) != 0 || !wholedisk) {
+			return (MAXOFFSET_T);
+		}
+		if (read_efi_label(config, &sb) < 0)
+			sb = MAXOFFSET_T;
+		return (sb);
+	}
+
+	for (c = 0; c < children; c++) {
+		sb = find_start_block(child[c]);
+		if (sb != MAXOFFSET_T) {
+			return (sb);
+		}
+	}
+	return (MAXOFFSET_T);
+}
+
+int
+zpool_label_disk_wait(char *path, int timeout)
+{
+	struct stat64 statbuf;
+	int i;
+
+	/*
+	 * Wait timeout miliseconds for a newly created device to be available
+	 * from the given path.  There is a small window when a /dev/ device
+	 * will exist and the udev link will not, so we must wait for the
+	 * symlink.  Depending on the udev rules this may take a few seconds.
+	 */
+	for (i = 0; i < timeout; i++) {
+		usleep(1000);
+
+		errno = 0;
+		if ((stat64(path, &statbuf) == 0) && (errno == 0))
+			return (0);
+	}
+
+	return (ENOENT);
+}
+
+int
+zpool_label_disk_check(char *path)
+{
+	struct dk_gpt *vtoc;
+	int fd, err;
+
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0)
+		return (errno);
+
+	if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
+		(void) close(fd);
+		return (err);
+	}
+
+	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
+		efi_free(vtoc);
+		(void) close(fd);
+		return (EIDRM);
+	}
+
+	efi_free(vtoc);
+	(void) close(fd);
+	return (0);
+}
+
+/*
+ * Label an individual disk.  The name provided is the short name,
+ * stripped of any leading /dev path.
+ */
+int
+zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
+{
+	char path[MAXPATHLEN];
+	struct dk_gpt *vtoc;
+	int rval, fd;
+	size_t resv = EFI_MIN_RESV_SIZE;
+	uint64_t slice_size;
+	diskaddr_t start_block;
+	char errbuf[1024];
+
+	/* prepare an error message just in case */
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
+
+	if (zhp) {
+		nvlist_t *nvroot;
+
+#if defined(__sun__) || defined(__sun)
+		if (zpool_is_bootable(zhp)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "EFI labeled devices are not supported on root "
+			    "pools."));
+			return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf));
+		}
+#endif
+
+		verify(nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+
+		if (zhp->zpool_start_block == 0)
+			start_block = find_start_block(nvroot);
+		else
+			start_block = zhp->zpool_start_block;
+		zhp->zpool_start_block = start_block;
+	} else {
+		/* new pool */
+		start_block = NEW_START_BLOCK;
+	}
+
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+
+	if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
+		/*
+		 * This shouldn't happen.  We've long since verified that this
+		 * is a valid device.
+		 */
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "label '%s': unable to open device: %d"), path, errno);
+		return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
+	}
+
+	if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
+		/*
+		 * The only way this can fail is if we run out of memory, or we
+		 * were unable to read the disk's capacity
+		 */
+		if (errno == ENOMEM)
+			(void) no_memory(hdl);
+
+		(void) close(fd);
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
+		    "label '%s': unable to read disk capacity"), path);
+
+		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
+	}
+
+	slice_size = vtoc->efi_last_u_lba + 1;
+	slice_size -= EFI_MIN_RESV_SIZE;
+	if (start_block == MAXOFFSET_T)
+		start_block = NEW_START_BLOCK;
+	slice_size -= start_block;
+	slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT);
+
+	vtoc->efi_parts[0].p_start = start_block;
+	vtoc->efi_parts[0].p_size = slice_size;
+
+	/*
+	 * Why we use V_USR: V_BACKUP confuses users, and is considered
+	 * disposable by some EFI utilities (since EFI doesn't have a backup
+	 * slice).  V_UNASSIGNED is supposed to be used only for zero size
+	 * partitions, and efi_write() will fail if we use it.  V_ROOT, V_BOOT,
+	 * etc. were all pretty specific.  V_USR is as close to reality as we
+	 * can get, in the absence of V_OTHER.
+	 */
+	vtoc->efi_parts[0].p_tag = V_USR;
+	(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
+
+	vtoc->efi_parts[8].p_start = slice_size + start_block;
+	vtoc->efi_parts[8].p_size = resv;
+	vtoc->efi_parts[8].p_tag = V_RESERVED;
+
+	if ((rval = efi_write(fd, vtoc)) != 0 || (rval = efi_rescan(fd)) != 0) {
+		/*
+		 * Some block drivers (like pcata) may not support EFI
+		 * GPT labels.  Print out a helpful error message dir-
+		 * ecting the user to manually label the disk and give
+		 * a specific slice.
+		 */
+		(void) close(fd);
+		efi_free(vtoc);
+
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
+		    "parted(8) and then provide a specific slice: %d"), rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	(void) close(fd);
+	efi_free(vtoc);
+
+	/* Wait for the first expected partition to appear. */
+
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+	(void) zfs_append_partition(path, MAXPATHLEN);
+
+	rval = zpool_label_disk_wait(path, 3000);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
+		    "detect device partitions on '%s': %d"), path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	/* We can't be to paranoid.  Read the label back and verify it. */
+	(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
+	rval = zpool_label_disk_check(path);
+	if (rval) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
+		    "EFI label on '%s' is damaged.  Ensure\nthis device "
+		    "is not in in use, and is functioning properly: %d"),
+		    path, rval);
+		return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
+	}
+
+	return (0);
+}
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index b35428f907cd..2adcb0c0f532 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -63,8 +63,9 @@
 /* in libzfs_dataset.c */
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 
-static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t *,
-    int, const char *, nvlist_t *, avl_tree_t *, char **, int, uint64_t *);
+static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
+    recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
+    uint64_t *);
 
 static const zio_cksum_t zero_cksum = { { 0 } };
 
@@ -2523,7 +2524,7 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
 		 * zfs_receive_one() will take care of it (ie,
 		 * recv_skip() and return 0).
 		 */
-		error = zfs_receive_impl(hdl, destname, flags, fd,
+		error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
 		    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
 		    action_handlep);
 		if (error == ENODATA) {
@@ -2656,9 +2657,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
  */
 static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
-    recvflags_t *flags, dmu_replay_record_t *drr,
-    dmu_replay_record_t *drr_noswap, const char *sendfs,
-    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
+    dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
+    avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
     uint64_t *action_handlep)
 {
 	zfs_cmd_t zc = {"\0"};
@@ -2808,10 +2809,15 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 		}
 		if (flags->verbose)
 			(void) printf("found clone origin %s\n", zc.zc_string);
+	} else if (originsnap) {
+		(void) strncpy(zc.zc_string, originsnap, ZFS_MAXNAMELEN);
+		if (flags->verbose)
+			(void) printf("using provided clone origin %s\n",
+			    zc.zc_string);
 	}
 
 	stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
-	    (drrb->drr_flags & DRR_FLAG_CLONE));
+	    (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
 
 	if (stream_wantsnewfs) {
 		/*
@@ -3189,9 +3195,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
 }
 
 static int
-zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
-    int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl,
-    char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
+zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
+    const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
+    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    uint64_t *action_handlep)
 {
 	int err;
 	dmu_replay_record_t drr, drr_noswap;
@@ -3210,6 +3217,12 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
 		    "(%s) does not exist"), tosnap);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
+	if (originsnap &&
+	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
+		    "(%s) does not exist"), originsnap);
+		return (zfs_error(hdl, EZFS_NOENT, errbuf));
+	}
 
 	/* read in the BEGIN record */
 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
@@ -3282,14 +3295,14 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
 				*cp = '\0';
 			sendfs = nonpackage_sendfs;
 		}
-		return (zfs_receive_one(hdl, infd, tosnap, flags,
-		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl,
-		    top_zfs, cleanup_fd, action_handlep));
+		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
+		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
+		    cleanup_fd, action_handlep));
 	} else {
 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 		    DMU_COMPOUNDSTREAM);
-		return (zfs_receive_package(hdl, infd, tosnap, flags,
-		    &drr, &zcksum, top_zfs, cleanup_fd, action_handlep));
+		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
+		    &zcksum, top_zfs, cleanup_fd, action_handlep));
 	}
 }
 
@@ -3300,14 +3313,15 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
  * (-1 will override -2).
  */
 int
-zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
-    int infd, avl_tree_t *stream_avl)
+zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
+    recvflags_t *flags, int infd, avl_tree_t *stream_avl)
 {
 	char *top_zfs = NULL;
 	int err;
 	int cleanup_fd;
 	uint64_t action_handle = 0;
 	struct stat sb;
+	char *originsnap = NULL;
 
 	/*
 	 * The only way fstat can fail is if we do not have a valid file
@@ -3350,10 +3364,16 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags,
 	}
 #endif /* __linux__ */
 
+	if (props) {
+		err = nvlist_lookup_string(props, "origin", &originsnap);
+		if (err && err != ENOENT)
+			return (err);
+	}
+
 	cleanup_fd = open(ZFS_DEV, O_RDWR);
 	VERIFY(cleanup_fd >= 0);
 
-	err = zfs_receive_impl(hdl, tosnap, flags, infd, NULL, NULL,
+	err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
 	    stream_avl, &top_zfs, cleanup_fd, &action_handle);
 
 	VERIFY(0 == close(cleanup_fd));
diff --git a/man/man8/zfs.8.orig b/man/man8/zfs.8.orig
new file mode 100644
index 000000000000..8d4aee23f38d
--- /dev/null
+++ b/man/man8/zfs.8.orig
@@ -0,0 +1,3902 @@
+'\" t
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
+.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
+.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\"
+.TH zfs 8 "Nov 19, 2013" "ZFS pool 28, filesystem 5" "System Administration Commands"
+.SH NAME
+zfs \- configures ZFS file systems
+.SH SYNOPSIS
+.LP
+.nf
+\fBzfs\fR [\fB-?\fR]
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBcreate\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBcreate\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fB-V\fR \fIsize\fR \fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBdestroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBdestroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,...]
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBdestroy\fR \fIfilesystem\fR|\fIvolume\fR#\fIbookmark\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBsnapshot | snap\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... 
+      \fIfilesystem@snapname\fR|\fIvolume@snapname\fR ...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrollback\fR [\fB-rRf\fR] \fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBclone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBpromote\fR \fIclone-filesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+     \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIproperty\fR[,\fIproperty\fR]...] [\fB-t\fR \fItype\fR[,\fItype\fR]..]
+     [\fB-s\fR \fIproperty\fR] ... [\fB-S\fR \fIproperty\fR] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ...
+.fi
+
+.LP
+.nf
++\fBzfs\fR \fBset\fR \fIproperty\fR=\fIvalue\fR... \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBget\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR][\fB-Hp\fR][\fB-o\fR \fIfield\fR[,...]] [\fB-t\fR \fItype\fR[,...]] 
+    [\fB-s\fR \fIsource\fR[,...]] "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBinherit\fR [\fB-rS\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume|snapshot\fR ...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBupgrade\fR [\fB-v\fR]
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBupgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] \fB-a\fR | \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBuserspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIfield\fR] ...
+    [\fB-S\fR \fIfield\fR] ... [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBgroupspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]] [\fB-s\fR \fIfield\fR] ...
+    [\fB-S\fR \fIfield\fR] ... [\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBmount\fR 
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBmount\fR [\fB-vO\fR] [\fB-o \fIoptions\fR\fR] \fB-a\fR | \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunmount | umount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBshare\fR \fB-a\fR | \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunshare\fR \fB-a\fR \fIfilesystem\fR|\fImountpoint\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBbookmark\fR \fIsnapshot\fR \fIbookmark\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBsend\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBreceive | recv\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBallow\fR \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBallow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|\fI@setname\fR[,...] 
+     \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBallow\fR [\fB-ld\fR] \fB-e\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBallow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBallow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[,... ]] 
+     \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunallow\fR [\fB-rld\fR] \fB-e\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[ ... ]] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBunallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,... ]] \fIfilesystem\fR|\fIvolume\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBhold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBholds\fR [\fB-r\fR] \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBrelease\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBdiff\fR [\fB-FHt\fR] \fIsnapshot\fR \fIsnapshot|filesystem\fR
+
+.SH DESCRIPTION
+.LP
+The \fBzfs\fR command configures \fBZFS\fR datasets within a \fBZFS\fR storage pool, as described in \fBzpool\fR(8). A dataset is identified by a unique path within the \fBZFS\fR namespace. For example:
+.sp
+.in +2
+.nf
+pool/{filesystem,volume,snapshot}
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+where the maximum length of a dataset name is \fBMAXNAMELEN\fR (256 bytes).
+.sp
+.LP
+A dataset can be one of the following:
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfile system\fR\fR
+.ad
+.sp .6
+.RS 4n
+A \fBZFS\fR dataset of type \fBfilesystem\fR can be mounted within the standard system namespace and behaves like other file systems. While \fBZFS\fR file systems are designed to be \fBPOSIX\fR compliant, known issues exist that prevent compliance in some cases. Applications that depend on standards conformance might fail due to nonstandard behavior when checking file system free space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+A logical volume exported as a raw or block device. This type of dataset should only be used under special circumstances. File systems are typically used in most environments.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+A read-only version of a file system or volume at a given point in time. It is specified as \fIfilesystem@name\fR or \fIvolume@name\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIbookmark\fR\fR
+.ad
+.sp .6
+.RS 4n
+Much like a \fIsnapshot\fR, but without the hold on on-disk data. It can be used as the source of a send (but not for a receive).
+It is specified as \fIfilesystem#name\fR or \fIvolume#name\fR.
+.RE
+
+.SS "ZFS File System Hierarchy"
+.LP
+A \fBZFS\fR storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the \fBZFS\fR file system hierarchy.
+.sp
+.LP
+The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the \fBzpool\fR(8) command.
+.sp
+.LP
+See \fBzpool\fR(8) for more information on creating and administering pools.
+.SS "Snapshots"
+.LP
+A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset.
+.sp
+.LP
+Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back.  Visibility is determined by the \fBsnapdev\fR property of the parent volume.
+.sp
+.LP
+File system snapshots can be accessed under the \fB\&.zfs/snapshot\fR directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The visibility of the \fB\&.zfs\fR directory can be controlled by the \fBsnapdir\fR property.
+.SS "Bookmarks"
+.LP
+A bookmark is like a snapshot, a read-only copy of a file system or volume. Bookmarks can be created extremely quickly, compared to snapshots, and they consume no additional space within the pool.  Bookmarks can also have arbitrary names, much like snapshots.
+.sp
+.LP
+Unlike snapshots, bookmarks can not be accessed through the filesystem in any way. From a storage standpoint a bookmark just provides a way to reference when a snapshot was created as a distinct object.  Bookmarks are initially tied to a snapshot, not the filesystem/volume, and they will survive if the snapshot itself is destroyed.  Since they are very light weight there's little incentive to destroy them.
+.SS "Clones"
+.LP
+A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space.
+.sp
+.LP
+Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The \fBorigin\fR property exposes this dependency, and the \fBdestroy\fR command lists any such dependencies, if they exist.
+.sp
+.LP
+The clone parent-child dependency relationship can be reversed by using the \fBpromote\fR subcommand. This causes the "origin" file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from.
+.SS "Mount Points"
+.LP
+Creating a \fBZFS\fR file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, \fBZFS\fR automatically manages mounting and unmounting file systems without the need to edit the \fB/etc/fstab\fR file. All automatically managed file systems are mounted by \fBZFS\fR at boot time.
+.sp
+.LP
+By default, file systems are mounted under \fB/\fIpath\fR\fR, where \fIpath\fR is the name of the file system in the \fBZFS\fR namespace. Directories are created and destroyed as needed.
+.sp
+.LP
+A file system can also have a mount point set in the \fBmountpoint\fR property. This directory is created as needed, and \fBZFS\fR automatically mounts the file system when the \fBzfs mount -a\fR command is invoked (without editing \fB/etc/fstab\fR). The \fBmountpoint\fR property can be inherited, so if \fBpool/home\fR has a mount point of \fB/export/stuff\fR, then \fBpool/home/user\fR automatically inherits a mount point of \fB/export/stuff/user\fR.
+.sp
+.LP
+A file system \fBmountpoint\fR property of \fBnone\fR prevents the file system from being mounted.
+.sp
+.LP
+If needed, \fBZFS\fR file systems can also be managed with traditional tools (\fBmount\fR, \fBumount\fR, \fB/etc/fstab\fR). If a file system's mount point is set to \fBlegacy\fR, \fBZFS\fR makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system.
+.SS "Deduplication"
+.LP
+Deduplication is the process for removing redundant data at the block-level, reducing the total amount of data stored. If a file system has the \fBdedup\fR property enabled, duplicate data blocks are removed synchronously.  The result is that only unique data is stored and common components are shared among files.
+.sp
+\fBWARNING: DO NOT ENABLE DEDUPLICATION UNLESS YOU NEED IT AND KNOW EXACTLY WHAT YOU ARE DOING!\fR
+.sp
+Deduplicating data is a very resource-intensive operation. It is generally recommended that you have \fIat least\fR 1.25 GB of RAM per 1 TB of storage when you enable deduplication. But calculating the exact requirenments is a somewhat complicated affair. Please see the \fBOracle Dedup Guide\fR for more information..
+.sp
+Enabling deduplication on an improperly-designed system will result in extreme performance issues (extremely slow filesystem and snapshot deletions etc.) and can potentially lead to data loss (i.e. unimportable pool due to memory exhaustion) if your system is not built for this purpose. Deduplication affects the processing power (CPU), disks (and the controller) as well as primary (real) memory.
+.sp
+Before creating a pool with deduplication enabled, ensure that you have planned your hardware requirements appropriately and implemented appropriate recovery practices, such as regular backups.
+.sp
+Unless necessary, deduplication should NOT be enabled on a system. Instead, consider using \fIcompression=lz4\fR, as a less resource-intensive alternative.
+.SS "Native Properties"
+.LP
+Properties are divided into two types, native properties and user-defined (or "user") properties. Native properties either export internal statistics or control \fBZFS\fR behavior. In addition, native properties are either editable or read-only. User properties have no effect on \fBZFS\fR behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the "User Properties" section, below.
+.sp
+.LP
+Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets (file systems, volumes, or snapshots).
+.sp
+.LP
+The values of numeric properties can be specified using human-readable suffixes (for example, \fBk\fR, \fBKB\fR, \fBM\fR, \fBGb\fR, and so forth, up to \fBZ\fR for zettabyte). The following are all valid (and equal) specifications: 
+.sp
+.in +2
+.nf
+1536M, 1.5g, 1.50GB
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+The values of non-numeric properties are case sensitive and must be lowercase, except for \fBmountpoint\fR, \fBsharenfs\fR, and \fBsharesmb\fR.
+.sp
+.LP
+The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBavailable\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool.
+.sp
+This property can also be referred to by its shortened column name, \fBavail\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcompressratio\fR\fR
+.ad
+.sp .6
+.RS 4n
+For non-snapshots, the compression ratio achieved for the \fBused\fR space of this dataset, expressed as a multiplier.  The \fBused\fR property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot.  For snapshots, the \fBcompressratio\fR is the same as the \fBrefcompressratio\fR property.  Compression can be turned on by running: \fBzfs set compression=on \fIdataset\fR\fR. The default value is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcreation\fR\fR
+.ad
+.sp .6
+.RS 4n
+The time this dataset was created.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBclones\fR\fR
+.ad
+.sp .6
+.RS 4n
+For snapshots, this property is a comma-separated list of filesystems or
+volumes which are clones of this snapshot.  The clones' \fBorigin\fR property
+is this snapshot.  If the \fBclones\fR property is not empty, then this
+snapshot can not be destroyed (even with the \fB-r\fR or \fB-f\fR options).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBdefer_destroy\fR\fR
+.ad
+.sp .6
+.RS 4n
+This property is \fBon\fR if the snapshot has been marked for deferred destruction by using the \fBzfs destroy\fR \fB-d\fR command. Otherwise, the property is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBfilesystem_count\fR
+.ad
+.sp .6
+.RS 4n
+The total number of filesystems and volumes that exist under this location in the
+dataset tree.  This value is only available when a \fBfilesystem_limit\fR has
+been set somewhere in the tree under which the dataset resides.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBlogicalreferenced\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space that is "logically" accessible by this dataset.  See
+the \fBreferenced\fR property.  The logical space ignores the effect of
+the \fBcompression\fR and \fBcopies\fR properties, giving a quantity
+closer to the amount of data that applications see.  However, it does
+include space consumed by metadata.
+.sp
+This property can also be referred to by its shortened column name,
+\fBlrefer\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBlogicalused\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space that is "logically" consumed by this dataset and all
+its descendents.  See the \fBused\fR property.  The logical space
+ignores the effect of the \fBcompression\fR and \fBcopies\fR properties,
+giving a quantity closer to the amount of data that applications see.
+However, it does include space consumed by metadata.
+.sp
+This property can also be referred to by its shortened column name,
+\fBlused\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBmounted\fR\fR
+.ad
+.sp .6
+.RS 4n
+For file systems, indicates whether the file system is currently mounted. This property can be either \fByes\fR or \fBno\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBorigin\fR\fR
+.ad
+.sp .6
+.RS 4n
+For cloned file systems or volumes, the snapshot from which the clone was created. See also the \fBclones\fR property.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBreferenced\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical.
+.sp
+This property can also be referred to by its shortened column name, \fBrefer\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBrefcompressratio\fR\fR
+.ad
+.sp .6
+.RS 4n
+The compression ratio achieved for the \fBreferenced\fR space of this
+dataset, expressed as a multiplier.  See also the \fBcompressratio\fR
+property.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsnapshot_count\fR
+.ad
+.sp .6
+.RS 4n
+The total number of snapshots that exist under this location in the dataset tree.
+This value is only available when a \fBsnapshot_limit\fR has been set somewhere
+in the tree under which the dataset resides.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBtype\fR\fR
+.ad
+.sp .6
+.RS 4n
+The type of dataset: \fBfilesystem\fR, \fBvolume\fR, or \fBsnapshot\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBused\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space consumed by this dataset and all its descendents. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that are freed if this dataset is recursively destroyed, is the greater of its space used and its reservation.
+.sp
+When snapshots (see the "Snapshots" section) are created, their space is initially shared between the snapshot and the file system, and possibly with previous snapshots. As the file system changes, space that was previously shared becomes unique to the snapshot, and counted in the snapshot's space used. Additionally, deleting snapshots can increase the amount of space unique to (and used by) other snapshots.
+.sp
+The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using \fBfsync\fR(2) or \fBO_SYNC\fR does not necessarily guarantee that the space usage information is updated immediately.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBusedby*\fR\fR
+.ad
+.sp .6
+.RS 4n
+The \fBusedby*\fR properties decompose the \fBused\fR properties into the various reasons that space is used. Specifically, \fBused\fR = \fBusedbychildren\fR + \fBusedbydataset\fR + \fBusedbyrefreservation\fR +, \fBusedbysnapshots\fR. These properties are only available for datasets created on \fBzpool\fR "version 13" pools.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBusedbychildren\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBusedbydataset\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space used by this dataset itself, which would be freed if the dataset were destroyed (after first removing any \fBrefreservation\fR and destroying any necessary snapshots or descendents).
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBusedbyrefreservation\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space used by a \fBrefreservation\fR set on this dataset, which would be freed if the \fBrefreservation\fR was removed.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBusedbysnapshots\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' \fBused\fR properties because space can be shared by multiple snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBuserused@\fR\fIuser\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by \fBls\fR \fB-l\fR. The amount of space charged is displayed by \fBdu\fR and \fBls\fR \fB-s\fR. See the \fBzfs userspace\fR subcommand for more information.
+.sp
+Unprivileged users can access only their own space usage. The root user, or a user who has been granted the \fBuserused\fR privilege with \fBzfs allow\fR, can access everyone's usage.
+.sp
+The \fBuserused@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fIPOSIX name\fR (for example, \fBjoe\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fIPOSIX numeric ID\fR (for example, \fB789\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fISID name\fR (for example, \fBjoe.smith@mydomain\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR)
+.RE
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBuserrefs\fR\fR
+.ad
+.sp .6
+.RS 4n
+This property is set to the number of user holds on this snapshot. User holds are set by using the \fBzfs hold\fR command.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBgroupused@\fR\fIgroup\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by \fBls\fR \fB-l\fR. See the \fBuserused@\fR\fIuser\fR property for more information.
+.sp
+Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBgroupused\fR privilege with \fBzfs allow\fR, can access all groups' usage.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBvolblocksize\fR=\fIblocksize\fR\fR
+.ad
+.sp .6
+.RS 4n
+For volumes, specifies the block size of the volume. The \fBblocksize\fR cannot be changed once the volume has been written, so it should be set at volume creation time. The default \fBblocksize\fR for volumes is 8 Kbytes. Any power of 2 from 512 bytes to 128 Kbytes is valid.
+.sp
+This property can also be referred to by its shortened column name, \fBvolblock\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBwritten\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of \fBreferenced\fR space written to this dataset since the
+previous snapshot.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBwritten@\fR\fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+The amount of \fBreferenced\fR space written to this dataset since the
+specified snapshot.  This is the space that is referenced by this dataset
+but was not referenced by the specified snapshot.
+.sp
+The \fIsnapshot\fR may be specified as a short snapshot name (just the part
+after the \fB@\fR), in which case it will be interpreted as a snapshot in
+the same filesystem as this dataset.
+The \fIsnapshot\fR be a full snapshot name (\fIfilesystem\fR@\fIsnapshot\fR),
+which for clones may be a snapshot in the origin's filesystem (or the origin
+of the origin's filesystem, etc).
+.RE
+
+.sp
+.LP
+The following native properties can be used to change the behavior of a \fBZFS\fR dataset.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBaclinherit\fR=\fBdiscard\fR | \fBnoallow\fR | \fBrestricted\fR | \fBpassthrough\fR | \fBpassthrough-x\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls how \fBACL\fR entries are inherited when files and directories are created. A file system with an \fBaclinherit\fR property of \fBdiscard\fR does not inherit any \fBACL\fR entries. A file system with an \fBaclinherit\fR property value of \fBnoallow\fR only inherits inheritable \fBACL\fR entries that specify "deny" permissions. The property value \fBrestricted\fR (the default) removes the \fBwrite_acl\fR and \fBwrite_owner\fR permissions when the \fBACL\fR entry is inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough\fR inherits all inheritable \fBACL\fR entries without any modifications made to the \fBACL\fR entries when they are inherited. A file system with an \fBaclinherit\fR property value of \fBpassthrough-x\fR has the same meaning as \fBpassthrough\fR, except that the \fBowner@\fR, \fBgroup@\fR, and \fBeveryone@\fR \fBACE\fRs inherit the execute permission only if the file creation mode also requests the execute bit.
+.sp
+When the property value is set to \fBpassthrough\fR, files are created with a mode determined by the inheritable \fBACE\fRs. If no inheritable \fBACE\fRs exist that affect the mode, then the mode is set in accordance to the requested mode from the application.
+.sp
+The \fBaclinherit\fR property does not apply to Posix ACLs.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBacltype\fR=\fBnoacl\fR | \fBposixacl\fR \fR
+.ad
+.sp .6
+.RS 4n
+Controls whether ACLs are enabled and if so what type of ACL to use.  When
+a file system has the \fBacltype\fR property set to \fBnoacl\fR (the default)
+then ACLs are disabled.  Setting the \fBacltype\fR property to \fBposixacl\fR
+indicates Posix ACLs should be used.  Posix ACLs are specific to Linux and
+are not functional on other platforms.  Posix ACLs are stored as an xattr and
+therefore will not overwrite any existing ZFS/NFSv4 ACLs which may be set.
+Currently only \fBposixacls\fR are supported on Linux.
+.sp
+To obtain the best performance when setting \fBposixacl\fR users are strongly
+encouraged to set the \fBxattr=sa\fR property.  This will result in the
+Posix ACL being stored more efficiently on disk.  But as a consequence of this
+all new xattrs will only be accessible from ZFS implementations which support
+the \fBxattr=sa\fR property.  See the \fBxattr\fR property for more details.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBatime\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The default value is \fBon\fR.  See also \fBrelatime\fR below.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcanmount\fR=\fBon\fR | \fBoff\fR | \fBnoauto\fR\fR
+.ad
+.sp .6
+.RS 4n
+If this property is set to \fBoff\fR, the file system cannot be mounted, and is ignored by \fBzfs mount -a\fR. Setting this property to \fBoff\fR is similar to setting the \fBmountpoint\fR property to \fBnone\fR, except that the dataset still has a normal \fBmountpoint\fR property, which can be inherited. Setting this property to \fBoff\fR allows datasets to be used solely as a mechanism to inherit properties. One example of setting \fBcanmount=\fR\fBoff\fR is to have two datasets with the same \fBmountpoint\fR, so that the children of both datasets appear in the same directory, but might have different inherited characteristics.
+.sp
+When the \fBnoauto\fR option is set, a dataset can only be mounted and unmounted explicitly. The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the \fBzfs mount -a\fR command or unmounted by the \fBzfs unmount -a\fR command.
+.sp
+This property is not inherited.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2,\fR| \fBfletcher4\fR | \fBsha256\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice.
+.sp
+Changing this property affects only newly-written data.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcompression\fR=\fBon\fR | \fBoff\fR | \fBlzjb\fR | \fBlz4\fR |
+\fBgzip\fR | \fBgzip-\fR\fIN\fR | \fBzle\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the compression algorithm used for this dataset.
+.sp
+Setting compression to \fBon\fR indicates that the current default
+compression algorithm should be used.  The default balances compression
+and decompression speed, with compression ratio and is expected to
+work well on a wide variety of workloads.  Unlike all other settings for
+this property, \fBon\fR does not select a fixed compression type.  As
+new compression algorithms are added to ZFS and enabled on a pool, the
+default compression algorithm may change.  The current default compression
+algorthm is either \fBlzjb\fR or, if the \fBlz4_compress\fR feature is
+enabled, \fBlz4\fR.
+.sp
+The \fBlzjb\fR compression algorithm is optimized for performance while
+providing decent data compression.
+.sp
+The \fBlz4\fR compression algorithm is a high-performance replacement
+for the \fBlzjb\fR algorithm. It features significantly faster
+compression and decompression, as well as a moderately higher
+compression ratio than \fBlzjb\fR, but can only be used on pools with
+the \fBlz4_compress\fR feature set to \fIenabled\fR. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBlz4_compress\fR feature.
+.sp
+The \fBgzip\fR compression algorithm uses the same compression as
+the \fBgzip\fR(1) command. You can specify the \fBgzip\fR level by using the
+value \fBgzip-\fR\fIN\fR where \fIN\fR is an integer from 1 (fastest) to 9
+(best compression ratio). Currently, \fBgzip\fR is equivalent to \fBgzip-6\fR
+(which is also the default for \fBgzip\fR(1)). The \fBzle\fR compression
+algorithm compresses runs of zeros.
+.sp
+This property can also be referred to by its shortened column name
+\fBcompress\fR. Changing this property affects only newly-written data.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcopies\fR=\fB1\fR | \fB2\fR | \fB3\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the \fBused\fR property and counting against quotas and reservations.
+.sp
+Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the \fB-o\fR \fBcopies=\fR\fIN\fR option.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBdedup\fR=\fBon\fR | \fBoff\fR | \fBverify\fR | \fBsha256\fR[,\fBverify\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether deduplication is in effect for a dataset. The default value is \fBoff\fR. The default checksum used for deduplication is \fBsha256\fR (subject to change). When \fBdedup\fR is enabled, the \fBdedup\fR checksum algorithm overrides the \fBchecksum\fR property. Setting the value to \fBverify\fR is equivalent to specifying \fBsha256,verify\fR.
+.sp
+If the property is set to \fBverify\fR, then, whenever two blocks have the same signature, ZFS will do a byte-for-byte comparison with the existing block to ensure that the contents are identical.
+.sp
+Unless necessary, deduplication should NOT be enabled on a system. See \fBDeduplication\fR above.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBdevices\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether device nodes can be opened on this file system. The default value is \fBon\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBexec\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether processes can be executed from within this file system. The default value is \fBon\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBmlslabel\fR=\fIlabel\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+The \fBmlslabel\fR property is a sensitivity label that determines if a dataset  can be mounted in a zone on a system with Trusted Extensions enabled. If the labeled dataset matches the labeled zone, the dataset can be mounted  and accessed from the labeled zone.
+.sp
+When the \fBmlslabel\fR property is not set, the default value is \fBnone\fR. Setting the  \fBmlslabel\fR property to \fBnone\fR is equivalent to removing the property.
+.sp
+The \fBmlslabel\fR property can be modified only when Trusted Extensions is enabled and only with appropriate privilege. Rights to modify it cannot be delegated. When changing a label to a higher label or setting the initial dataset label, the \fB{PRIV_FILE_UPGRADE_SL}\fR privilege is required. When changing a label to a lower label or the default (\fBnone\fR), the \fB{PRIV_FILE_DOWNGRADE_SL}\fR privilege is required. Changing the dataset to labels other than the default can be done only when the dataset is not mounted. When a dataset with the default label is mounted into a labeled-zone, the mount operation automatically sets the \fBmlslabel\fR property to the label of that zone.
+.sp
+When Trusted Extensions is \fBnot\fR enabled, only datasets with the default label (\fBnone\fR) can be mounted.
+.sp
+Zones are a Solaris feature and are not relevant on Linux.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBfilesystem_limit\fR=\fIcount\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the number of filesystems and volumes that can exist under this point in
+the dataset tree.  The limit is not enforced if the user is allowed to change
+the limit. Setting a filesystem_limit on a descendent of a filesystem that
+already has a filesystem_limit does not override the ancestor's filesystem_limit,
+but rather imposes an additional limit. This feature must be enabled to be used
+(see \fBzpool-features\fR(5)).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBmountpoint\fR=\fIpath\fR | \fBnone\fR | \fBlegacy\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the mount point used for this file system. See the "Mount Points" section for more information on how this property is used. 
+.sp
+When the \fBmountpoint\fR property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is \fBlegacy\fR, then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was previously \fBlegacy\fR or \fBnone\fR, or if they were mounted before the property was changed. In addition, any shared file systems are unshared and shared in the new location.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBnbmand\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the file system should be mounted with \fBnbmand\fR (Non Blocking mandatory locks). This is used for \fBCIFS\fR clients. Changes to this property only take effect when the file system is umounted and remounted. See \fBmount\fR(8) for more information on \fBnbmand\fR mounts.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBprimarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls what is cached in the primary cache (ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBquota\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the amount of space a dataset and its descendents can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendents, including file systems and snapshots. Setting a quota on a descendent of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit.
+.sp
+Quotas cannot be set on volumes, as the \fBvolsize\fR property acts as an implicit quota.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsnapshot_limit\fR=\fIcount\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the number of snapshots that can be created on a dataset and its
+descendents. Setting a snapshot_limit on a descendent of a dataset that already
+has a snapshot_limit does not override the ancestor's snapshot_limit, but
+rather imposes an additional limit. The limit is not enforced if the user is
+allowed to change the limit. For example, this means that recursive snapshots
+taken from the global zone are counted against each delegated dataset within
+a zone. This feature must be enabled to be used (see \fBzpool-features\fR(5)).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBuserquota@\fR\fIuser\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the amount of space consumed by the specified user. Similar to the \fBrefquota\fR property, the \fBuserquota\fR space calculation does not include space that is used by descendent datasets, such as snapshots and clones. User space consumption is identified by the \fBuserspace@\fR\fIuser\fR property.
+.sp
+Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the \fBEDQUOT\fR error message . See the \fBzfs userspace\fR subcommand for more information.
+.sp
+Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the \fBuserquota\fR privilege with \fBzfs allow\fR, can get and set everyone's quota.
+.sp
+This property is not available on volumes, on file systems before version 4, or on pools before version 15. The \fBuserquota@\fR... properties are not displayed by \fBzfs get all\fR. The user's name must be appended after the \fB@\fR symbol, using one of the following forms:
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fIPOSIX name\fR (for example, \fBjoe\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fIPOSIX numeric ID\fR (for example, \fB789\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fISID name\fR (for example, \fBjoe.smith@mydomain\fR)
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fISID numeric ID\fR (for example, \fBS-1-123-456-789\fR)
+.RE
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBgroupquota@\fR\fIgroup\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the amount of space consumed by the specified group. Group space consumption is identified by the \fBuserquota@\fR\fIuser\fR property.
+.sp
+Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the \fBgroupquota\fR privilege with \fBzfs allow\fR, can get and set all groups' quotas.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBreadonly\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether this dataset can be modified. The default value is \fBoff\fR.
+.sp
+This property can also be referred to by its shortened column name, \fBrdonly\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBrecordsize\fR=\fIsize\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. \fBZFS\fR automatically tunes block sizes according to internal algorithms optimized for typical access patterns. 
+.sp
+For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a \fBrecordsize\fR greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance.
+.sp
+The size specified must be a power of two greater than or equal to 512 and less than or equal to 128 Kbytes.
+.sp
+Changing the file system's \fBrecordsize\fR affects only files created afterward; existing files are unaffected.
+.sp
+This property can also be referred to by its shortened column name, \fBrecsize\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBredundant_metadata\fR=\fBall\fR | \fBmost\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls what types of metadata are stored redundantly.  ZFS stores an
+extra copy of metadata, so that if a single block is corrupted, the
+amount of user data lost is limited.  This extra copy is in addition to
+any redundancy provided at the pool level (e.g. by mirroring or RAID-Z),
+and is in addition to an extra copy specified by the \fBcopies\fR
+property (up to a total of 3 copies).  For example if the pool is
+mirrored, \fBcopies\fR=2, and \fBredundant_metadata\fR=most, then ZFS
+stores 6 copies of most metadata, and 4 copies of data and some
+metadata.
+.sp
+When set to \fBall\fR, ZFS stores an extra copy of all metadata.  If a
+single on-disk block is corrupt, at worst a single block of user data
+(which is \fBrecordsize\fR bytes long) can be lost.
+.sp
+When set to \fBmost\fR, ZFS stores an extra copy of most types of
+metadata.  This can improve performance of random writes, because less
+metadata must be written.  In practice, at worst about 100 blocks (of
+\fBrecordsize\fR bytes each) of user data can be lost if a single
+on-disk block is corrupt.  The exact behavior of which metadata blocks
+are stored redundantly may change in future releases.
+.sp
+The default value is \fBall\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBrefquota\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendents, including file systems and snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBrefreservation\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+The minimum amount of space guaranteed to a dataset, not including its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by \fBrefreservation\fR. The \fBrefreservation\fR reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations.
+.sp
+If \fBrefreservation\fR is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of "referenced" bytes in the dataset.
+.sp
+This property can also be referred to by its shortened column name, \fBrefreserv\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBrelatime\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the manner in which the access time is updated when \fBatime=on\fR is set.  Turning this property \fBon\fR causes the access time to be updated relative to the modify or change time.  Access time is only updated if the previous access time was earlier than the current modify or change time or if the existing access time hasn't been updated within the past 24 hours.  The default value is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBreservation\fR=\fIsize\fR | \fBnone\fR\fR
+.ad
+.sp .6
+.RS 4n
+The minimum amount of space guaranteed to a dataset and its descendents. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations.
+.sp
+This property can also be referred to by its shortened column name, \fBreserv\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsecondarycache\fR=\fBall\fR | \fBnone\fR | \fBmetadata\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls what is cached in the secondary cache (L2ARC). If this property is set to \fBall\fR, then both user data and metadata is cached. If this property is set to \fBnone\fR, then neither user data nor metadata is cached. If this property is set to \fBmetadata\fR, then only metadata is cached. The default value is \fBall\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsetuid\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the set-\fBUID\fR bit is respected for the file system. The default value is \fBon\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsharesmb\fR=\fBon\fR | \fBoff\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the file system is shared by using \fBSamba USERSHARES\fR, and what options are to be used. Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the \fBnet\fR(8) command is invoked to create a \fBUSERSHARE\fR.
+.sp
+Because \fBSMB\fR shares requires a resource name, a unique resource name is constructed from the dataset name. The constructed name is a copy of the dataset name except that the characters in the dataset name, which would be illegal in the resource name, are replaced with underscore (\fB_\fR) characters. The ZFS On Linux driver does not (yet) support additional options which might be available in the Solaris version.
+.sp
+If the \fBsharesmb\fR property is set to \fBoff\fR, the file systems are unshared.
+.sp
+In Linux, the share is created with the ACL (Access Control List) "Everyone:F" ("F" stands for "full permissions", ie. read and write permissions) and no guest access (which means samba must be able to authenticate a real user, system passwd/shadow, ldap or smbpasswd based) by default. This means that any additional access control (dissalow specific user specific access etc) must be done on the underlaying filesystem.
+.sp
+.in +2
+Example to mount a SMB filesystem shared through ZFS (share/tmp):
+.mk
+Note that a user and his/her password \fBmust\fR be given!
+.sp
+.in +2
+smbmount //127.0.0.1/share_tmp /mnt/tmp -o user=workgroup/turbo,password=obrut,uid=1000
+.in -2
+.in -2
+.sp
+.ne 2
+.mk
+.na
+\fBMinimal /etc/samba/smb.conf configuration\fR
+.sp
+.in +2
+* Samba will need to listen to 'localhost' (127.0.0.1) for the zfs utilities to communitate with samba.  This is the default behavior for most Linux distributions.
+.sp
+* Samba must be able to authenticate a user. This can be done in a number of ways, depending on if using the system password file, LDAP or the Samba specific smbpasswd file. How to do this is outside the scope of this manual. Please refer to the smb.conf(5) manpage for more information.
+.sp
+* See the \fBUSERSHARE\fR section of the \fBsmb.conf\fR(5) man page for all configuration options in case you need to modify any options to the share afterwards. Do note that any changes done with the 'net' command will be undone if the share is every unshared (such as at a reboot etc). In the future, ZoL will be able to set specific options directly using sharesmb=<option>.
+.sp
+.in -2
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsharenfs\fR=\fBon\fR | \fBoff\fR | \fIopts\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the file system is shared via \fBNFS\fR, and what options are used. A file system with a \fBsharenfs\fR property of \fBoff\fR is managed with the \fBexportfs\fR(8) command and entries in \fB/etc/exports\fR file. Otherwise, the file system is automatically shared and unshared with the \fBzfs share\fR and \fBzfs unshare\fR commands. If the property is set to \fBon\fR, the dataset is shared using the \fBexportfs\fR(8) command in the following manner (see \fBexportfs\fR(8) for the meaning of the different options):
+.sp
+.in +4
+.nf
+/usr/sbin/exportfs -i -o sec=sys,rw,no_subtree_check,no_root_squash,mountpoint *:<mountpoint of dataset>
+.fi
+.in -4
+.sp
+Otherwise, the \fBexportfs\fR(8) command is invoked with options equivalent to the contents of this property.
+.sp
+When the \fBsharenfs\fR property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options, only if the property was previously \fBoff\fR, or if they were shared before the property was changed. If the new property is \fBoff\fR, the file systems are unshared.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBlogbias\fR = \fBlatency\fR | \fBthroughput\fR\fR
+.ad
+.sp .6
+.RS 4n
+Provide a hint to ZFS about handling of synchronous requests in this dataset. If \fBlogbias\fR is set to \fBlatency\fR (the default), ZFS will use pool log devices (if configured) to handle the requests at low latency. If \fBlogbias\fR is set to \fBthroughput\fR, ZFS will not use configured pool log devices. ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsnapdev\fR=\fBhidden\fR | \fBvisible\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the snapshots devices of zvol's are hidden or visible. The default value is \fBhidden\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsnapdir\fR=\fBhidden\fR | \fBvisible\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the \fB\&.zfs\fR directory is hidden or visible in the root of the file system as discussed in the "Snapshots" section. The default value is \fBhidden\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBsync\fR=\fBstandard\fR | \fBalways\fR | \fBdisabled\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls the behavior of synchronous requests (e.g. fsync, O_DSYNC).
+\fBstandard\fR is the POSIX specified behavior of ensuring all synchronous
+requests are written to stable storage and all devices are flushed to ensure
+data is not cached by device controllers (this is the default). \fBalways\fR
+causes every file system transaction to be written and flushed before its
+system call returns. This has a large performance penalty. \fBdisabled\fR
+disables synchronous requests. File system transactions are only committed to
+stable storage periodically. This option will give the highest performance.
+However, it is very dangerous as ZFS would be ignoring the synchronous
+transaction demands of applications such as databases or NFS.  Administrators
+should only use this option when the risks are understood.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBversion\fR=\fB1\fR | \fB2\fR | \fBcurrent\fR\fR
+.ad
+.sp .6
+.RS 4n
+The on-disk version of this file system, which is independent of the pool version. This property can only be set to later supported versions. See the \fBzfs upgrade\fR command.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBvolsize\fR=\fIsize\fR\fR
+.ad
+.sp .6
+.RS 4n
+For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. For storage pools with a version number of 9 or higher, a \fBrefreservation\fR is set instead. Any changes to \fBvolsize\fR are reflected in an equivalent change to the reservation (or \fBrefreservation\fR). The \fBvolsize\fR can only be set to a multiple of \fBvolblocksize\fR, and cannot be zero.
+.sp
+The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when the volume size is changed while it is in use (particularly when shrinking the size). Extreme care should be used when adjusting the volume size.
+.sp
+Though not recommended, a "sparse volume" (also known as "thin provisioning") can be created by specifying the \fB-s\fR option to the \fBzfs create -V\fR command, or by changing the reservation after the volume has been created. A "sparse volume" is a volume where the reservation is less then the volume size. Consequently, writes to a sparse volume can fail with \fBENOSPC\fR when the pool is low on space. For a sparse volume, changes to \fBvolsize\fR are not reflected in the reservation.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBvscan\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether regular files should be scanned for viruses when a file is opened and closed. In addition to enabling this property, the virus scan service must also be enabled for virus scanning to occur. The default value is \fBoff\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBxattr\fR=\fBon\fR | \fBoff\fR | \fBsa\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether extended attributes are enabled for this file system.  Two
+styles of extended attributes are supported either directory based or system
+attribute based.
+.sp
+The default value of \fBon\fR enables directory based extended attributes.
+This style of xattr imposes no practical limit on either the size or number of
+xattrs which may be set on a file.  Although under Linux the \fBgetxattr\fR(2)
+and \fBsetxattr\fR(2) system calls limit the maximum xattr size to 64K.  This
+is the most compatible style of xattr and it is supported by the majority of
+ZFS implementations.
+.sp
+System attribute based xattrs may be enabled by setting the value to \fBsa\fR.
+The key advantage of this type of xattr is improved performance.  Storing
+xattrs as system attributes significantly decreases the amount of disk IO
+required.  Up to 64K of xattr data may be stored per file in the space reserved
+for system attributes.  If there is not enough space available for an xattr then
+it will be automatically written as a directory based xattr.  System attribute
+based xattrs are not accessible on platforms which do not support the
+\fBxattr=sa\fR feature.
+.sp
+The use of system attribute based xattrs is strongly encouraged for users of
+SELinux or Posix ACLs.  Both of these features heavily rely of xattrs and
+benefit significantly from the reduced xattr access time.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzoned\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Controls whether the dataset is managed from a non-global zone. Zones are a Solaris feature and are not relevant on Linux. The default value is \fBoff\fR.
+.RE
+
+.sp
+.LP
+The following three properties cannot be changed after the file system is created, and therefore, should be set when the file system is created. If the properties are not set with the \fBzfs create\fR or \fBzpool create\fR commands, these properties are inherited from the parent dataset. If the parent dataset lacks these properties due to having been created prior to these features being supported, the new file system will have the default values for these properties.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcasesensitivity\fR=\fBsensitive\fR | \fBinsensitive\fR | \fBmixed\fR\fR
+.ad
+.sp .6
+.RS 4n
+Indicates whether the file name matching algorithm used by the file system should be case-sensitive, case-insensitive, or allow a combination of both styles of matching. The default value for the \fBcasesensitivity\fR property is \fBsensitive\fR. Traditionally, UNIX and POSIX file systems have case-sensitive file names.
+.sp
+The \fBmixed\fR value for the \fBcasesensitivity\fR property indicates that the file system can support requests for both case-sensitive and case-insensitive matching behavior. Currently, case-insensitive matching behavior on a file system that supports mixed behavior is limited to the Solaris CIFS server product. For more information about the \fBmixed\fR value behavior, see the \fISolaris ZFS Administration Guide\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBnormalization\fR = \fBnone\fR | \fBformC\fR | \fBformD\fR | \fBformKC\fR | \fBformKD\fR\fR
+.ad
+.sp .6
+.RS 4n
+Indicates whether the file system should perform a \fBunicode\fR normalization of file names whenever two file names are compared, and which normalization algorithm should be used. File names are always stored unmodified, names are normalized as part of any comparison process. If this property is set to a legal value other than \fBnone\fR, and the \fButf8only\fR property was left unspecified, the \fButf8only\fR property is automatically set to \fBon\fR. The default value of the \fBnormalization\fR property is \fBnone\fR. This property cannot be changed after the file system is created.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fButf8only\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Indicates whether the file system should reject file names that include characters that are not present in the \fBUTF-8\fR character code set. If this property is explicitly set to \fBoff\fR, the normalization property must either not be explicitly set or be set to \fBnone\fR. The default value for the \fButf8only\fR property is \fBoff\fR. This property cannot be changed after the file system is created.
+.RE
+
+.sp
+.LP
+The \fBcasesensitivity\fR, \fBnormalization\fR, and \fButf8only\fR properties are also new permissions that can be assigned to non-privileged users by using the \fBZFS\fR delegated administration feature.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR
+.ad
+.sp .6
+.RS 4n
+This flag sets the SELinux context for all files in the filesytem under the mountpoint for that filesystem.  See \fBselinux\fR(8) for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBfscontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR
+.ad
+.sp .6
+.RS 4n
+This flag sets the SELinux context for the filesytem being mounted.  See \fBselinux\fR(8) for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBdefntext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR
+.ad
+.sp .6
+.RS 4n
+This flag sets the SELinux context for unlabeled files.  See \fBselinux\fR(8) for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBrootcontext\fR=\fBSELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level\fR\fR
+.ad
+.sp .6
+.RS 4n
+This flag sets the SELinux context for the root inode of the filesystem.  See \fBselinux\fR(8) for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBoverlay\fR=\fBon\fR | \fBoff\fR\fR
+.ad
+.sp .6
+.RS 4n
+Allow mounting on a busy directory or a directory which already contains files/directories. This is the default mount behavior for Linux filesystems.  However, for consistency with ZFS on other platforms overlay mounts are disabled by default.  Set \fBoverlay=on\fR to enable overlay mounts.
+.RE
+
+.SS "Temporary Mount Point Properties"
+.LP
+When a file system is mounted, either through \fBmount\fR(8) for legacy mounts or the \fBzfs mount\fR command for normal file systems, its mount options are set according to its properties. The correlation between properties and mount options is as follows:
+.sp
+.in +2
+.nf
+    PROPERTY                MOUNT OPTION
+     devices                 devices/nodevices
+     exec                    exec/noexec
+     readonly                ro/rw
+     setuid                  setuid/nosetuid
+     xattr                   xattr/noxattr
+     atime                   atime/noatime
+     relatime                relatime/norelatime
+     nbmand                  nbmand/nonbmand
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+In addition, these options can be set on a per-mount basis using the \fB-o\fR option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The \fB-nosuid\fR option is an alias for \fBnodevices,nosetuid\fR. These properties are reported as "temporary" by the \fBzfs get\fR command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings.
+.SS "User Properties"
+.LP
+In addition to the standard native properties, \fBZFS\fR supports arbitrary user properties. User properties have no effect on \fBZFS\fR behavior, but applications or administrators can use them to annotate datasets (file systems, volumes, and snapshots).
+.sp
+.LP
+User property names must contain a colon (\fB:\fR) character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon (\fB:\fR), dash (\fB-\fR), period (\fB\&.\fR), and underscore (\fB_\fR). The expected convention is that the property name is divided into two portions such as \fImodule\fR\fB:\fR\fIproperty\fR, but this namespace is not enforced by \fBZFS\fR. User property names can be at most 256 characters, and cannot begin with a dash (\fB-\fR).
+.sp
+.LP
+When making programmatic use of user properties, it is strongly suggested to use a reversed \fBDNS\fR domain name for the \fImodule\fR component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. For example, property names beginning with \fBcom.sun\fR. are reserved for use by Oracle Corporation (which acquired Sun Microsystems).
+.sp
+.LP
+The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties (\fBzfs list\fR, \fBzfs get\fR, \fBzfs set\fR, and so forth) can be used to manipulate both native properties and user properties. Use the \fBzfs inherit\fR command to clear a user property . If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 1024 characters.
+.SS "ZFS Volumes as Swap"
+.LP
+\fBZFS\fR volumes may be used as Linux swap devices.  After creating the volume
+with the \fBzfs create\fR command set up and enable the swap area using the
+\fBmkswap\fR(8) and \fBswapon\fR(8) commands.  Do not swap to a file on a
+\fBZFS\fR file system. A \fBZFS\fR swap file configuration is not supported.
+.SH SUBCOMMANDS
+.LP
+All subcommands that modify state are logged persistently to the pool in their original form.
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs ?\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays a help message.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs create\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a new \fBZFS\fR file system. The file system is automatically mounted according to the \fBmountpoint\fR property inherited from the parent.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. Any property specified on the command line using the \fB-o\fR option is ignored. If the target filesystem already exists, the operation completes successfully.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sets the specified property as if the command \fBzfs set\fR \fIproperty\fR=\fIvalue\fR was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs create\fR [\fB-ps\fR] [\fB-b\fR \fIblocksize\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fB-V\fR \fIsize\fR \fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a volume of the given size. The volume is exported as a block device in \fB/dev/zvol/\fR\fIpath\fR, where \fIpath\fR is the name of the volume in the \fBZFS\fR namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created.
+.sp
+\fIsize\fR is automatically rounded up to the nearest 128 Kbytes to ensure that the volume has an integral number of blocks regardless of \fIblocksize\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. Any property specified on the command line using the \fB-o\fR option is ignored. If the target filesystem already exists, the operation completes successfully.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a sparse volume with no reservation. See \fBvolsize\fR in the Native Properties section for more information about sparse volumes.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sets the specified property as if the \fBzfs set\fR \fIproperty\fR=\fIvalue\fR command was invoked at the same time the dataset was created. Any editable \fBZFS\fR property can also be set at creation time. Multiple \fB-o\fR options can be specified. An error results if the same property is specified in multiple \fB-o\fR options.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-b\fR \fIblocksize\fR\fR
+.ad
+.sp .6
+.RS 4n
+Equivalent to \fB-o\fR \fBvolblocksize\fR=\fIblocksize\fR. If this option is specified in conjunction with \fB-o\fR \fBvolblocksize\fR, the resulting behavior is undefined.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzfs destroy\fR [\fB-fnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR
+.ad
+.sp .6
+.RS 4n
+Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents (children or clones).
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively destroy all children.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively destroy all dependents, including cloned file systems outside the target hierarchy.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.sp .6
+.RS 4n
+Force an unmount of any file systems using the \fBunmount -f\fR command. This option has no effect on non-file systems or unmounted file systems.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-n\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do a dry-run ("No-op") deletion.  No data will be deleted.  This is
+useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what
+data would be deleted.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print machine-parsable verbose information about the deleted data.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print verbose information about the deleted data.
+.RE
+.sp
+
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzfs destroy\fR [\fB-dnpRrv\fR] \fIfilesystem\fR|\fIvolume\fR@\fIsnap\fR[%\fIsnap\fR][,...]
+.ad
+.sp .6
+.RS 4n
+The given snapshots are destroyed immediately if and only if the \fBzfs destroy\fR command without the \fB-d\fR option would have destroyed it. Such immediate destruction would occur, for example, if the snapshot had no clones and the user-initiated reference count were zero.
+.sp
+If a snapshot does not qualify for immediate destruction, it is marked for deferred destruction. In this state, it exists as a usable, visible snapshot until both of the preconditions listed above are met, at which point it is destroyed.
+.sp
+An inclusive range of snapshots may be specified by separating the
+first and last snapshots with a percent sign.
+The first and/or last snapshots may be left blank, in which case the
+filesystem's oldest or newest snapshot will be implied.
+.sp
+Multiple snapshots
+(or ranges of snapshots) of the same filesystem or volume may be specified
+in a comma-separated list of snapshots.
+Only the snapshot's short name (the
+part after the \fB@\fR) should be specified when using a range or
+comma-separated list to identify multiple snapshots.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR\fR
+.ad
+.sp .6
+.RS 4n
+Defer snapshot deletion.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Destroy (or mark for deferred destruction) all snapshots with this name in descendent file systems.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively destroy all clones of these snapshots, including the clones,
+snapshots, and children.  If this flag is specified, the \fB-d\fR flag will
+have no effect.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-n\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do a dry-run ("No-op") deletion.  No data will be deleted.  This is
+useful in conjunction with the \fB-v\fR or \fB-p\fR flags to determine what
+data would be deleted.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print machine-parsable verbose information about the deleted data.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print verbose information about the deleted data.
+.RE
+
+.sp
+Extreme care should be taken when applying either the \fB-r\fR or the \fB-R\fR
+options, as they can destroy large portions of a pool and cause unexpected
+behavior for mounted file systems in use.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzfs destroy\fR \fIfilesystem\fR|\fIvolume\fR#\fIbookmark\fR
+.ad
+.sp .6
+.RS 4n
+The given bookmark is destroyed.
+
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBzfs snapshot\fR [\fB-r\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIfilesystem@snapname\fR|\fIvolume@snapname\fR\fR ...
+.ad
+.sp .6
+.RS 4n
+Creates snapshots with the given names. All previous modifications by successful system calls to the file system are part of the snapshots. Snapshots are taken atomically, so that all snapshots correspond to the same moment in time. See the "Snapshots" section for details.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively create snapshots of all descendent datasets.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sets the specified property; see \fBzfs create\fR for details.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs rollback\fR [\fB-rRf\fR] \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Roll back the given dataset to a previous snapshot. When a dataset is rolled back, all data that has changed since the snapshot is discarded, and the dataset reverts to the state at the time of the snapshot. By default, the command refuses to roll back to a snapshot other than the most recent one. In order to do so, all intermediate snapshots and bookmarks must be destroyed by specifying the \fB-r\fR option.
+.sp
+The \fB-rR\fR options do not recursively destroy the child snapshots of a recursive snapshot. Only direct snapshots of the specified filesystem are destroyed by either of these options. To completely roll back a recursive snapshot, you must rollback the individual child snapshots.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Destroy any snapshots and bookmarks more recent than the one specified.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively destroy any more recent snapshots and bookmarks, as well as any clones of those snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.sp .6
+.RS 4n
+Used with the \fB-R\fR option to force an unmount of any clone file systems that are to be destroyed.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs clone\fR [\fB-p\fR] [\fB-o\fR \fIproperty\fR=\fIvalue\fR] ... \fIsnapshot\fR \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a clone of the given snapshot. See the "Clones" section for details. The target dataset can be located anywhere in the \fBZFS\fR hierarchy, and is created as the same type as the original.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent. If the target filesystem or volume already exists, the operation completes successfully.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIproperty\fR=\fIvalue\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sets the specified property; see \fBzfs create\fR for details.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs promote\fR \fIclone-filesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Promotes a clone file system to no longer be dependent on its "origin" snapshot. This makes it possible to destroy the file system that the clone was created from. The clone parent-child dependency relationship is reversed, so that the origin file system becomes a clone of the specified file system. 
+.sp
+The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the origin file system to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The \fBrename\fR subcommand can be used to rename any conflicting snapshots.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs rename\fR [\fB-f\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.br
+.na
+\fB\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs rename\fR [\fB-fp\fR] \fIfilesystem\fR|\fIvolume\fR \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Renames the given dataset. The new target can be located anywhere in the \fBZFS\fR hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the \fBmountpoint\fR property inherited from their parent.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-f\fR\fR
+.ad
+.sp .6
+.RS 4n
+Force unmount any filesystems that need to be unmounted in the process.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs rename\fR \fB-r\fR \fIsnapshot\fR \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively rename the snapshots of all descendent datasets. Snapshots are the only dataset that can be renamed recursively.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs\fR \fBlist\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIproperty\fR[,\fI\&...\fR]] [ \fB-t\fR \fItype\fR[,\fI\&...\fR]] [ \fB-s\fR \fIproperty\fR ] ... [ \fB-S\fR \fIproperty\fR ] ... [\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR] ...\fR
+.ad
+.sp .6
+.RS 4n
+Lists the property information for the given datasets in tabular form. If specified, you can list property information by the absolute pathname or the relative pathname. By default, all file systems and volumes are displayed. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR (the default is \fBoff\fR). When listing hundreds or thousands of snapshots performance can be improved by restricting the output to only the name.  In that case, it is recommended to use \fB-o name -s name\fR. The following fields are displayed by default, \fBname,used,available,referenced,mountpoint\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.sp .6
+.RS 4n
+Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.sp .6
+.RS 4n
+Display numbers in parsable (exact) values.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively display any children of the dataset on the command line. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR \fIdepth\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively display any children of the dataset, limiting the recursion to \fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct children.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIproperty\fR\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of properties to display. The property must be:
+.RS +4
+.TP
+.ie t \(bu
+.el o
+One of the properties described in the "Native Properties" section
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+A user property
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+The value \fBname\fR to display the dataset name
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+The value \fBspace\fR to display space usage properties on file systems and volumes. This is a shortcut for specifying \fB-o name,avail,used,usedsnap,usedds,usedrefreserv,usedchild\fR \fB-t filesystem,volume\fR syntax.
+.RE
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR \fIproperty\fR\fR
+.ad
+.sp .6
+.RS 4n
+A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the "Properties" section, or the special value \fBname\fR to sort by the dataset name. Multiple properties can be specified at one time using multiple \fB-s\fR property options. Multiple \fB-s\fR options are evaluated from left to right in decreasing order of importance.
+.sp
+The following is a list of sorting criteria:
+.RS +4
+.TP
+.ie t \(bu
+.el o
+Numeric types sort in numeric order.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+String types sort in alphabetical order.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+Types inappropriate for a row sort that row to the literal bottom, regardless of the specified ordering.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+If no sorting options are specified the existing behavior of \fBzfs list\fR is preserved.
+.RE
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-S\fR \fIproperty\fR\fR
+.ad
+.sp .6
+.RS 4n
+Same as the \fB-s\fR option, but sorts by property in descending order. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-t\fR \fItype\fR\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of types to display, where \fItype\fR is one of \fBfilesystem\fR, \fBsnapshot\fR, \fBsnap\fR, \fBvolume\fR, \fBbookmark\fR, or \fBall\fR. For example, specifying \fB-t snapshot\fR displays only snapshots.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs set\fR \fIproperty\fR=\fIvalue\fR[ \fIproperty\fR=\fIvalue\fR]...
+\fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Sets the property or list of properties to the given value(s) for each dataset.
+Only some properties can be edited. See the "Properties" section for more
+information on what properties can be set and acceptable values. Numeric values
+can be specified as exact values, or in a human-readable form with a suffix of
+\fBB\fR, \fBK\fR, \fBM\fR, \fBG\fR, \fBT\fR, \fBP\fR, \fBE\fR, \fBZ\fR (for
+bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or
+zettabytes, respectively). User properties can be set on snapshots. For more
+information, see the "User Properties" section.
+.RE
+
+.sp
+.ne 2
+.mk .na
+\fB\fBzfs get\fR [\fB-r\fR|\fB-d\fR \fIdepth\fR] [\fB-Hp\fR] [\fB-o\fR \fIfield\fR[,...] [\fB-t\fR \fItype\fR[,...]] [\fB-s\fR \fIsource\fR[,...] "\fIall\fR" | \fIproperty\fR[,...] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed:
+.sp
+.in +2
+.nf
+    name      Dataset name
+     property  Property name
+     value     Property value
+     source    Property source. Can either be local, default,
+               temporary, inherited, received, or none (-).
+.fi
+.in -2
+.sp
+
+All columns are displayed by default, though this can be controlled by using the \fB-o\fR option. This command takes a comma-separated list of properties as described in the "Native Properties" and "User Properties" sections.
+.sp
+The special value \fBall\fR can be used to display all properties that apply to the given dataset's type (filesystem, volume snapshot, or bookmark).
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively display properties for any children.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR \fIdepth\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively display any children of the dataset, limiting the recursion to \fIdepth\fR. A depth of \fB1\fR will display only the dataset and its direct children.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.sp .6
+.RS 4n
+Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIfield\fR\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of columns to display. \fBname,property,value,source\fR is the default value. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR \fIsource\fR\fR
+.ad
+.sp .6
+.RS 4n
+A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: \fBlocal,default,inherited,received,temporary,none\fR. The default value is all sources.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Display numbers in parsable (exact) values.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs inherit\fR [\fB-rS\fR] \fIproperty\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR ...\fR
+.ad
+.sp .6
+.RS 4n
+Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the \fB-S\fR option reverted to the received value if one exists.  See the "Properties" section for a listing of default values, and details on which properties can be inherited.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively inherit the given property for all children.
+.RE
+.sp
+.ne 2
+.na
+\fB\fB-S\fR\fR
+.ad
+.sp .6
+.RS 4n
+Revert the property to the received value if one exists; otherwise operate as
+if the \fB-S\fR option was not specified.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs upgrade\fR [\fB-v\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Displays a list of file systems that are not the most recent version.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs upgrade\fR [\fB-r\fR] [\fB-V\fR \fIversion\fR] [\fB-a\fR | \fIfilesystem\fR]\fR
+.ad
+.sp .6
+.RS 4n
+Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of the software. \fBzfs send\fR streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of the software.
+.sp
+In general, the file system version is independent of the pool version. See \fBzpool\fR(8) for information on the \fBzpool upgrade\fR command. 
+.sp
+In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Upgrade all file systems on all imported pools.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Upgrade the specified file system. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Upgrade the specified file system and all descendent file systems 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-V\fR \fIversion\fR\fR
+.ad
+.sp .6
+.RS 4n
+Upgrade to the specified \fIversion\fR. If the \fB-V\fR flag is not specified, this command upgrades to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this software.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzfs\fR \fBuserspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]]
+[\fB-s\fR \fIfield\fR] ...
+[\fB-S\fR \fIfield\fR] ...
+[\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR
+.ad
+.sp .6
+.RS 4n
+Displays space consumed by, and quotas on, each user in the specified
+filesystem or snapshot. This corresponds to the \fBuserused@\fR\fIuser\fR and
+\fBuserquota@\fR\fIuser\fR properties.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-n\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print numeric ID instead of user/group name.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-H\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do not print headers, use tab-delimited output.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Use exact (parsable) numeric output.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIfield\fR[,...]\fR
+.ad
+.sp .6
+.RS 4n
+Display only the specified fields from the following
+set: \fBtype, name, used, quota\fR. The default is to display all fields.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-s\fR \fIfield\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sort output by this field. The \fIs\fR and \fIS\fR flags may be specified
+multiple times to sort first by one field, then by another. The default is
+\fB-s type\fR \fB-s name\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-S\fR \fIfield\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sort by this field in reverse order. See \fB-s\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-t\fR \fItype\fR[,...]\fR
+.ad
+.sp .6
+.RS 4n
+Print only the specified types from the following
+set: \fBall, posixuser, smbuser, posixgroup, smbgroup\fR. The default
+is \fB-t posixuser,smbuser\fR. The default can be changed to include group
+types.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-i\fR\fR
+.ad
+.sp .6
+.RS 4n
+Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists.
+Normal POSIX interfaces (for example, \fBstat\fR(2), \fBls\fR \fB-l\fR) perform
+this translation, so the \fB-i\fR option allows the output from \fBzfs
+userspace\fR to be compared directly with those utilities. However, \fB-i\fR
+may lead to confusion if some files were created by an SMB user before a
+SMB-to-POSIX name mapping was established. In such a case, some files will be owned
+by the SMB entity and some by the POSIX entity. However, the \fB-i\fR option
+will report that the POSIX entity has the total usage and quota for both.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fBzfs\fR \fBgroupspace\fR [\fB-Hinp\fR] [\fB-o\fR \fIfield\fR[,...]]
+[\fB-s\fR \fIfield\fR] ...
+[\fB-S\fR \fIfield\fR] ...
+[\fB-t\fR \fItype\fR[,...]] \fIfilesystem\fR|\fIsnapshot\fR
+.ad
+.sp .6
+.RS 4n
+Displays space consumed by, and quotas on, each group in the specified
+filesystem or snapshot. This subcommand is identical to \fBzfs userspace\fR,
+except that the default types to display are \fB-t posixgroup,smbgroup\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs mount\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays all \fBZFS\fR file systems currently mounted.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs mount\fR [\fB-vO\fR] [\fB-o\fR \fIoptions\fR] \fB-a\fR | \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Mounts \fBZFS\fR file systems. Invoked automatically as part of the boot process.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-o\fR \fIoptions\fR\fR
+.ad
+.sp .6
+.RS 4n
+An optional, comma-separated list of mount options to use temporarily for the
+duration of the mount. See the "Temporary Mount Point Properties" section for
+details.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-O\fR\fR
+.ad
+.sp .6
+.RS 4n
+Perform an overlay mount. See \fBmount\fR(8) for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Report mount progress.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Mount all available \fBZFS\fR file systems. Invoked automatically as part of
+the boot process.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Mount the specified filesystem.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unmount\fR [\fB-f\fR] \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unmounts currently mounted \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-f\fR\fR
+.ad
+.sp .6
+.RS 4n
+Forcefully unmount the file system, even if it is currently in use.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unmount all available \fBZFS\fR file systems. Invoked automatically as part of the shutdown process.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unmount the specified filesystem. The command can also be given a path to a \fBZFS\fR file system mount point on the system.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs share\fR \fB-a\fR | \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Shares available \fBZFS\fR file systems. 
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Share all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Share the specified filesystem according to the \fBsharenfs\fR and \fBsharesmb\fR properties. File systems are shared when the \fBsharenfs\fR or \fBsharesmb\fR property is set.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unshare\fR \fB-a\fR | \fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unshares currently shared \fBZFS\fR file systems. This is invoked automatically as part of the shutdown process.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-a\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unshare all available \fBZFS\fR file systems. Invoked automatically as part of the boot process. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fIfilesystem\fR|\fImountpoint\fR\fR
+.ad
+.sp .6
+.RS 4n
+Unshare the specified filesystem. The command can also be given a path to a \fBZFS\fR file system shared on the system.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs bookmark\fR \fIsnapshot\fR \fIbookmark\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a bookmark of the given snapshot.  Bookmarks mark the point in time
+when the snapshot was created, and can be used as the incremental source for
+a \fBzfs send\fR command.
+.sp
+This feature must be enabled to be used.
+See \fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBbookmarks\fR feature.
+.RE
+
+
+.RE
+.sp
+.ne 2
+.na
+\fBzfs send\fR [\fB-DnPpRveL\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+.ad
+.sp .6
+.RS 4n
+Creates a stream representation of the second \fIsnapshot\fR, which is written to standard output. The output can be redirected to a file or to a different system (for example, using \fBssh\fR(1). By default, a full stream is generated.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-i\fR \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate an incremental stream from the first \fIsnapshot\fR (the incremental source) to the second \fIsnapshot\fR (the incremental target).  The incremental source can be specified as the last component of the snapshot name (the \fB@\fR character and following) and it is assumed to be from the same file system as the incremental target.
+.sp
+If the destination is a clone, the source may be the origin snapshot, which must be fully specified (for example, \fBpool/fs@origin\fR, not just \fB@origin\fR).
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-I\fR \fIsnapshot\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, \fB-I @a fs@d\fR is similar to \fB-i @a fs@b; -i @b fs@c; -i @c fs@d\fR. The incremental source may be specified as with the \fB-i\fR option.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-R\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a replication stream package, which will replicate the specified filesystem, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved.
+.sp
+If the \fB-i\fR or \fB-I\fR flags are used in conjunction with the \fB-R\fR flag, an incremental replication stream is generated. The current values of properties, and current snapshot and file system names are set when the stream is received. If the \fB-F\fR flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-D\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a deduplicated stream. Blocks which would have been sent multiple times in the send stream will only be sent once. The receiving system must also support this feature to receive a deduplicated stream.  This flag can be used regardless of the dataset's dedup  property, but performance will be much better if the filesystem uses a dedup-capable checksum (eg.  sha256).
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB.  This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fRrecordsize\fR property of this filesystem has never been set above
+128KB.  The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well.  See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature.  This flag has no effect if the \fBembedded_data\fR feature is
+disabled.  The receiving system must have the \fBembedded_data\fR feature
+enabled.  If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-p\fR\fR
+.ad
+.sp .6
+.RS 4n
+Include the dataset's properties in the stream.  This flag is implicit when -R is specified.  The receiving system must also support this feature.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-n\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do a dry-run ("No-op") send.  Do not generate any actual send data.  This is
+useful in conjunction with the \fB-v\fR or \fB-P\fR flags to determine what
+data will be sent.  In this case, the verbose output will be written to
+standard output (contrast with a non-dry-run, where the stream is written
+to standard output and the verbose output goes to standard error).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-P\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print machine-parsable verbose information about the stream package generated.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print verbose information about the stream package generated.  This information
+includes a per-second report of how much data has been sent.
+.RE
+
+The format of the stream is committed. You will be able to receive your streams on future versions of \fBZFS\fR.
+.RE
+
+.RE
+.sp
+.ne 2
+.na
+\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.ad
+.sp .6
+.RS 4n
+Generate a send stream, which may be of a filesystem, and may be
+incremental from a bookmark.  If the destination is a filesystem or volume,
+the pool must be read-only, or the filesystem must not be mounted.  When the
+stream generated from a filesystem or volume is received, the default snapshot
+name will be "--head--".
+
+.sp
+.ne 2
+.na
+\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR
+.ad
+.sp .6
+.RS 4n
+Generate an incremental send stream.  The incremental source must be an earlier
+snapshot in the destination's history. It will commonly be an earlier
+snapshot in the destination's filesystem, in which case it can be
+specified as the last component of the name (the \fB#\fR or \fB@\fR character
+and following).
+.sp
+If the incremental target is a clone, the incremental source can
+be the origin snapshot, or an earlier snapshot in the origin's filesystem,
+or the origin's origin, etc.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-L\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a stream which may contain blocks larger than 128KB.  This flag
+has no effect if the \fBlarge_blocks\fR pool feature is disabled, or if
+the \fRrecordsize\fR property of this filesystem has never been set above
+128KB.  The receiving system must have the \fBlarge_blocks\fR pool feature
+enabled as well.  See \fBzpool-features\fR(5) for details on ZFS feature
+flags and the \fBlarge_blocks\fR feature.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature.  This flag has no effect if the \fBembedded_data\fR feature is
+disabled.  The receiving system must have the \fBembedded_data\fR feature
+enabled.  If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
+
+.RE
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs receive\fR [\fB-vnFu\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] \fIfilesystem\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. Streams are created using the \fBzfs send\fR subcommand, which by default creates a full stream. \fBzfs recv\fR can be used as an alias for \fBzfs receive\fR.
+.sp
+If an incremental stream is received, then the destination file system must already exist, and its most recent snapshot must match the incremental stream's source. For \fBzvols\fR, the destination device link is destroyed and recreated, which means the \fBzvol\fR cannot be accessed during the \fBreceive\fR operation.
+.sp
+When a snapshot replication package stream that is generated by using the \fBzfs send\fR \fB-R\fR command is  received, any snapshots that do not exist on the sending location are destroyed by using the \fBzfs destroy\fR \fB-d\fR command.
+.sp
+The name of the snapshot (and file system, if a full stream is received) that this subcommand creates depends on the argument type and the use of the \fB-d\fR or \fB-e\fR options.
+.sp
+If the argument is a snapshot name, the specified \fIsnapshot\fR is created. If the argument is a file system or volume name, a snapshot with the same name as the sent snapshot is created within the specified \fIfilesystem\fR or \fIvolume\fR.  If neither of the \fB-d\fR or \fB-e\fR options are specified, the provided target snapshot name is used exactly as provided.
+.sp
+The \fB-d\fR and \fB-e\fR options cause the file system name of the target snapshot to be determined by appending a portion of the sent snapshot's name to the specified target \fIfilesystem\fR. If the \fB-d\fR option is specified, all but the first element of the sent snapshot's file system path (usually the pool name) is used and any required intermediate file systems within the specified one are created.  If the \fB-e\fR option is specified, then only the last element of the sent snapshot's file system name (i.e. the name of the source file system itself) is used as the target file system name.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-d\fR\fR
+.ad
+.sp .6
+.RS 4n
+Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-u\fR\fR
+.ad
+.sp .6
+.RS 4n
+File system that is associated with the received stream is not mounted.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-v\fR\fR
+.ad
+.sp .6
+.RS 4n
+Print verbose information about the stream and the time required to perform the receive operation.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-n\fR\fR
+.ad
+.sp .6
+.RS 4n
+Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-F\fR\fR
+.ad
+.sp .6
+.RS 4n
+Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs allow\fR \fIfilesystem\fR | \fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Displays permissions that have been delegated on the specified filesystem or volume. See the other forms of \fBzfs allow\fR for more information.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs allow\fR [\fB-ldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR| \fIvolume\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs allow\fR [\fB-ld\fR] \fB-e\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR | \fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Delegates \fBZFS\fR administration permission for the file systems to non-privileged users.
+.sp
+.ne 2
+.mk
+.na
+\fB[\fB-ug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...]\fR
+.ad
+.sp .6
+.RS 4n
+Specifies to whom the permissions are delegated. Multiple entities can be specified as a comma-separated list. If neither of the \fB-ug\fR options are specified, then the argument is interpreted preferentially as the keyword "everyone", then as a user name, and lastly as a group name. To specify a user or group named "everyone", use the \fB-u\fR or \fB-g\fR options. To specify a group with the same name as a user, use the \fB-g\fR options.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB[\fB-e\fR] \fIperm\fR|@\fIsetname\fR[,...]\fR
+.ad
+.sp .6
+.RS 4n
+Specifies that the permissions be delegated to "everyone." Multiple permissions may be specified as a comma-separated list. Permission names are the same as \fBZFS\fR subcommand and property names. See the property list below. Property set names, which begin with an at sign (\fB@\fR) , may be specified. See the \fB-s\fR form below for details.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB[\fB-ld\fR] \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies where the permissions are delegated. If neither of the \fB-ld\fR options are specified, or both are, then the permissions are allowed for the file system or volume, and all of its descendents. If only the \fB-l\fR option is used, then is allowed "locally" only for the specified file system. If only the \fB-d\fR option is used, then is allowed only for the descendent file systems.
+.RE
+
+.RE
+
+.sp
+.LP
+Permissions are generally the ability to use a \fBZFS\fR subcommand or change a \fBZFS\fR property. The following permissions are available:
+.sp
+.in +2
+.nf
+NAME             TYPE           NOTES
+allow            subcommand     Must also have the permission that is being
+                                allowed
+clone            subcommand     Must also have the 'create' ability and 'mount'
+                                ability in the origin file system
+create           subcommand     Must also have the 'mount' ability
+destroy          subcommand     Must also have the 'mount' ability
+diff             subcommand     Allows lookup of paths within a dataset
+                                given an object number, and the ability to
+                                create snapshots necessary to 'zfs diff'.
+mount            subcommand     Allows mount/umount of ZFS datasets
+promote          subcommand     Must also have the 'mount'
+                                and 'promote' ability in the origin file system
+receive          subcommand     Must also have the 'mount' and 'create' ability
+rename           subcommand     Must also have the 'mount' and 'create'
+                                ability in the new parent
+rollback         subcommand     Must also have the 'mount' ability
+send             subcommand     
+share            subcommand     Allows sharing file systems over NFS or SMB
+                                protocols
+snapshot         subcommand     Must also have the 'mount' ability
+groupquota       other          Allows accessing any groupquota@... property
+groupused        other          Allows reading any groupused@... property
+userprop         other          Allows changing any user property
+userquota        other          Allows accessing any userquota@... property
+userused         other          Allows reading any userused@... property
+
+acltype          property
+aclinherit       property       
+atime            property       
+canmount         property       
+casesensitivity  property       
+checksum         property       
+compression      property       
+copies           property       
+dedup            property
+devices          property       
+exec             property       
+filesystem_limit property
+logbias          property
+mlslabel         property
+mountpoint       property       
+nbmand           property       
+normalization    property       
+primarycache     property       
+quota            property       
+readonly         property       
+recordsize       property       
+refquota         property       
+refreservation   property       
+reservation      property       
+secondarycache   property       
+setuid           property       
+sharenfs         property       
+sharesmb         property       
+snapdir          property       
+snapshot_limit   property
+utf8only         property       
+version          property       
+volblocksize     property       
+volsize          property       
+vscan            property       
+xattr            property       
+zoned            property       
+.fi
+.in -2
+.sp
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs allow\fR \fB-c\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Sets "create time" permissions. These permissions are granted (locally) to the creator of any newly-created descendent file system.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs allow\fR \fB-s\fR @\fIsetname\fR \fIperm\fR|@\fIsetname\fR[,...] \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Defines or adds permissions to a permission set. The set can be used by other \fBzfs allow\fR commands for the specified file system and its descendents. Sets are evaluated dynamically, so changes to a set are immediately reflected. Permission sets follow the same naming restrictions as ZFS file systems, but the name must begin with an "at sign" (\fB@\fR), and can be no more than 64 characters long.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unallow\fR [\fB-rldug\fR] "\fIeveryone\fR"|\fIuser\fR|\fIgroup\fR[,...] [\fIperm\fR|@\fIsetname\fR[, ...]] \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs unallow\fR [\fB-rld\fR] \fB-e\fR [\fIperm\fR|@\fIsetname\fR [,...]] \fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.br
+.na
+\fB\fBzfs unallow\fR [\fB-r\fR] \fB-c\fR [\fIperm\fR|@\fIsetname\fR[,...]]\fR
+.ad
+.br
+.na
+\fB\fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Removes permissions that were granted with the \fBzfs allow\fR command. No permissions are explicitly denied, so other permissions granted are still in effect. For example, if the permission is granted by an ancestor. If no permissions are specified, then all permissions for the specified \fIuser\fR, \fIgroup\fR, or \fIeveryone\fR are removed. Specifying "everyone" (or using the \fB-e\fR option) only removes the permissions that were granted to "everyone", not all permissions for every user and group. See the \fBzfs allow\fR command for a description of the \fB-ldugec\fR options.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively remove the permissions from this file system and all descendents.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs unallow\fR [\fB-r\fR] \fB-s\fR @\fIsetname\fR [\fIperm\fR|@\fIsetname\fR[,...]]\fR
+.ad
+.br
+.na
+\fB\fIfilesystem\fR|\fIvolume\fR\fR
+.ad
+.sp .6
+.RS 4n
+Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs hold\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Adds a single reference, named with the \fItag\fR argument, to the specified snapshot or snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space.
+.sp
+If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs holds\fR [\fB-r\fR] \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Lists all existing user references for the given snapshot or snapshots.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Lists the holds that are set on the named descendent snapshots, in addition to listing the holds on the named snapshot.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs release\fR [\fB-r\fR] \fItag\fR \fIsnapshot\fR...\fR
+.ad
+.sp .6
+.RS 4n
+Removes a single reference, named with the \fItag\fR argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot.
+.sp
+If a hold exists on a snapshot, attempts to destroy that snapshot by using the \fBzfs destroy\fR command return \fBEBUSY\fR.
+.sp
+.ne 2
+.mk
+.na
+\fB\fB-r\fR\fR
+.ad
+.sp .6
+.RS 4n
+Recursively releases a hold with the given tag on the snapshots of all descendent file systems.
+.RE
+
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fBzfs diff\fR [\fB-FHt\fR] \fIsnapshot\fR \fIsnapshot|filesystem\fR
+.ad
+.sp .6
+.RS 4n
+Display the difference between a snapshot of a given filesystem and another
+snapshot of that filesystem from a later time or the current contents of the
+filesystem.  The first column is a character indicating the type of change,
+the other columns indicate pathname, new pathname (in case of rename), change
+in link count, and optionally file type and/or change time.
+
+The types of change are:
+.in +2
+.nf
+-       The path has been removed
++       The path has been created
+M       The path has been modified
+R       The path has been renamed
+.fi
+.in -2
+.sp
+.ne 2
+.na
+\fB-F\fR
+.ad
+.sp .6
+.RS 4n
+Display an indication of the type of file, in a manner similar to the \fB-F\fR
+option of \fBls\fR(1).
+.in +2
+.nf
+B       Block device
+C       Character device
+/       Directory
+>       Door
+|       Named pipe
+@       Symbolic link
+P       Event port
+=       Socket
+F       Regular file
+.fi
+.in -2
+.RE
+.sp
+.ne 2
+.na
+\fB-H\fR
+.ad
+.sp .6
+.RS 4n
+Give more parsable tab-separated output, without header lines and without arrows.
+.RE
+.sp
+.ne 2
+.na
+\fB-t\fR
+.ad
+.sp .6
+.RS 4n
+Display the path's inode change time as the first column of output.
+.RE
+
+.SH EXAMPLES
+.LP
+\fBExample 1 \fRCreating a ZFS File System Hierarchy
+.sp
+.LP
+The following commands create a file system named \fBpool/home\fR and a file system named \fBpool/home/bob\fR. The mount point \fB/export/home\fR is set for the parent file system, and is automatically inherited by the child file system.
+
+.sp
+.in +2
+.nf
+# \fBzfs create pool/home\fR
+# \fBzfs set mountpoint=/export/home pool/home\fR
+# \fBzfs create pool/home/bob\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 2 \fRCreating a ZFS Snapshot
+.sp
+.LP
+The following command creates a snapshot named \fByesterday\fR. This snapshot is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of the \fBpool/home/bob\fR file system.
+
+.sp
+.in +2
+.nf
+# \fBzfs snapshot pool/home/bob@yesterday\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 3 \fRCreating and Destroying Multiple Snapshots
+.sp
+.LP
+The following command creates snapshots named \fByesterday\fR of \fBpool/home\fR and all of its descendent file systems. Each snapshot is mounted on demand in the \fB\&.zfs/snapshot\fR directory at the root of its file system. The second command destroys the newly created snapshots.
+
+.sp
+.in +2
+.nf
+# \fBzfs snapshot -r pool/home@yesterday\fR
+# \fBzfs destroy -r pool/home@yesterday\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 4 \fRDisabling and Enabling File System Compression
+.sp
+.LP
+The following command disables the \fBcompression\fR property for all file systems under \fBpool/home\fR. The next command explicitly enables \fBcompression\fR for \fBpool/home/anne\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs set compression=off pool/home\fR
+# \fBzfs set compression=on pool/home/anne\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 5 \fRListing ZFS Datasets
+.sp
+.LP
+The following command lists all active file systems and volumes in the system. Snapshots are displayed if the \fBlistsnaps\fR property is \fBon\fR. The default is \fBoff\fR. See \fBzpool\fR(8) for more information on pool properties.
+
+.sp
+.in +2
+.nf
+# \fBzfs list\fR
+   NAME                      USED  AVAIL  REFER  MOUNTPOINT
+   pool                      450K   457G    18K  /pool
+   pool/home                 315K   457G    21K  /export/home
+   pool/home/anne             18K   457G    18K  /export/home/anne
+   pool/home/bob             276K   457G   276K  /export/home/bob
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 6 \fRSetting a Quota on a ZFS File System
+.sp
+.LP
+The following command sets a quota of 50 Gbytes for \fBpool/home/bob\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs set quota=50G pool/home/bob\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 7 \fRListing ZFS Properties
+.sp
+.LP
+The following command lists all properties for \fBpool/home/bob\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs get all pool/home/bob\fR
+NAME           PROPERTY              VALUE                  SOURCE
+pool/home/bob  type                  filesystem             -
+pool/home/bob  creation              Tue Jul 21 15:53 2009  -
+pool/home/bob  used                  21K                    -
+pool/home/bob  available             20.0G                  -
+pool/home/bob  referenced            21K                    -
+pool/home/bob  compressratio         1.00x                  -
+pool/home/bob  mounted               yes                    -
+pool/home/bob  quota                 20G                    local
+pool/home/bob  reservation           none                   default
+pool/home/bob  recordsize            128K                   default
+pool/home/bob  mountpoint            /pool/home/bob         default
+pool/home/bob  sharenfs              off                    default
+pool/home/bob  checksum              on                     default
+pool/home/bob  compression           on                     local
+pool/home/bob  atime                 on                     default
+pool/home/bob  devices               on                     default
+pool/home/bob  exec                  on                     default
+pool/home/bob  setuid                on                     default
+pool/home/bob  readonly              off                    default
+pool/home/bob  zoned                 off                    default
+pool/home/bob  snapdir               hidden                 default
+pool/home/bob  acltype               off                    default
+pool/home/bob  aclinherit            restricted             default
+pool/home/bob  canmount              on                     default
+pool/home/bob  xattr                 on                     default
+pool/home/bob  copies                1                      default
+pool/home/bob  version               4                      -
+pool/home/bob  utf8only              off                    -
+pool/home/bob  normalization         none                   -
+pool/home/bob  casesensitivity       sensitive              -
+pool/home/bob  vscan                 off                    default
+pool/home/bob  nbmand                off                    default
+pool/home/bob  sharesmb              off                    default
+pool/home/bob  refquota              none                   default
+pool/home/bob  refreservation        none                   default
+pool/home/bob  primarycache          all                    default
+pool/home/bob  secondarycache        all                    default
+pool/home/bob  usedbysnapshots       0                      -
+pool/home/bob  usedbydataset         21K                    -
+pool/home/bob  usedbychildren        0                      -
+pool/home/bob  usedbyrefreservation  0                      -
+pool/home/bob  logbias               latency                default
+pool/home/bob  dedup                 off                    default
+pool/home/bob  mlslabel              none                   default
+pool/home/bob  relatime              off                    default
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+The following command gets a single property value.
+
+.sp
+.in +2
+.nf
+# \fBzfs get -H -o value compression pool/home/bob\fR
+on
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+The following command lists all properties with local settings for \fBpool/home/bob\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs get -r -s local -o name,property,value all pool/home/bob\fR
+NAME           PROPERTY              VALUE
+pool/home/bob  quota                 20G
+pool/home/bob  compression           on
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 8 \fRRolling Back a ZFS File System
+.sp
+.LP
+The following command reverts the contents of \fBpool/home/anne\fR to the snapshot named \fByesterday\fR, deleting all intermediate snapshots.
+
+.sp
+.in +2
+.nf
+# \fBzfs rollback -r pool/home/anne@yesterday\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 9 \fRCreating a ZFS Clone
+.sp
+.LP
+The following command creates a writable file system whose initial contents are the same as \fBpool/home/bob@yesterday\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs clone pool/home/bob@yesterday pool/clone\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 10 \fRPromoting a ZFS Clone
+.sp
+.LP
+The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming:
+
+.sp
+.in +2
+.nf
+# \fBzfs create pool/project/production\fR
+  populate /pool/project/production with data
+# \fBzfs snapshot pool/project/production@today\fR
+# \fBzfs clone pool/project/production@today pool/project/beta\fR
+make changes to /pool/project/beta and test them
+# \fBzfs promote pool/project/beta\fR
+# \fBzfs rename pool/project/production pool/project/legacy\fR
+# \fBzfs rename pool/project/beta pool/project/production\fR
+once the legacy version is no longer needed, it can be destroyed
+# \fBzfs destroy pool/project/legacy\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 11 \fRInheriting ZFS Properties
+.sp
+.LP
+The following command causes \fBpool/home/bob\fR and \fBpool/home/anne\fR to inherit the \fBchecksum\fR property from their parent.
+
+.sp
+.in +2
+.nf
+# \fBzfs inherit checksum pool/home/bob pool/home/anne\fR
+.fi
+.in -2
+.sp
+.LP
+The following command causes \fBpool/home/bob\fR to revert to the received
+value for the \fBquota\fR property if it exists.
+
+.sp
+.in +2
+.nf
+# \fBzfs inherit -S quota pool/home/bob
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 12 \fRRemotely Replicating ZFS Data
+.sp
+.LP
+The following commands send a full stream and then an incremental stream to a remote machine, restoring them into \fBpoolB/received/fs@a\fRand \fBpoolB/received/fs@b\fR, respectively. \fBpoolB\fR must contain the file system \fBpoolB/received\fR, and must not initially contain \fBpoolB/received/fs\fR.
+
+.sp
+.in +2
+.nf
+# \fBzfs send pool/fs@a | \e\fR
+   \fBssh host zfs receive poolB/received/fs@a\fR
+# \fBzfs send -i a pool/fs@b | ssh host \e\fR
+   \fBzfs receive poolB/received/fs\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 13 \fRUsing the \fBzfs receive\fR \fB-d\fR Option
+.sp
+.LP
+The following command sends a full stream of \fBpoolA/fsA/fsB@snap\fR to a remote machine, receiving it into \fBpoolB/received/fsA/fsB@snap\fR. The \fBfsA/fsB@snap\fR portion of the received snapshot's name is determined from the name of the sent snapshot. \fBpoolB\fR must contain the file system \fBpoolB/received\fR. If \fBpoolB/received/fsA\fR does not exist, it is created as an empty file system.
+
+.sp
+.in +2
+.nf
+# \fBzfs send poolA/fsA/fsB@snap | \e
+   ssh host zfs receive -d poolB/received\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 14 \fRSetting User Properties
+.sp
+.LP
+The following example sets the user-defined \fBcom.example:department\fR property for a dataset.
+
+.sp
+.in +2
+.nf
+# \fBzfs set com.example:department=12345 tank/accounting\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 15 \fRPerforming a Rolling Snapshot
+.sp
+.LP
+The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows:
+
+.sp
+.in +2
+.nf
+# \fBzfs destroy -r pool/users@7daysago\fR
+# \fBzfs rename -r pool/users@6daysago @7daysago\fR
+# \fBzfs rename -r pool/users@5daysago @6daysago\fR
+# \fBzfs rename -r pool/users@4daysago @5daysago\fR
+# \fBzfs rename -r pool/users@3daysago @4daysago\fR
+# \fBzfs rename -r pool/users@2daysago @3daysago\fR
+# \fBzfs rename -r pool/users@yesterday @2daysago\fR
+# \fBzfs rename -r pool/users@today @yesterday\fR
+# \fBzfs snapshot -r pool/users@today\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 16 \fRSetting \fBsharenfs\fR Property Options on a ZFS File System
+.sp
+.LP
+The following commands show how to set \fBsharenfs\fR property options to enable \fBrw\fR access for a set of \fBIP\fR addresses and to enable root access for system \fBneo\fR on the \fBtank/home\fR file system.
+
+.sp
+.in +2
+.nf
+# \fBzfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home\fR
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+If you are using \fBDNS\fR for host name resolution, specify the fully qualified hostname.
+
+.LP
+\fBExample 17 \fRDelegating ZFS Administration Permissions on a ZFS Dataset
+.sp
+.LP
+The following example shows how to set permissions so that user \fBcindys\fR can create, destroy, mount, and take snapshots on \fBtank/cindys\fR. The permissions on \fBtank/cindys\fR are also displayed.
+
+.sp
+.in +2
+.nf
+# \fBzfs allow cindys create,destroy,mount,snapshot tank/cindys\fR
+# \fBzfs allow tank/cindys\fR
+-------------------------------------------------------------
+Local+Descendent permissions on (tank/cindys)
+          user cindys create,destroy,mount,snapshot
+-------------------------------------------------------------
+.fi
+.in -2
+.sp
+
+.sp
+.LP
+Because the \fBtank/cindys\fR mount point permission is set to 755 by default, user \fBcindys\fR will be unable to mount file systems under \fBtank/cindys\fR. Set an \fBACL\fR similar to the following syntax to provide mount point access:
+.sp
+.in +2
+.nf
+# \fBchmod A+user:cindys:add_subdirectory:allow /tank/cindys\fR
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 18 \fRDelegating Create Time Permissions on a ZFS Dataset
+.sp
+.LP
+The following example shows how to grant anyone in the group \fBstaff\fR to create file systems in \fBtank/users\fR. This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on \fBtank/users\fR are also displayed.
+
+.sp
+.in +2
+.nf
+# \fBzfs allow staff create,mount tank/users\fR
+# \fBzfs allow -c destroy tank/users\fR
+# \fBzfs allow tank/users\fR
+-------------------------------------------------------------
+Create time permissions on (tank/users)
+          create,destroy
+Local+Descendent permissions on (tank/users)
+          group staff create,mount
+------------------------------------------------------------- 
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 19 \fRDefining and Granting a Permission Set on a ZFS Dataset
+.sp
+.LP
+The following example shows how to define and grant a permission set on the \fBtank/users\fR file system. The permissions on \fBtank/users\fR are also displayed.
+
+.sp
+.in +2
+.nf
+# \fBzfs allow -s @pset create,destroy,snapshot,mount tank/users\fR
+# \fBzfs allow staff @pset tank/users\fR
+# \fBzfs allow tank/users\fR
+-------------------------------------------------------------
+Permission sets on (tank/users)
+        @pset create,destroy,mount,snapshot
+Create time permissions on (tank/users)
+        create,destroy
+Local+Descendent permissions on (tank/users)
+        group staff @pset,create,mount
+-------------------------------------------------------------
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 20 \fRDelegating Property Permissions on a ZFS Dataset
+.sp
+.LP
+The following example shows to grant the ability to set quotas and reservations on the \fBusers/home\fR file system. The permissions on \fBusers/home\fR are also displayed.
+
+.sp
+.in +2
+.nf
+# \fBzfs allow cindys quota,reservation users/home\fR
+# \fBzfs allow users/home\fR
+-------------------------------------------------------------
+Local+Descendent permissions on (users/home)
+        user cindys quota,reservation
+-------------------------------------------------------------
+cindys% \fBzfs set quota=10G users/home/marks\fR
+cindys% \fBzfs get quota users/home/marks\fR
+NAME              PROPERTY  VALUE             SOURCE
+users/home/marks  quota     10G               local 
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 21 \fRRemoving ZFS Delegated Permissions on a ZFS Dataset
+.sp
+.LP
+The following example shows how to remove the snapshot permission from the \fBstaff\fR group on the \fBtank/users\fR file system. The permissions on \fBtank/users\fR are also displayed.
+
+.sp
+.in +2
+.nf
+# \fBzfs unallow staff snapshot tank/users\fR
+# \fBzfs allow tank/users\fR
+-------------------------------------------------------------
+Permission sets on (tank/users)
+        @pset create,destroy,mount,snapshot
+Create time permissions on (tank/users)
+        create,destroy
+Local+Descendent permissions on (tank/users)
+        group staff @pset,create,mount
+------------------------------------------------------------- 
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 22\fR Showing the differences between a snapshot and a ZFS Dataset
+.sp
+.LP
+The following example shows how to see what has changed between a prior
+snapshot of a ZFS Dataset and its current state.  The \fB-F\fR option is used
+to indicate type information for the files affected.
+
+.sp
+.in +2
+.nf
+# zfs diff -F tank/test@before tank/test
+M       /       /tank/test/
+M       F       /tank/test/linked      (+1)
+R       F       /tank/test/oldname -> /tank/test/newname
+-       F       /tank/test/deleted
++       F       /tank/test/created
+M       F       /tank/test/modified
+.fi
+.in -2
+.sp
+
+.LP
+\fBExample 23\fR Creating a bookmark
+.sp
+.LP
+The following example create a bookmark to a snapshot. This bookmark can then
+be used instead of snapshot in send streams.
+
+.sp
+.in +2
+.nf
+# zfs bookmark rpool@snapshot rpool#bookmark
+.fi
+.in -2
+.sp
+
+.SH "ENVIRONMENT VARIABLES"
+.TP
+.B "ZFS_ABORT
+Cause \fBzfs\fR to dump core on exit for the purposes of running \fB::findleaks\fR.
+
+.SH EXIT STATUS
+.LP
+The following exit values are returned:
+.sp
+.ne 2
+.mk
+.na
+\fB\fB0\fR\fR
+.ad
+.sp .6
+.RS 4n
+Successful completion. 
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB1\fR\fR
+.ad
+.sp .6
+.RS 4n
+An error occurred.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
+\fB\fB2\fR\fR
+.ad
+.sp .6
+.RS 4n
+Invalid command line options were specified.
+.RE
+
+.SH SEE ALSO
+.LP
+\fBchmod\fR(2), \fBfsync\fR(2), \fBgzip\fR(1), \fBmount\fR(8), \fBssh\fR(1), \fBstat\fR(2), \fBwrite\fR(2), \fBzpool\fR(8)
diff --git a/man/man8/zfs.8.rej b/man/man8/zfs.8.rej
new file mode 100644
index 000000000000..b7673a2e4021
--- /dev/null
+++ b/man/man8/zfs.8.rej
@@ -0,0 +1,42 @@
+--- man/man8/zfs.8 
++++ man/man8/zfs.8 
+@@ -175,11 +175,13 @@
+ .Nm
+ .Cm receive
+ .Op Fl Fnuv
++.Op Fl o Sy origin Ns = Ns Ar snapshot
+ .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
+ .Nm
+ .Cm receive
+ .Op Fl Fnuv
+ .Op Fl d Ns | Ns Fl e
++.Op Fl o Sy origin Ns = Ns Ar snapshot
+ .Ar filesystem
+ .Nm
+ .Cm allow
+@@ -2635,12 +2637,14 @@ origin, etc.
+ .Nm
+ .Cm receive
+ .Op Fl Fnuv
++.Op Fl o Sy origin Ns = Ns Ar snapshot
+ .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
+ .br
+ .Nm
+ .Cm receive
+ .Op Fl Fnuv
+ .Op Fl d Ns | Ns Fl e
++.Op Fl o Sy origin Ns = Ns Ar snapshot
+ .Ar filesystem
+ .Xc
+ Creates a snapshot whose contents are as specified in the stream provided on
+@@ -2730,6 +2734,10 @@ snapshot as described in the paragraph above.
+ Do not actually receive the stream. This can be useful in conjunction with the
+ .Fl v
+ option to verify the name the receive operation would use.
++.It Fl o Sy origin Ns = Ns Ar snapshot
++Forces the stream to be received as a clone of the given snapshot.
++This is only valid if the stream is an incremental stream whose source
++is the same as the provided origin.
+ .It Fl u
+ File system that is associated with the received stream is not mounted.
+ .It Fl v
diff --git a/module/zfs/Makefile.in.orig b/module/zfs/Makefile.in.orig
new file mode 100644
index 000000000000..55f8cef16b6d
--- /dev/null
+++ b/module/zfs/Makefile.in.orig
@@ -0,0 +1,108 @@
+src = @abs_top_srcdir@/module/zfs
+obj = @abs_builddir@
+
+MODULE := zfs
+
+EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += arc.o
+$(MODULE)-objs += blkptr.o
+$(MODULE)-objs += bplist.o
+$(MODULE)-objs += bpobj.o
+$(MODULE)-objs += dbuf.o
+$(MODULE)-objs += dbuf_stats.o
+$(MODULE)-objs += bptree.o
+$(MODULE)-objs += ddt.o
+$(MODULE)-objs += ddt_zap.o
+$(MODULE)-objs += dmu.o
+$(MODULE)-objs += dmu_diff.o
+$(MODULE)-objs += dmu_object.o
+$(MODULE)-objs += dmu_objset.o
+$(MODULE)-objs += dmu_send.o
+$(MODULE)-objs += dmu_traverse.o
+$(MODULE)-objs += dmu_tx.o
+$(MODULE)-objs += dmu_zfetch.o
+$(MODULE)-objs += dnode.o
+$(MODULE)-objs += dnode_sync.o
+$(MODULE)-objs += dsl_dataset.o
+$(MODULE)-objs += dsl_deadlist.o
+$(MODULE)-objs += dsl_deleg.o
+$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_dir.o
+$(MODULE)-objs += dsl_pool.o
+$(MODULE)-objs += dsl_prop.o
+$(MODULE)-objs += dsl_scan.o
+$(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += fm.o
+$(MODULE)-objs += gzip.o
+$(MODULE)-objs += lzjb.o
+$(MODULE)-objs += lz4.o
+$(MODULE)-objs += metaslab.o
+$(MODULE)-objs += multilist.o
+$(MODULE)-objs += range_tree.o
+$(MODULE)-objs += refcount.o
+$(MODULE)-objs += rrwlock.o
+$(MODULE)-objs += sa.o
+$(MODULE)-objs += sha256.o
+$(MODULE)-objs += spa.o
+$(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_config.o
+$(MODULE)-objs += spa_errlog.o
+$(MODULE)-objs += spa_history.o
+$(MODULE)-objs += spa_misc.o
+$(MODULE)-objs += spa_stats.o
+$(MODULE)-objs += space_map.o
+$(MODULE)-objs += space_reftree.o
+$(MODULE)-objs += txg.o
+$(MODULE)-objs += trace.o
+$(MODULE)-objs += uberblock.o
+$(MODULE)-objs += unique.o
+$(MODULE)-objs += vdev.o
+$(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_disk.o
+$(MODULE)-objs += vdev_file.o
+$(MODULE)-objs += vdev_label.o
+$(MODULE)-objs += vdev_mirror.o
+$(MODULE)-objs += vdev_missing.o
+$(MODULE)-objs += vdev_queue.o
+$(MODULE)-objs += vdev_raidz.o
+$(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += zap.o
+$(MODULE)-objs += zap_leaf.o
+$(MODULE)-objs += zap_micro.o
+$(MODULE)-objs += zfeature.o
+$(MODULE)-objs += zfeature_common.o
+$(MODULE)-objs += zfs_acl.o
+$(MODULE)-objs += zfs_byteswap.o
+$(MODULE)-objs += zfs_ctldir.o
+$(MODULE)-objs += zfs_debug.o
+$(MODULE)-objs += zfs_dir.o
+$(MODULE)-objs += zfs_fm.o
+$(MODULE)-objs += zfs_fuid.o
+$(MODULE)-objs += zfs_ioctl.o
+$(MODULE)-objs += zfs_log.o
+$(MODULE)-objs += zfs_onexit.o
+$(MODULE)-objs += zfs_replay.o
+$(MODULE)-objs += zfs_rlock.o
+$(MODULE)-objs += zfs_sa.o
+$(MODULE)-objs += zfs_vfsops.o
+$(MODULE)-objs += zfs_vnops.o
+$(MODULE)-objs += zfs_znode.o
+$(MODULE)-objs += zil.o
+$(MODULE)-objs += zio.o
+$(MODULE)-objs += zio_checksum.o
+$(MODULE)-objs += zio_compress.o
+$(MODULE)-objs += zio_inject.o
+$(MODULE)-objs += zle.o
+$(MODULE)-objs += zpl_ctldir.o
+$(MODULE)-objs += zpl_export.o
+$(MODULE)-objs += zpl_file.o
+$(MODULE)-objs += zpl_inode.o
+$(MODULE)-objs += zpl_super.o
+$(MODULE)-objs += zpl_xattr.o
+$(MODULE)-objs += zrlock.o
+$(MODULE)-objs += zvol.o
+$(MODULE)-objs += dsl_destroy.o
+$(MODULE)-objs += dsl_userhold.o
diff --git a/module/zfs/Makefile.in.rej b/module/zfs/Makefile.in.rej
new file mode 100644
index 000000000000..08dec3c3a861
--- /dev/null
+++ b/module/zfs/Makefile.in.rej
@@ -0,0 +1,10 @@
+--- module/zfs/Makefile.in
++++ module/zfs/Makefile.in
+@@ -1328,6 +1330,7 @@ ZFS_COMMON_OBJS +=		\
+ 	bplist.o		\
+ 	bpobj.o			\
+ 	bptree.o		\
++	bqueue.o		\
+ 	dbuf.o			\
+ 	ddt.o			\
+ 	ddt_zap.o		\
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index 9f62d7b911f3..c0edc9d768e8 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -156,7 +156,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	int err;
 	struct bptree_args *ba = arg;
 
-	if (BP_IS_HOLE(bp))
+	if (bp == NULL || BP_IS_HOLE(bp))
 		return (0);
 
 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
diff --git a/module/zfs/bptree.c.orig b/module/zfs/bptree.c.orig
new file mode 100644
index 000000000000..9f62d7b911f3
--- /dev/null
+++ b/module/zfs/bptree.c.orig
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/refcount.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+struct bptree_args {
+	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
+	boolean_t ba_free;	/* true if freeing during traversal */
+
+	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
+	void *ba_arg;		/* caller supplied argument to ba_func */
+	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    sizeof (bptree_phys_t), tx);
+
+	/*
+	 * Bonus buffer contents are already initialized to 0, but for
+	 * readability we make it explicit.
+	 */
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	bt = db->db_data;
+	bt->bt_begin = 0;
+	bt->bt_end = 0;
+	bt->bt_bytes = 0;
+	bt->bt_comp = 0;
+	bt->bt_uncomp = 0;
+	dmu_buf_rele(db, FTAG);
+
+	return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+	ASSERT0(bt->bt_bytes);
+	ASSERT0(bt->bt_comp);
+	ASSERT0(bt->bt_uncomp);
+	dmu_buf_rele(db, FTAG);
+
+	return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	bptree_entry_phys_t *bte;
+
+	/*
+	 * bptree objects are in the pool mos, therefore they can only be
+	 * modified in syncing context. Furthermore, this is only modified
+	 * by the sync thread, so no locking is necessary.
+	 */
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+
+	bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
+	bte->be_birth_txg = birth_txg;
+	bte->be_bp = *bp;
+	dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+	kmem_free(bte, sizeof (*bte));
+
+	dmu_buf_will_dirty(db, tx);
+	bt->bt_end++;
+	bt->bt_bytes += bytes;
+	bt->bt_comp += comp;
+	bt->bt_uncomp += uncomp;
+	dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	int err;
+	struct bptree_args *ba = arg;
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+	if (err == 0 && ba->ba_free) {
+		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+	}
+	return (err);
+}
+
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+    void *arg, dmu_tx_t *tx)
+{
+	boolean_t ioerr = B_FALSE;
+	int err;
+	uint64_t i;
+	dmu_buf_t *db;
+	struct bptree_args ba;
+
+	ASSERT(!free || dmu_tx_is_syncing(tx));
+
+	err = dmu_bonus_hold(os, obj, FTAG, &db);
+	if (err != 0)
+		return (err);
+
+	if (free)
+		dmu_buf_will_dirty(db, tx);
+
+	ba.ba_phys = db->db_data;
+	ba.ba_free = free;
+	ba.ba_func = func;
+	ba.ba_arg = arg;
+	ba.ba_tx = tx;
+
+	err = 0;
+	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+		bptree_entry_phys_t bte;
+		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
+
+		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+		    &bte, DMU_READ_NO_PREFETCH);
+		if (err != 0)
+			break;
+
+		if (zfs_free_leak_on_eio)
+			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    i, (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
+		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+		    bte.be_birth_txg, &bte.be_zb, flags,
+		    bptree_visit_cb, &ba);
+		if (free) {
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
+			if (err != 0) {
+				/* save bookmark for future resume */
+				ASSERT3U(bte.be_zb.zb_objset, ==,
+				    ZB_DESTROYED_OBJSET);
+				ASSERT0(bte.be_zb.zb_level);
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
+				/*
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin.  Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
+				 */
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+			}
+
+			if (!ioerr) {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		} else if (err != 0) {
+			break;
+		}
+	}
+
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+	/* if all blocks are free there should be no used space */
+	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
+		ASSERT0(ba.ba_phys->bt_bytes);
+		ASSERT0(ba.ba_phys->bt_comp);
+		ASSERT0(ba.ba_phys->bt_uncomp);
+	}
+
+	dmu_buf_rele(db, FTAG);
+
+	return (err);
+}
diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c
new file mode 100644
index 000000000000..1ddc697b5424
--- /dev/null
+++ b/module/zfs/bqueue.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include	<sys/bqueue.h>
+#include	<sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+	return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue  The maximum capacity of the queue is set to
+ * size.  Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and offset should give its offset from the start of the struct.  Return 0 on
+ * success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+	list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+	    node_offset + offsetof(bqueue_node_t, bqn_node));
+	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+	q->bq_node_offset = node_offset;
+	q->bq_size = 0;
+	q->bq_maxsize = size;
+	return (0);
+}
+
+/*
+ * Destroy a blocking queue.  This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+	ASSERT0(q->bq_size);
+	cv_destroy(&q->bq_add_cv);
+	cv_destroy(&q->bq_pop_cv);
+	mutex_destroy(&q->bq_lock);
+	list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming size units of capacity.  If there is insufficient
+ * capacity to consume size units, block until capacity exists.  Asserts size is
+ * > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+	ASSERT3U(item_size, >, 0);
+	ASSERT3U(item_size, <, q->bq_maxsize);
+	mutex_enter(&q->bq_lock);
+	obj2node(q, data)->bqn_size = item_size;
+	while (q->bq_size + item_size > q->bq_maxsize) {
+		cv_wait(&q->bq_add_cv, &q->bq_lock);
+	}
+	q->bq_size += item_size;
+	list_insert_tail(&q->bq_list, data);
+	cv_signal(&q->bq_pop_cv);
+	mutex_exit(&q->bq_lock);
+}
+/*
+ * Take the first element off of q.  If there are no elements on the queue, wait
+ * until one is put there.  Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+	void *ret;
+	uint64_t item_size;
+	mutex_enter(&q->bq_lock);
+	while (q->bq_size == 0) {
+		cv_wait(&q->bq_pop_cv, &q->bq_lock);
+	}
+	ret = list_remove_head(&q->bq_list);
+	item_size = obj2node(q, ret)->bqn_size;
+	q->bq_size -= item_size;
+	mutex_exit(&q->bq_lock);
+	cv_signal(&q->bq_add_cv);
+	return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+	return (q->bq_size == 0);
+}
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 31242d60154e..535916f85e15 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -604,11 +604,35 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 	return (abuf);
 }
 
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
 uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
-	if (dn->dn_datablkshift) {
-		return (offset >> dn->dn_datablkshift);
+	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+		/*
+		 * The level n blkid is equal to the level 0 blkid divided by
+		 * the number of level 0s in a level n block.
+		 *
+		 * The level 0 blkid is offset >> datablkshift =
+		 * offset / 2^datablkshift.
+		 *
+		 * The number of level 0s in a level n is the number of block
+		 * pointers in an indirect block, raised to the power of level.
+		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+		 *
+		 * Thus, the level n blkid is: offset /
+		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+		 * = offset / 2^(datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 * = offset >> (datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 */
+		return (offset >> (dn->dn_datablkshift + level *
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
@@ -1946,6 +1970,12 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	return (db);
 }
 
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
 static int
 dbuf_do_evict(void *private)
 {
@@ -2011,11 +2041,96 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
+typedef struct dbuf_prefetch_arg {
+	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
+	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+	int dpa_curlevel; /* The current level that we're reading */
+	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return;
+
+	arc_flags_t aflags =
+	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+	ASSERT(dpa->dpa_zio != NULL);
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in.  This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+{
+	dbuf_prefetch_arg_t *dpa = private;
+
+	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+	ASSERT3S(dpa->dpa_curlevel, >, 0);
+	if (zio != NULL) {
+		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+	}
+
+	dpa->dpa_curlevel--;
+
+	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+		kmem_free(dpa, sizeof (*dpa));
+	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+		dbuf_issue_final_prefetch(dpa, bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
+	(void) arc_buf_remove_ref(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level.  If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously.  As a result, this call never blocks waiting for a read to
+ * complete.
+ */
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
 {
-	dmu_buf_impl_t *db = NULL;
-	blkptr_t *bp = NULL;
+	blkptr_t bp;
+	int epbs, nlevels, curlevel;
+	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -2159,7 +2274,8 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
  * on the stack for 20 levels of recursion.
  */
 int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	struct dbuf_hold_impl_data *dh;
@@ -2194,16 +2310,14 @@ __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
-	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
-	return (err ? NULL : db);
+	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
@@ -2531,8 +2645,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 		if (parent == NULL) {
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, db, &parent);
+			parent = dbuf_hold_level(dn, db->db_level + 1,
+			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
diff --git a/module/zfs/dbuf.c.orig b/module/zfs/dbuf.c.orig
new file mode 100644
index 000000000000..31242d60154e
--- /dev/null
+++ b/module/zfs/dbuf.c.orig
@@ -0,0 +1,3153 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/dmu.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/trace_dbuf.h>
+
+struct dbuf_hold_impl_data {
+	/* Function arguments */
+	dnode_t *dh_dn;
+	uint8_t dh_level;
+	uint64_t dh_blkid;
+	int dh_fail_sparse;
+	void *dh_tag;
+	dmu_buf_impl_t **dh_dbp;
+	/* Local variables */
+	dmu_buf_impl_t *dh_db;
+	dmu_buf_impl_t *dh_parent;
+	blkptr_t *dh_bp;
+	int dh_err;
+	dbuf_dirty_record_t *dh_dr;
+	arc_buf_contents_t dh_type;
+	int dh_depth;
+};
+
+static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp, int depth);
+static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
+
+/*
+ * Number of times that zfs_free_range() took the slow path while doing
+ * a zfs receive.  A nonzero value indicates a potential performance problem.
+ */
+uint64_t zfs_free_range_recv_miss;
+
+static void dbuf_destroy(dmu_buf_impl_t *db);
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+
+#ifndef __lint
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+    dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
+#endif /* ! __lint */
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_cache;
+static taskq_t *dbu_evict_taskq;
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+	dmu_buf_impl_t *db = vdb;
+	bzero(db, sizeof (dmu_buf_impl_t));
+
+	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+	refcount_create(&db->db_holds);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+	dmu_buf_impl_t *db = vdb;
+	mutex_destroy(&db->db_mtx);
+	cv_destroy(&db->db_changed);
+	refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+	uintptr_t osv = (uintptr_t)os;
+	uint64_t crc = -1ULL;
+
+	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
+	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
+
+	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
+
+	return (crc);
+}
+
+#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
+
+#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
+	((dbuf)->db.db_object == (obj) &&		\
+	(dbuf)->db_objset == (os) &&			\
+	(dbuf)->db_level == (level) &&			\
+	(dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv;
+	uint64_t idx;
+	dmu_buf_impl_t *db;
+
+	hv = DBUF_HASH(os, obj, level, blkid);
+	idx = hv & h->hash_table_mask;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+			mutex_enter(&db->db_mtx);
+			if (db->db_state != DB_EVICTING) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (db);
+			}
+			mutex_exit(&db->db_mtx);
+		}
+	}
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	return (NULL);
+}
+
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_buf_impl_t *db = NULL;
+
+	if (dnode_hold(os, object, FTAG, &dn) == 0) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		if (dn->dn_bonus != NULL) {
+			db = dn->dn_bonus;
+			mutex_enter(&db->db_mtx);
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		dnode_rele(dn, FTAG);
+	}
+	return (db);
+}
+
+/*
+ * Insert an entry into the hash table.  If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	objset_t *os = db->db_objset;
+	uint64_t obj = db->db.db_object;
+	int level = db->db_level;
+	uint64_t blkid, hv, idx;
+	dmu_buf_impl_t *dbf;
+
+	blkid = db->db_blkid;
+	hv = DBUF_HASH(os, obj, level, blkid);
+	idx = hv & h->hash_table_mask;
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
+		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+			mutex_enter(&dbf->db_mtx);
+			if (dbf->db_state != DB_EVICTING) {
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
+				return (dbf);
+			}
+			mutex_exit(&dbf->db_mtx);
+		}
+	}
+
+	mutex_enter(&db->db_mtx);
+	db->db_hash_next = h->hash_table[idx];
+	h->hash_table[idx] = db;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, 1);
+
+	return (NULL);
+}
+
+/*
+ * Remove an entry from the hash table.  It must be in the EVICTING state.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	uint64_t hv, idx;
+	dmu_buf_impl_t *dbf, **dbp;
+
+	hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid);
+	idx = hv & h->hash_table_mask;
+
+	/*
+	 * We musn't hold db_mtx to maintain lock ordering:
+	 * DBUF_HASH_MUTEX > db_mtx.
+	 */
+	ASSERT(refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_state == DB_EVICTING);
+	ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
+	dbp = &h->hash_table[idx];
+	while ((dbf = *dbp) != db) {
+		dbp = &dbf->db_hash_next;
+		ASSERT(dbf != NULL);
+	}
+	*dbp = db->db_hash_next;
+	db->db_hash_next = NULL;
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
+	atomic_add_64(&dbuf_hash_count, -1);
+}
+
+static arc_evict_func_t dbuf_do_evict;
+
+typedef enum {
+	DBVU_EVICTING,
+	DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+	int64_t holds;
+
+	if (db->db_user == NULL)
+		return;
+
+	/* Only data blocks support the attachment of user data. */
+	ASSERT(db->db_level == 0);
+
+	/* Clients must resolve a dbuf before attaching user data. */
+	ASSERT(db->db.db_data != NULL);
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+
+	holds = refcount_count(&db->db_holds);
+	if (verify_type == DBVU_EVICTING) {
+		/*
+		 * Immediate eviction occurs when holds == dirtycnt.
+		 * For normal eviction buffers, holds is zero on
+		 * eviction, except when dbuf_fix_old_data() calls
+		 * dbuf_clear_data().  However, the hold count can grow
+		 * during eviction even though db_mtx is held (see
+		 * dmu_bonus_hold() for an example), so we can only
+		 * test the generic invariant that holds >= dirtycnt.
+		 */
+		ASSERT3U(holds, >=, db->db_dirtycnt);
+	} else {
+		if (db->db_user_immediate_evict == TRUE)
+			ASSERT3U(holds, >=, db->db_dirtycnt);
+		else
+			ASSERT3U(holds, >, 0);
+	}
+#endif
+}
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+	dmu_buf_user_t *dbu = db->db_user;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (dbu == NULL)
+		return;
+
+	dbuf_verify_user(db, DBVU_EVICTING);
+	db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+	if (dbu->dbu_clear_on_evict_dbufp != NULL)
+		*dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+	/*
+	 * Invoke the callback from a taskq to avoid lock order reversals
+	 * and limit stack depth.
+	 */
+	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
+	    &dbu->dbu_tqent);
+}
+
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+	/*
+	 * Consider indirect blocks and spill blocks to be meta data.
+	 */
+	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
+		return (B_TRUE);
+	} else {
+		boolean_t is_metadata;
+
+		DB_DNODE_ENTER(db);
+		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
+		DB_DNODE_EXIT(db);
+
+		return (is_metadata);
+	}
+}
+
+void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+void
+dbuf_init(void)
+{
+	uint64_t hsize = 1ULL << 16;
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	/*
+	 * The hash table is big enough to fill all of physical memory
+	 * with an average block size of zfs_arc_average_blocksize (default 8K).
+	 * By default, the table will take up
+	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
+	 */
+	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+		hsize <<= 1;
+
+retry:
+	h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/*
+	 * Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the linux kernel
+	 */
+	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+#else
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
+	if (h->hash_table == NULL) {
+		/* XXX - we should really return an error instead of assert */
+		ASSERT(hsize > (1ULL << 10));
+		hsize >>= 1;
+		goto retry;
+	}
+
+	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	    sizeof (dmu_buf_impl_t),
+	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+	dbuf_stats_init(h);
+
+	/*
+	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+	 * configuration is not required.
+	 */
+	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+}
+
+void
+dbuf_fini(void)
+{
+	dbuf_hash_table_t *h = &dbuf_hash_table;
+	int i;
+
+	dbuf_stats_destroy();
+
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/*
+	 * Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel
+	 */
+	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
+	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
+	kmem_cache_destroy(dbuf_cache);
+	taskq_destroy(dbu_evict_taskq);
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+	dnode_t *dn;
+	dbuf_dirty_record_t *dr;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+		return;
+
+	ASSERT(db->db_objset != NULL);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dn == NULL) {
+		ASSERT(db->db_parent == NULL);
+		ASSERT(db->db_blkptr == NULL);
+	} else {
+		ASSERT3U(db->db.db_object, ==, dn->dn_object);
+		ASSERT3P(db->db_objset, ==, dn->dn_objset);
+		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+		    db->db_blkid == DMU_SPILL_BLKID ||
+		    !avl_is_empty(&dn->dn_dbufs));
+	}
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+	} else if (db->db_blkid == DMU_SPILL_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+		ASSERT0(db->db.db_offset);
+	} else {
+		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+	}
+
+	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+		ASSERT(dr->dr_dbuf == db);
+
+	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+		ASSERT(dr->dr_dbuf == db);
+
+	/*
+	 * We can't assert that db_size matches dn_datablksz because it
+	 * can be momentarily different when another thread is doing
+	 * dnode_set_blksz().
+	 */
+	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+		dr = db->db_data_pending;
+		/*
+		 * It should only be modified in syncing context, so
+		 * make sure we only have one copy of the data.
+		 */
+		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+	}
+
+	/* verify db->db_blkptr */
+	if (db->db_blkptr) {
+		if (db->db_parent == dn->dn_dbuf) {
+			/* db is pointed to by the dnode */
+			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
+				ASSERT(db->db_parent == NULL);
+			else
+				ASSERT(db->db_parent != NULL);
+			if (db->db_blkid != DMU_SPILL_BLKID)
+				ASSERT3P(db->db_blkptr, ==,
+				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		} else {
+			/* db is pointed to by an indirect block */
+			ASSERTV(int epb = db->db_parent->db.db_size >>
+				SPA_BLKPTRSHIFT);
+			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+			ASSERT3U(db->db_parent->db.db_object, ==,
+			    db->db.db_object);
+			/*
+			 * dnode_grow_indblksz() can make this fail if we don't
+			 * have the struct_rwlock.  XXX indblksz no longer
+			 * grows.  safe to do this now?
+			 */
+			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+				ASSERT3P(db->db_blkptr, ==,
+				    ((blkptr_t *)db->db_parent->db.db_data +
+				    db->db_blkid % epb));
+			}
+		}
+	}
+	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+	    (db->db_buf == NULL || db->db_buf->b_data) &&
+	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
+	    db->db_state != DB_FILL && !dn->dn_free_txg) {
+		/*
+		 * If the blkptr isn't set but they have nonzero data,
+		 * it had better be dirty, otherwise we'll lose that
+		 * data when we evict this buffer.
+		 */
+		if (db->db_dirtycnt == 0) {
+			ASSERTV(uint64_t *buf = db->db.db_data);
+			int i;
+
+			for (i = 0; i < db->db.db_size >> 3; i++) {
+				ASSERT(buf[i] == 0);
+			}
+		}
+	}
+	DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+dbuf_clear_data(dmu_buf_impl_t *db)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	dbuf_evict_user(db);
+	db->db_buf = NULL;
+	db->db.db_data = NULL;
+	if (db->db_state != DB_NOFILL)
+		db->db_state = DB_UNCACHED;
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(buf != NULL);
+
+	db->db_buf = buf;
+	ASSERT(buf->b_data != NULL);
+	db->db.db_data = buf->b_data;
+	if (!arc_released(buf))
+		arc_set_callback(buf, dbuf_do_evict, db);
+}
+
+/*
+ * Loan out an arc_buf for read.  Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+	arc_buf_t *abuf;
+
+	mutex_enter(&db->db_mtx);
+	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+		int blksz = db->db.db_size;
+		spa_t *spa = db->db_objset->os_spa;
+
+		mutex_exit(&db->db_mtx);
+		abuf = arc_loan_buf(spa, blksz);
+		bcopy(db->db.db_data, abuf->b_data, blksz);
+	} else {
+		abuf = db->db_buf;
+		arc_loan_inuse_buf(abuf, db);
+		dbuf_clear_data(db);
+		mutex_exit(&db->db_mtx);
+	}
+	return (abuf);
+}
+
+uint64_t
+dbuf_whichblock(dnode_t *dn, uint64_t offset)
+{
+	if (dn->dn_datablkshift) {
+		return (offset >> dn->dn_datablkshift);
+	} else {
+		ASSERT3U(offset, <, dn->dn_datablksz);
+		return (0);
+	}
+}
+
+static void
+dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+
+	mutex_enter(&db->db_mtx);
+	ASSERT3U(db->db_state, ==, DB_READ);
+	/*
+	 * All reads are synchronous, so we must have a hold on the dbuf
+	 */
+	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db.db_data == NULL);
+	if (db->db_level == 0 && db->db_freed_in_flight) {
+		/* we were freed in flight; disregard any error */
+		arc_release(buf, db);
+		bzero(buf->b_data, db->db.db_size);
+		arc_buf_freeze(buf);
+		db->db_freed_in_flight = FALSE;
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else if (zio == NULL || zio->io_error == 0) {
+		dbuf_set_data(db, buf);
+		db->db_state = DB_CACHED;
+	} else {
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+		ASSERT3P(db->db_buf, ==, NULL);
+		VERIFY(arc_buf_remove_ref(buf, db));
+		db->db_state = DB_UNCACHED;
+	}
+	cv_broadcast(&db->db_changed);
+	dbuf_rele_and_unlock(db, NULL);
+}
+
+static int
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+{
+	dnode_t *dn;
+	zbookmark_phys_t zb;
+	uint32_t aflags = ARC_FLAG_NOWAIT;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	/* We need the struct_rwlock to prevent db_blkptr from changing. */
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_state == DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
+
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+
+		ASSERT3U(bonuslen, <=, db->db.db_size);
+		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		if (bonuslen < DN_MAX_BONUSLEN)
+			bzero(db->db.db_data, DN_MAX_BONUSLEN);
+		if (bonuslen)
+			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+		DB_DNODE_EXIT(db);
+		db->db_state = DB_CACHED;
+		mutex_exit(&db->db_mtx);
+		return (0);
+	}
+
+	/*
+	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+	 * processes the delete record and clears the bp while we are waiting
+	 * for the dn_mtx (resulting in a "no" from block_freed).
+	 */
+	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+	    BP_IS_HOLE(db->db_blkptr)))) {
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
+		DB_DNODE_EXIT(db);
+		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
+		    db->db.db_size, db, type));
+		bzero(db->db.db_data, db->db.db_size);
+		db->db_state = DB_CACHED;
+		*flags |= DB_RF_CACHED;
+		mutex_exit(&db->db_mtx);
+		return (0);
+	}
+
+	DB_DNODE_EXIT(db);
+
+	db->db_state = DB_READ;
+	mutex_exit(&db->db_mtx);
+
+	if (DBUF_IS_L2CACHEABLE(db))
+		aflags |= ARC_FLAG_L2CACHE;
+	if (DBUF_IS_L2COMPRESSIBLE(db))
+		aflags |= ARC_FLAG_L2COMPRESS;
+
+	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	dbuf_add_ref(db, NULL);
+
+	err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
+	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
+	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    &aflags, &zb);
+	if (aflags & ARC_FLAG_CACHED)
+		*flags |= DB_RF_CACHED;
+
+	return (SET_ERROR(err));
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+	int err = 0;
+	boolean_t havepzio = (zio != NULL);
+	boolean_t prefetch;
+	dnode_t *dn;
+
+	/*
+	 * We don't have to hold the mutex to check db_state because it
+	 * can't be freed while we have a hold on the buffer.
+	 */
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	if (db->db_state == DB_NOFILL)
+		return (SET_ERROR(EIO));
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+	    DBUF_IS_CACHEABLE(db);
+
+	mutex_enter(&db->db_mtx);
+	if (db->db_state == DB_CACHED) {
+		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+			    db->db.db_size, TRUE);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+	} else if (db->db_state == DB_UNCACHED) {
+		spa_t *spa = dn->dn_objset->os_spa;
+
+		if (zio == NULL)
+			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+		err = dbuf_read_impl(db, zio, &flags);
+
+		/* dbuf_read_impl has dropped db_mtx for us */
+
+		if (!err && prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+			    db->db.db_size, flags & DB_RF_CACHED);
+
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+
+		if (!err && !havepzio)
+			err = zio_wait(zio);
+	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED.  Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED.  Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
+		mutex_exit(&db->db_mtx);
+		if (prefetch)
+			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
+			    db->db.db_size, TRUE);
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
+
+		/* Skip the wait per the caller's request. */
+		mutex_enter(&db->db_mtx);
+		if ((flags & DB_RF_NEVERWAIT) == 0) {
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL) {
+				ASSERT(db->db_state == DB_READ ||
+				    (flags & DB_RF_HAVESTRUCT) == 0);
+				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+				    db, zio_t *, zio);
+				cv_wait(&db->db_changed, &db->db_mtx);
+			}
+			if (db->db_state == DB_UNCACHED)
+				err = SET_ERROR(EIO);
+		}
+		mutex_exit(&db->db_mtx);
+	}
+
+	ASSERT(err || havepzio || db->db_state == DB_CACHED);
+	return (err);
+}
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	mutex_enter(&db->db_mtx);
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+	if (db->db_state == DB_UNCACHED) {
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
+
+		ASSERT(db->db_buf == NULL);
+		ASSERT(db->db.db_data == NULL);
+		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
+		db->db_state = DB_FILL;
+	} else if (db->db_state == DB_NOFILL) {
+		dbuf_clear_data(db);
+	} else {
+		ASSERT3U(db->db_state, ==, DB_CACHED);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * This is our just-in-time copy function.  It makes a copy of
+ * buffers, that have been modified in a previous transaction
+ * group, before we modify them in the current active group.
+ *
+ * This function is used in two places: when we are dirtying a
+ * buffer for the first time in a txg, and when we are freeing
+ * a range in a dnode that includes this buffer.
+ *
+ * Note that when we are called from dbuf_free_range() we do
+ * not put a hold on the buffer, we just traverse the active
+ * dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+	ASSERT(db->db_level == 0);
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+	if (dr == NULL ||
+	    (dr->dt.dl.dr_data !=
+	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+		return;
+
+	/*
+	 * If the last dirty record for this dbuf has not yet synced
+	 * and its referencing the dbuf data, either:
+	 *	reset the reference to point to a new copy,
+	 * or (if there a no active holders)
+	 *	just null out the current db_data pointer.
+	 */
+	ASSERT(dr->dr_txg >= txg - 2);
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		/* Note that the data bufs here are zio_bufs */
+		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		int size = db->db.db_size;
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
+
+		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
+		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+	} else {
+		dbuf_clear_data(db);
+	}
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+	uint64_t txg = dr->dr_txg;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+	ASSERT(db->db_level == 0);
+
+	if (db->db_blkid == DMU_BONUS_BLKID ||
+	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+		return;
+
+	ASSERT(db->db_data_pending != dr);
+
+	/* free this block */
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+		zio_free(db->db_objset->os_spa, txg, bp);
+
+	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	dr->dt.dl.dr_nopwrite = B_FALSE;
+
+	/*
+	 * Release the already-written buffer, so we leave it in
+	 * a consistent dirty state.  Note that all callers are
+	 * modifying the buffer, so they will immediately do
+	 * another (redundant) arc_release().  Therefore, leave
+	 * the buf thawed to save the effort of freezing &
+	 * immediately re-thawing it.
+	 */
+	arc_release(dr->dt.dl.dr_data, db);
+}
+
+/*
+ * Evict (if its unreferenced) or clear (if its referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.
+ *
+ * This is a no-op if the dataset is in the middle of an incremental
+ * receive; see comment below for details.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db_search;
+	dmu_buf_impl_t *db, *db_next;
+	uint64_t txg = tx->tx_txg;
+	avl_index_t where;
+	boolean_t freespill =
+	    (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);
+
+	if (end_blkid > dn->dn_maxblkid && !freespill)
+		end_blkid = dn->dn_maxblkid;
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+	db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+	db_search->db_level = 0;
+	db_search->db_blkid = start_blkid;
+	db_search->db_state = DB_SEARCH;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
+		/* There can't be any dbufs in this range; no need to search. */
+#ifdef DEBUG
+		db = avl_find(&dn->dn_dbufs, db_search, &where);
+		ASSERT3P(db, ==, NULL);
+		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+		ASSERT(db == NULL || db->db_level > 0);
+#endif
+		goto out;
+	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
+		/*
+		 * If we are receiving, we expect there to be no dbufs in
+		 * the range to be freed, because receive modifies each
+		 * block at most once, and in offset order.  If this is
+		 * not the case, it can lead to performance problems,
+		 * so note that we unexpectedly took the slow path.
+		 */
+		atomic_inc_64(&zfs_free_range_recv_miss);
+	}
+
+	db = avl_find(&dn->dn_dbufs, db_search, &where);
+	ASSERT3P(db, ==, NULL);
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+	for (; db != NULL; db = db_next) {
+		db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+		if (db->db_level != 0 || db->db_blkid > end_blkid) {
+			break;
+		}
+		ASSERT3U(db->db_blkid, >=, start_blkid);
+
+		/* found a level 0 buffer in the range */
+		mutex_enter(&db->db_mtx);
+		if (dbuf_undirty(db, tx)) {
+			/* mutex has been dropped and dbuf destroyed */
+			continue;
+		}
+
+		if (db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_NOFILL ||
+		    db->db_state == DB_EVICTING) {
+			ASSERT(db->db.db_data == NULL);
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+			/* will be handled in dbuf_read_done or dbuf_rele */
+			db->db_freed_in_flight = TRUE;
+			mutex_exit(&db->db_mtx);
+			continue;
+		}
+		if (refcount_count(&db->db_holds) == 0) {
+			ASSERT(db->db_buf);
+			dbuf_clear(db);
+			continue;
+		}
+		/* The dbuf is referenced */
+
+		if (db->db_last_dirty != NULL) {
+			dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+			if (dr->dr_txg == txg) {
+				/*
+				 * This buffer is "in-use", re-adjust the file
+				 * size to reflect that this buffer may
+				 * contain new data when we sync.
+				 */
+				if (db->db_blkid != DMU_SPILL_BLKID &&
+				    db->db_blkid > dn->dn_maxblkid)
+					dn->dn_maxblkid = db->db_blkid;
+				dbuf_unoverride(dr);
+			} else {
+				/*
+				 * This dbuf is not dirty in the open context.
+				 * Either uncache it (if its not referenced in
+				 * the open context) or reset its contents to
+				 * empty.
+				 */
+				dbuf_fix_old_data(db, txg);
+			}
+		}
+		/* clear the contents if its cached */
+		if (db->db_state == DB_CACHED) {
+			ASSERT(db->db.db_data != NULL);
+			arc_release(db->db_buf, db);
+			bzero(db->db.db_data, db->db.db_size);
+			arc_buf_freeze(db->db_buf);
+		}
+
+		mutex_exit(&db->db_mtx);
+	}
+
+out:
+	kmem_free(db_search, sizeof (dmu_buf_impl_t));
+	mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+static int
+dbuf_block_freeable(dmu_buf_impl_t *db)
+{
+	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
+	uint64_t birth_txg = 0;
+
+	/*
+	 * We don't need any locking to protect db_blkptr:
+	 * If it's syncing, then db_last_dirty will be set
+	 * so we'll ignore db_blkptr.
+	 *
+	 * This logic ensures that only block births for
+	 * filled blocks are considered.
+	 */
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	if (db->db_last_dirty && (db->db_blkptr == NULL ||
+	    !BP_IS_HOLE(db->db_blkptr))) {
+		birth_txg = db->db_last_dirty->dr_txg;
+	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+		birth_txg = db->db_blkptr->blk_birth;
+	}
+
+	/*
+	 * If this block don't exist or is in a snapshot, it can't be freed.
+	 * Don't pass the bp to dsl_dataset_block_freeable() since we
+	 * are holding the db_mtx lock and might deadlock if we are
+	 * prefetching a dedup-ed block.
+	 */
+	if (birth_txg != 0)
+		return (ds == NULL ||
+		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
+	else
+		return (B_FALSE);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+	arc_buf_t *buf, *obuf;
+	int osize = db->db.db_size;
+	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+	dnode_t *dn;
+
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	/* XXX does *this* func really need the lock? */
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	/*
+	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
+	 * is OK, because there can be no other references to the db
+	 * when we are changing its size, so no concurrent DB_FILL can
+	 * be happening.
+	 */
+	/*
+	 * XXX we should be doing a dbuf_read, checking the return
+	 * value and returning that up to our callers
+	 */
+	dmu_buf_will_dirty(&db->db, tx);
+
+	/* create the data buffer for the new block */
+	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
+
+	/* copy old block data to the new block */
+	obuf = db->db_buf;
+	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
+	/* zero the remainder */
+	if (size > osize)
+		bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+	mutex_enter(&db->db_mtx);
+	dbuf_set_data(db, buf);
+	VERIFY(arc_buf_remove_ref(obuf, db));
+	db->db.db_size = size;
+
+	if (db->db_level == 0) {
+		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+		db->db_last_dirty->dt.dl.dr_data = buf;
+	}
+	mutex_exit(&db->db_mtx);
+
+	dnode_willuse_space(dn, size-osize, tx);
+	DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+	ASSERTV(objset_t *os = db->db_objset);
+
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(arc_released(os->os_phys_buf) ||
+	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+	(void) arc_release(db->db_buf, db);
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	objset_t *os;
+	dbuf_dirty_record_t **drp, *dr;
+	int drop_struct_lock = FALSE;
+	boolean_t do_free_accounting = B_FALSE;
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	DMU_TX_DIRTY_BUF(tx, db);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	/*
+	 * Shouldn't dirty a regular buffer in syncing context.  Private
+	 * objects may be dirtied in syncing context, but only if they
+	 * were already pre-dirtied in open context.
+	 */
+	ASSERT(!dmu_tx_is_syncing(tx) ||
+	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+	    dn->dn_objset->os_dsl_dataset == NULL);
+	/*
+	 * We make this assert for private objects as well, but after we
+	 * check if we're already dirty.  They are allowed to re-dirty
+	 * in syncing context.
+	 */
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * XXX make this true for indirects too?  The problem is that
+	 * transactions created with dmu_tx_create_assigned() from
+	 * syncing context don't bother holding ahead.
+	 */
+	ASSERT(db->db_level != 0 ||
+	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+	    db->db_state == DB_NOFILL);
+
+	mutex_enter(&dn->dn_mtx);
+	/*
+	 * Don't set dirtyctx to SYNC if we're just modifying this as we
+	 * initialize the objset.
+	 */
+	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
+	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+		dn->dn_dirtyctx =
+		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+		ASSERT(dn->dn_dirtyctx_firstset == NULL);
+		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+	}
+	mutex_exit(&dn->dn_mtx);
+
+	if (db->db_blkid == DMU_SPILL_BLKID)
+		dn->dn_have_spill = B_TRUE;
+
+	/*
+	 * If this buffer is already dirty, we're done.
+	 */
+	drp = &db->db_last_dirty;
+	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+	    db->db.db_object == DMU_META_DNODE_OBJECT);
+	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+		drp = &dr->dr_next;
+	if (dr && dr->dr_txg == tx->tx_txg) {
+		DB_DNODE_EXIT(db);
+
+		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+			/*
+			 * If this buffer has already been written out,
+			 * we now need to reset its state.
+			 */
+			dbuf_unoverride(dr);
+			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+			    db->db_state != DB_NOFILL)
+				arc_buf_thaw(db->db_buf);
+		}
+		mutex_exit(&db->db_mtx);
+		return (dr);
+	}
+
+	/*
+	 * Only valid if not already dirty.
+	 */
+	ASSERT(dn->dn_object == 0 ||
+	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+	ASSERT3U(dn->dn_nlevels, >, db->db_level);
+	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+	    dn->dn_phys->dn_nlevels > db->db_level ||
+	    dn->dn_next_nlevels[txgoff] > db->db_level ||
+	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+	/*
+	 * We should only be dirtying in syncing context if it's the
+	 * mos or we're initializing the os or it's a special object.
+	 * However, we are allowed to dirty in syncing context provided
+	 * we already dirtied it in open context.  Hence we must make
+	 * this assertion only if we're not already dirty.
+	 */
+	os = dn->dn_objset;
+	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+	ASSERT(db->db.db_size != 0);
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	if (db->db_blkid != DMU_BONUS_BLKID) {
+		/*
+		 * Update the accounting.
+		 * Note: we delay "free accounting" until after we drop
+		 * the db_mtx.  This keeps us from grabbing other locks
+		 * (and possibly deadlocking) in bp_get_dsize() while
+		 * also holding the db_mtx.
+		 */
+		dnode_willuse_space(dn, db->db.db_size, tx);
+		do_free_accounting = dbuf_block_freeable(db);
+	}
+
+	/*
+	 * If this buffer is dirty in an old transaction group we need
+	 * to make a copy of it so that the changes we make in this
+	 * transaction group won't leak out when we sync the older txg.
+	 */
+	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+	list_link_init(&dr->dr_dirty_node);
+	if (db->db_level == 0) {
+		void *data_old = db->db_buf;
+
+		if (db->db_state != DB_NOFILL) {
+			if (db->db_blkid == DMU_BONUS_BLKID) {
+				dbuf_fix_old_data(db, tx->tx_txg);
+				data_old = db->db.db_data;
+			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+				/*
+				 * Release the data buffer from the cache so
+				 * that we can modify it without impacting
+				 * possible other users of this cached data
+				 * block.  Note that indirect blocks and
+				 * private objects are not released until the
+				 * syncing state (since they are only modified
+				 * then).
+				 */
+				arc_release(db->db_buf, db);
+				dbuf_fix_old_data(db, tx->tx_txg);
+				data_old = db->db_buf;
+			}
+			ASSERT(data_old != NULL);
+		}
+		dr->dt.dl.dr_data = data_old;
+	} else {
+		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&dr->dt.di.dr_children,
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+	}
+	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+		dr->dr_accounted = db->db.db_size;
+	dr->dr_dbuf = db;
+	dr->dr_txg = tx->tx_txg;
+	dr->dr_next = *drp;
+	*drp = dr;
+
+	/*
+	 * We could have been freed_in_flight between the dbuf_noread
+	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
+	 * happened after the free.
+	 */
+	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+	    db->db_blkid != DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		if (dn->dn_free_ranges[txgoff] != NULL) {
+			range_tree_clear(dn->dn_free_ranges[txgoff],
+			    db->db_blkid, 1);
+		}
+		mutex_exit(&dn->dn_mtx);
+		db->db_freed_in_flight = FALSE;
+	}
+
+	/*
+	 * This buffer is now part of this txg
+	 */
+	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+	db->db_dirtycnt += 1;
+	ASSERT3U(db->db_dirtycnt, <=, 3);
+
+	mutex_exit(&db->db_mtx);
+
+	if (db->db_blkid == DMU_BONUS_BLKID ||
+	    db->db_blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		ASSERT(!list_link_active(&dr->dr_dirty_node));
+		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+		mutex_exit(&dn->dn_mtx);
+		dnode_setdirty(dn, tx);
+		DB_DNODE_EXIT(db);
+		return (dr);
+	} else if (do_free_accounting) {
+		blkptr_t *bp = db->db_blkptr;
+		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
+		/*
+		 * This is only a guess -- if the dbuf is dirty
+		 * in a previous txg, we don't know how much
+		 * space it will use on disk yet.  We should
+		 * really have the struct_rwlock to access
+		 * db_blkptr, but since this is just a guess,
+		 * it's OK if we get an odd answer.
+		 */
+		ddt_prefetch(os->os_spa, bp);
+		dnode_willuse_space(dn, -willfree, tx);
+	}
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (db->db_level == 0) {
+		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+		ASSERT(dn->dn_maxblkid >= db->db_blkid);
+	}
+
+	if (db->db_level+1 < dn->dn_nlevels) {
+		dmu_buf_impl_t *parent = db->db_parent;
+		dbuf_dirty_record_t *di;
+		int parent_held = FALSE;
+
+		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+			parent = dbuf_hold_level(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FTAG);
+			ASSERT(parent != NULL);
+			parent_held = TRUE;
+		}
+		if (drop_struct_lock)
+			rw_exit(&dn->dn_struct_rwlock);
+		ASSERT3U(db->db_level+1, ==, parent->db_level);
+		di = dbuf_dirty(parent, tx);
+		if (parent_held)
+			dbuf_rele(parent, FTAG);
+
+		mutex_enter(&db->db_mtx);
+		/*
+		 * Since we've dropped the mutex, it's possible that
+		 * dbuf_undirty() might have changed this out from under us.
+		 */
+		if (db->db_last_dirty == dr ||
+		    dn->dn_object == DMU_META_DNODE_OBJECT) {
+			mutex_enter(&di->dt.di.dr_mtx);
+			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+			ASSERT(!list_link_active(&dr->dr_dirty_node));
+			list_insert_tail(&di->dt.di.dr_children, dr);
+			mutex_exit(&di->dt.di.dr_mtx);
+			dr->dr_parent = di;
+		}
+		mutex_exit(&db->db_mtx);
+	} else {
+		ASSERT(db->db_level+1 == dn->dn_nlevels);
+		ASSERT(db->db_blkid < dn->dn_nblkptr);
+		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
+		mutex_enter(&dn->dn_mtx);
+		ASSERT(!list_link_active(&dr->dr_dirty_node));
+		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+		mutex_exit(&dn->dn_mtx);
+		if (drop_struct_lock)
+			rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	dnode_setdirty(dn, tx);
+	DB_DNODE_EXIT(db);
+	return (dr);
+}
+
+/*
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction.  Return whether this evicted the dbuf.
+ */
+static boolean_t
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	uint64_t txg = tx->tx_txg;
+	dbuf_dirty_record_t *dr, **drp;
+
+	ASSERT(txg != 0);
+
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT0(db->db_level);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	/*
+	 * If this buffer is not dirty, we're done.
+	 */
+	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+		if (dr->dr_txg <= txg)
+			break;
+	if (dr == NULL || dr->dr_txg < txg)
+		return (B_FALSE);
+	ASSERT(dr->dr_txg == txg);
+	ASSERT(dr->dr_dbuf == db);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+	ASSERT(db->db.db_size != 0);
+
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
+
+	*drp = dr->dr_next;
+
+	/*
+	 * Note that there are three places in dbuf_dirty()
+	 * where this dirty record may be put on a list.
+	 * Make sure to do a list_remove corresponding to
+	 * every one of those list_insert calls.
+	 */
+	if (dr->dr_parent) {
+		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+	} else if (db->db_blkid == DMU_SPILL_BLKID ||
+	    db->db_level + 1 == dn->dn_nlevels) {
+		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+		mutex_enter(&dn->dn_mtx);
+		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+		mutex_exit(&dn->dn_mtx);
+	}
+	DB_DNODE_EXIT(db);
+
+	if (db->db_state != DB_NOFILL) {
+		dbuf_unoverride(dr);
+
+		ASSERT(db->db_buf != NULL);
+		ASSERT(dr->dt.dl.dr_data != NULL);
+		if (dr->dt.dl.dr_data != db->db_buf)
+			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
+	}
+
+	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+
+	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+		arc_buf_t *buf = db->db_buf;
+
+		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
+		dbuf_clear_data(db);
+		VERIFY(arc_buf_remove_ref(buf, db));
+		dbuf_evict(db);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	DB_DNODE_ENTER(db);
+	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+		rf |= DB_RF_HAVESTRUCT;
+	DB_DNODE_EXIT(db);
+	(void) dbuf_read(db, NULL, rf);
+	(void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_state = DB_NOFILL;
+
+	dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_level == 0);
+	ASSERT(!refcount_is_zero(&db->db_holds));
+
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+	    dmu_tx_private_ok(tx));
+
+	dbuf_noread(db);
+	(void) dbuf_dirty(db, tx);
+}
+
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
+/* ARGSUSED */
+void
+dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	mutex_enter(&db->db_mtx);
+	DBUF_VERIFY(db);
+
+	if (db->db_state == DB_FILL) {
+		if (db->db_level == 0 && db->db_freed_in_flight) {
+			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+			/* we were freed while filling */
+			/* XXX dbuf_undirty? */
+			bzero(db->db.db_data, db->db.db_size);
+			db->db_freed_in_flight = FALSE;
+		}
+		db->db_state = DB_CACHED;
+		cv_broadcast(&db->db_changed);
+	}
+	mutex_exit(&db->db_mtx);
+}
+
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+	struct dirty_leaf *dl;
+	dmu_object_type_t type;
+
+	if (etype == BP_EMBEDDED_TYPE_DATA) {
+		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+		    SPA_FEATURE_EMBEDDED_DATA));
+	}
+
+	DB_DNODE_ENTER(db);
+	type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	ASSERT0(db->db_level);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+	dmu_buf_will_not_fill(dbuf, tx);
+
+	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+	dl = &db->db_last_dirty->dt.dl;
+	encode_embedded_bp_compressed(&dl->dr_overridden_by,
+	    data, comp, uncompressed_size, compressed_size);
+	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+	BP_SET_TYPE(&dl->dr_overridden_by, type);
+	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+	dl->dr_override_state = DR_OVERRIDDEN;
+	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
+/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT(db->db_level == 0);
+	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
+	ASSERT(buf != NULL);
+	ASSERT(arc_buf_size(buf) == db->db.db_size);
+	ASSERT(tx->tx_txg != 0);
+
+	arc_return_buf(buf, db);
+	ASSERT(arc_released(buf));
+
+	mutex_enter(&db->db_mtx);
+
+	while (db->db_state == DB_READ || db->db_state == DB_FILL)
+		cv_wait(&db->db_changed, &db->db_mtx);
+
+	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+	if (db->db_state == DB_CACHED &&
+	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+		mutex_exit(&db->db_mtx);
+		(void) dbuf_dirty(db, tx);
+		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+		VERIFY(arc_buf_remove_ref(buf, db));
+		xuio_stat_wbuf_copied();
+		return;
+	}
+
+	xuio_stat_wbuf_nocopy();
+	if (db->db_state == DB_CACHED) {
+		dbuf_dirty_record_t *dr = db->db_last_dirty;
+
+		ASSERT(db->db_buf != NULL);
+		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+			ASSERT(dr->dt.dl.dr_data == db->db_buf);
+			if (!arc_released(db->db_buf)) {
+				ASSERT(dr->dt.dl.dr_override_state ==
+				    DR_OVERRIDDEN);
+				arc_release(db->db_buf, db);
+			}
+			dr->dt.dl.dr_data = buf;
+			VERIFY(arc_buf_remove_ref(db->db_buf, db));
+		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+			arc_release(db->db_buf, db);
+			VERIFY(arc_buf_remove_ref(db->db_buf, db));
+		}
+		db->db_buf = NULL;
+	}
+	ASSERT(db->db_buf == NULL);
+	dbuf_set_data(db, buf);
+	db->db_state = DB_FILL;
+	mutex_exit(&db->db_mtx);
+	(void) dbuf_dirty(db, tx);
+	dmu_buf_fill_done(&db->db, tx);
+}
+
+/*
+ * "Clear" the contents of this dbuf.  This will mark the dbuf
+ * EVICTING and clear *most* of its references.  Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
+ * in this case.  For callers from the DMU we will usually see:
+ *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ *	DMU: dbuf_clear()->arc_clear_callback()
+ *	ARC: dbuf_do_evict()->dbuf_destroy()
+ *
+ * This routine will dissociate the dbuf from the arc, by calling
+ * arc_clear_callback(), but will not evict the data from the ARC.
+ */
+void
+dbuf_clear(dmu_buf_impl_t *db)
+{
+	dnode_t *dn;
+	dmu_buf_impl_t *parent = db->db_parent;
+	dmu_buf_impl_t *dndb;
+	boolean_t dbuf_gone = B_FALSE;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	dbuf_evict_user(db);
+
+	if (db->db_state == DB_CACHED) {
+		ASSERT(db->db.db_data != NULL);
+		if (db->db_blkid == DMU_BONUS_BLKID) {
+			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		}
+		db->db.db_data = NULL;
+		db->db_state = DB_UNCACHED;
+	}
+
+	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
+	ASSERT(db->db_data_pending == NULL);
+
+	db->db_state = DB_EVICTING;
+	db->db_blkptr = NULL;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dndb = dn->dn_dbuf;
+	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+		avl_remove(&dn->dn_dbufs, db);
+		atomic_dec_32(&dn->dn_dbufs_count);
+		membar_producer();
+		DB_DNODE_EXIT(db);
+		/*
+		 * Decrementing the dbuf count means that the hold corresponding
+		 * to the removed dbuf is no longer discounted in dnode_move(),
+		 * so the dnode cannot be moved until after we release the hold.
+		 * The membar_producer() ensures visibility of the decremented
+		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+		 * release any lock.
+		 */
+		dnode_rele(dn, db);
+		db->db_dnode_handle = NULL;
+	} else {
+		DB_DNODE_EXIT(db);
+	}
+
+	if (db->db_buf)
+		dbuf_gone = arc_clear_callback(db->db_buf);
+
+	if (!dbuf_gone)
+		mutex_exit(&db->db_mtx);
+
+	/*
+	 * If this dbuf is referenced from an indirect dbuf,
+	 * decrement the ref count on the indirect dbuf.
+	 */
+	if (parent && parent != dndb)
+		dbuf_rele(parent, db);
+}
+
+__attribute__((always_inline))
+static inline int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+    dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
+{
+	int nlevels, epbs;
+
+	*parentp = NULL;
+	*bpp = NULL;
+
+	ASSERT(blkid != DMU_BONUS_BLKID);
+
+	if (blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		if (dn->dn_have_spill &&
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+			*bpp = &dn->dn_phys->dn_spill;
+		else
+			*bpp = NULL;
+		dbuf_add_ref(dn->dn_dbuf, NULL);
+		*parentp = dn->dn_dbuf;
+		mutex_exit(&dn->dn_mtx);
+		return (0);
+	}
+
+	if (dn->dn_phys->dn_nlevels == 0)
+		nlevels = 1;
+	else
+		nlevels = dn->dn_phys->dn_nlevels;
+
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	ASSERT3U(level * epbs, <, 64);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	if (level >= nlevels ||
+	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+		/* the buffer has no parent yet */
+		return (SET_ERROR(ENOENT));
+	} else if (level < nlevels-1) {
+		/* this block is referenced from an indirect block */
+		int err;
+		if (dh == NULL) {
+			err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
+					fail_sparse, NULL, parentp);
+		} else {
+			__dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
+					blkid >> epbs, fail_sparse, NULL,
+					parentp, dh->dh_depth + 1);
+			err = __dbuf_hold_impl(dh + 1);
+		}
+		if (err)
+			return (err);
+		err = dbuf_read(*parentp, NULL,
+		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+		if (err) {
+			dbuf_rele(*parentp, NULL);
+			*parentp = NULL;
+			return (err);
+		}
+		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+		    (blkid & ((1ULL << epbs) - 1));
+		return (0);
+	} else {
+		/* the block is referenced from the dnode */
+		ASSERT3U(level, ==, nlevels-1);
+		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+		    blkid < dn->dn_phys->dn_nblkptr);
+		if (dn->dn_dbuf) {
+			dbuf_add_ref(dn->dn_dbuf, NULL);
+			*parentp = dn->dn_dbuf;
+		}
+		*bpp = &dn->dn_phys->dn_blkptr[blkid];
+		return (0);
+	}
+}
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+	objset_t *os = dn->dn_objset;
+	dmu_buf_impl_t *db, *odb;
+
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+
+	db->db_objset = os;
+	db->db.db_object = dn->dn_object;
+	db->db_level = level;
+	db->db_blkid = blkid;
+	db->db_last_dirty = NULL;
+	db->db_dirtycnt = 0;
+	db->db_dnode_handle = dn->dn_handle;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
+
+	db->db_user = NULL;
+	db->db_user_immediate_evict = FALSE;
+	db->db_freed_in_flight = FALSE;
+	db->db_pending_evict = FALSE;
+
+	if (blkid == DMU_BONUS_BLKID) {
+		ASSERT3P(parent, ==, dn->dn_dbuf);
+		db->db.db_size = DN_MAX_BONUSLEN -
+		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+		db->db.db_offset = DMU_BONUS_BLKID;
+		db->db_state = DB_UNCACHED;
+		/* the bonus dbuf is not placed in the hash table */
+		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+		return (db);
+	} else if (blkid == DMU_SPILL_BLKID) {
+		db->db.db_size = (blkptr != NULL) ?
+		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+		db->db.db_offset = 0;
+	} else {
+		int blocksize =
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
+		db->db.db_size = blocksize;
+		db->db.db_offset = db->db_blkid * blocksize;
+	}
+
+	/*
+	 * Hold the dn_dbufs_mtx while we get the new dbuf
+	 * in the hash table *and* added to the dbufs list.
+	 * This prevents a possible deadlock with someone
+	 * trying to look up this dbuf before its added to the
+	 * dn_dbufs list.
+	 */
+	mutex_enter(&dn->dn_dbufs_mtx);
+	db->db_state = DB_EVICTING;
+	if ((odb = dbuf_hash_insert(db)) != NULL) {
+		/* someone else inserted it first */
+		kmem_cache_free(dbuf_cache, db);
+		mutex_exit(&dn->dn_dbufs_mtx);
+		return (odb);
+	}
+	avl_add(&dn->dn_dbufs, db);
+	if (db->db_level == 0 && db->db_blkid >=
+	    dn->dn_unlisted_l0_blkid)
+		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
+	db->db_state = DB_UNCACHED;
+	mutex_exit(&dn->dn_dbufs_mtx);
+	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_add_ref(parent, db);
+
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    refcount_count(&dn->dn_holds) > 0);
+	(void) refcount_add(&dn->dn_holds, db);
+	atomic_inc_32(&dn->dn_dbufs_count);
+
+	dprintf_dbuf(db, "db=%p\n", db);
+
+	return (db);
+}
+
+static int
+dbuf_do_evict(void *private)
+{
+	dmu_buf_impl_t *db = private;
+
+	if (!MUTEX_HELD(&db->db_mtx))
+		mutex_enter(&db->db_mtx);
+
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_state != DB_EVICTING) {
+		ASSERT(db->db_state == DB_CACHED);
+		DBUF_VERIFY(db);
+		db->db_buf = NULL;
+		dbuf_evict(db);
+	} else {
+		mutex_exit(&db->db_mtx);
+		dbuf_destroy(db);
+	}
+	return (0);
+}
+
+static void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+	ASSERT(refcount_is_zero(&db->db_holds));
+
+	if (db->db_blkid != DMU_BONUS_BLKID) {
+		/*
+		 * If this dbuf is still on the dn_dbufs list,
+		 * remove it from that list.
+		 */
+		if (db->db_dnode_handle != NULL) {
+			dnode_t *dn;
+
+			DB_DNODE_ENTER(db);
+			dn = DB_DNODE(db);
+			mutex_enter(&dn->dn_dbufs_mtx);
+			avl_remove(&dn->dn_dbufs, db);
+			atomic_dec_32(&dn->dn_dbufs_count);
+			mutex_exit(&dn->dn_dbufs_mtx);
+			DB_DNODE_EXIT(db);
+			/*
+			 * Decrementing the dbuf count means that the hold
+			 * corresponding to the removed dbuf is no longer
+			 * discounted in dnode_move(), so the dnode cannot be
+			 * moved until after we release the hold.
+			 */
+			dnode_rele(dn, db);
+			db->db_dnode_handle = NULL;
+		}
+		dbuf_hash_remove(db);
+	}
+	db->db_parent = NULL;
+	db->db_buf = NULL;
+
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+
+	kmem_cache_free(dbuf_cache, db);
+	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+}
+
+void
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
+{
+	dmu_buf_impl_t *db = NULL;
+	blkptr_t *bp = NULL;
+
+	ASSERT(blkid != DMU_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+	if (dnode_block_freed(dn, blkid))
+		return;
+
+	/* dbuf_find() returns with db_mtx held */
+	if ((db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid))) {
+		/*
+		 * This dbuf is already in the cache.  We assume that
+		 * it is already CACHED, or else about to be either
+		 * read or filled.
+		 */
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+
+	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
+		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+			arc_flags_t aflags =
+			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+			zbookmark_phys_t zb;
+
+			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+			    dn->dn_object, 0, blkid);
+
+			(void) arc_read(NULL, dn->dn_objset->os_spa,
+			    bp, NULL, NULL, prio,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    &aflags, &zb);
+		}
+		if (db)
+			dbuf_rele(db, NULL);
+	}
+}
+
+#define	DBUF_HOLD_IMPL_MAX_DEPTH	20
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+static int
+__dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
+{
+	ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
+	dh->dh_parent = NULL;
+
+	ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
+	ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
+	ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
+
+	*(dh->dh_dbp) = NULL;
+top:
+	/* dbuf_find() returns with db_mtx held */
+	dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
+	    dh->dh_level, dh->dh_blkid);
+
+	if (dh->dh_db == NULL) {
+		dh->dh_bp = NULL;
+
+		ASSERT3P(dh->dh_parent, ==, NULL);
+		dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+					dh->dh_fail_sparse, &dh->dh_parent,
+					&dh->dh_bp, dh);
+		if (dh->dh_fail_sparse) {
+			if (dh->dh_err == 0 &&
+			    dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
+				dh->dh_err = SET_ERROR(ENOENT);
+			if (dh->dh_err) {
+				if (dh->dh_parent)
+					dbuf_rele(dh->dh_parent, NULL);
+				return (dh->dh_err);
+			}
+		}
+		if (dh->dh_err && dh->dh_err != ENOENT)
+			return (dh->dh_err);
+		dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
+					dh->dh_parent, dh->dh_bp);
+	}
+
+	if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
+		arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
+		if (dh->dh_db->db_buf->b_data == NULL) {
+			dbuf_clear(dh->dh_db);
+			if (dh->dh_parent) {
+				dbuf_rele(dh->dh_parent, NULL);
+				dh->dh_parent = NULL;
+			}
+			goto top;
+		}
+		ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
+	}
+
+	ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
+
+	/*
+	 * If this buffer is currently syncing out, and we are are
+	 * still referencing it from db_data, we need to make a copy
+	 * of it in case we decide we want to dirty it again in this txg.
+	 */
+	if (dh->dh_db->db_level == 0 &&
+	    dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
+	    dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
+	    dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
+		dh->dh_dr = dh->dh_db->db_data_pending;
+
+		if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
+			dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
+
+			dbuf_set_data(dh->dh_db,
+			    arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
+			    dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
+			bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
+			    dh->dh_db->db.db_data, dh->dh_db->db.db_size);
+		}
+	}
+
+	(void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
+	DBUF_VERIFY(dh->dh_db);
+	mutex_exit(&dh->dh_db->db_mtx);
+
+	/* NOTE: we can't rele the parent until after we drop the db_mtx */
+	if (dh->dh_parent)
+		dbuf_rele(dh->dh_parent, NULL);
+
+	ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
+	ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
+	ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
+	*(dh->dh_dbp) = dh->dh_db;
+
+	return (0);
+}
+
+/*
+ * The following code preserves the recursive function dbuf_hold_impl()
+ * but moves the local variables AND function arguments to the heap to
+ * minimize the stack frame size.  Enough space is initially allocated
+ * on the stack for 20 levels of recursion.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp)
+{
+	struct dbuf_hold_impl_data *dh;
+	int error;
+
+	dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) *
+	    DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
+	__dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
+
+	error = __dbuf_hold_impl(dh);
+
+	kmem_free(dh, sizeof (struct dbuf_hold_impl_data) *
+	    DBUF_HOLD_IMPL_MAX_DEPTH);
+
+	return (error);
+}
+
+static void
+__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
+    dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+    void *tag, dmu_buf_impl_t **dbp, int depth)
+{
+	dh->dh_dn = dn;
+	dh->dh_level = level;
+	dh->dh_blkid = blkid;
+	dh->dh_fail_sparse = fail_sparse;
+	dh->dh_tag = tag;
+	dh->dh_dbp = dbp;
+	dh->dh_depth = depth;
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *db;
+	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+	dmu_buf_impl_t *db;
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
+}
+
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	ASSERT(dn->dn_bonus == NULL);
+	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+
+	if (db->db_blkid != DMU_SPILL_BLKID)
+		return (SET_ERROR(ENOTSUP));
+	if (blksz == 0)
+		blksz = SPA_MINBLOCKSIZE;
+	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dbuf_new_size(db, blksz, tx);
+	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(db);
+
+	return (0);
+}
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+	VERIFY(refcount_add(&db->db_holds, tag) > 1);
+}
+
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+    void *tag)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dmu_buf_impl_t *found_db;
+	boolean_t result = B_FALSE;
+
+	if (blkid == DMU_BONUS_BLKID)
+		found_db = dbuf_find_bonus(os, obj);
+	else
+		found_db = dbuf_find(os, obj, 0, blkid);
+
+	if (found_db != NULL) {
+		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+			(void) refcount_add(&db->db_holds, tag);
+			result = B_TRUE;
+		}
+		mutex_exit(&found_db->db_mtx);
+	}
+	return (result);
+}
+
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+	mutex_enter(&db->db_mtx);
+	dbuf_rele_and_unlock(db, tag);
+}
+
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+	dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
+	int64_t holds;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	DBUF_VERIFY(db);
+
+	/*
+	 * Remove the reference to the dbuf before removing its hold on the
+	 * dnode so we can guarantee in dnode_move() that a referenced bonus
+	 * buffer has a corresponding dnode hold.
+	 */
+	holds = refcount_remove(&db->db_holds, tag);
+	ASSERT(holds >= 0);
+
+	/*
+	 * We can't freeze indirects if there is a possibility that they
+	 * may be modified in the current syncing context.
+	 */
+	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+		arc_buf_freeze(db->db_buf);
+
+	if (holds == db->db_dirtycnt &&
+	    db->db_level == 0 && db->db_user_immediate_evict)
+		dbuf_evict_user(db);
+
+	if (holds == 0) {
+		if (db->db_blkid == DMU_BONUS_BLKID) {
+			dnode_t *dn;
+			boolean_t evict_dbuf = db->db_pending_evict;
+
+			/*
+			 * If the dnode moves here, we cannot cross this
+			 * barrier until the move completes.
+			 */
+			DB_DNODE_ENTER(db);
+
+			dn = DB_DNODE(db);
+			atomic_dec_32(&dn->dn_dbufs_count);
+
+			/*
+			 * Decrementing the dbuf count means that the bonus
+			 * buffer's dnode hold is no longer discounted in
+			 * dnode_move(). The dnode cannot move until after
+			 * the dnode_rele() below.
+			 */
+			DB_DNODE_EXIT(db);
+
+			/*
+			 * Do not reference db after its lock is dropped.
+			 * Another thread may evict it.
+			 */
+			mutex_exit(&db->db_mtx);
+
+			if (evict_dbuf)
+				dnode_evict_bonus(dn);
+
+			dnode_rele(dn, db);
+		} else if (db->db_buf == NULL) {
+			/*
+			 * This is a special case: we never associated this
+			 * dbuf with any data allocated from the ARC.
+			 */
+			ASSERT(db->db_state == DB_UNCACHED ||
+			    db->db_state == DB_NOFILL);
+			dbuf_evict(db);
+		} else if (arc_released(db->db_buf)) {
+			arc_buf_t *buf = db->db_buf;
+			/*
+			 * This dbuf has anonymous data associated with it.
+			 */
+			dbuf_clear_data(db);
+			VERIFY(arc_buf_remove_ref(buf, db));
+			dbuf_evict(db);
+		} else {
+			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
+
+			/*
+			 * A dbuf will be eligible for eviction if either the
+			 * 'primarycache' property is set or a duplicate
+			 * copy of this buffer is already cached in the arc.
+			 *
+			 * In the case of the 'primarycache' a buffer
+			 * is considered for eviction if it matches the
+			 * criteria set in the property.
+			 *
+			 * To decide if our buffer is considered a
+			 * duplicate, we must call into the arc to determine
+			 * if multiple buffers are referencing the same
+			 * block on-disk. If so, then we simply evict
+			 * ourselves.
+			 */
+			if (!DBUF_IS_CACHEABLE(db)) {
+				if (db->db_blkptr != NULL &&
+				    !BP_IS_HOLE(db->db_blkptr) &&
+				    !BP_IS_EMBEDDED(db->db_blkptr)) {
+					spa_t *spa =
+					    dmu_objset_spa(db->db_objset);
+					blkptr_t bp = *db->db_blkptr;
+					dbuf_clear(db);
+					arc_freed(spa, &bp);
+				} else {
+					dbuf_clear(db);
+				}
+			} else if (db->db_pending_evict ||
+			    arc_buf_eviction_needed(db->db_buf)) {
+				dbuf_clear(db);
+			} else {
+				mutex_exit(&db->db_mtx);
+			}
+		}
+	} else {
+		mutex_exit(&db->db_mtx);
+	}
+}
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+	return (refcount_count(&db->db_holds));
+}
+
+void *
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+    dmu_buf_user_t *new_user)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	if (db->db_user == old_user)
+		db->db_user = new_user;
+	else
+		old_user = db->db_user;
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	mutex_exit(&db->db_mtx);
+
+	return (old_user);
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+	return (dmu_buf_replace_user(db_fake, NULL, user));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	db->db_user_immediate_evict = TRUE;
+	return (dmu_buf_set_user(db_fake, user));
+}
+
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+	return (dmu_buf_replace_user(db_fake, user, NULL));
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	return (db->db_user);
+}
+
+void
+dmu_buf_user_evict_wait()
+{
+	taskq_wait(dbu_evict_taskq);
+}
+
+boolean_t
+dmu_buf_freeable(dmu_buf_t *dbuf)
+{
+	boolean_t res = B_FALSE;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+
+	if (db->db_blkptr)
+		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
+		    db->db_blkptr, db->db_blkptr->blk_birth);
+
+	return (res);
+}
+
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_blkptr);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx) */
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (db->db_blkptr != NULL)
+		return;
+
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		db->db_blkptr = &dn->dn_phys->dn_spill;
+		BP_ZERO(db->db_blkptr);
+		return;
+	}
+	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+		/*
+		 * This buffer was allocated at a time when there was
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mis-match).
+		 */
+		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+		ASSERT(db->db_parent == NULL);
+		db->db_parent = dn->dn_dbuf;
+		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+		DBUF_VERIFY(db);
+	} else {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		ASSERT(dn->dn_phys->dn_nlevels > 1);
+		if (parent == NULL) {
+			mutex_exit(&db->db_mtx);
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			(void) dbuf_hold_impl(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FALSE, db, &parent);
+			rw_exit(&dn->dn_struct_rwlock);
+			mutex_enter(&db->db_mtx);
+			db->db_parent = parent;
+		}
+		db->db_blkptr = (blkptr_t *)parent->db.db_data +
+		    (db->db_blkid & ((1ULL << epbs) - 1));
+		DBUF_VERIFY(db);
+	}
+}
+
+/*
+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical the we not allow the compiler to inline this function in to
+ * dbuf_sync_list() thereby drastically bloating the stack usage.
+ */
+noinline static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn;
+	zio_t *zio;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+
+	ASSERT(db->db_level > 0);
+	DBUF_VERIFY(db);
+
+	/* Read the block if it hasn't been read yet. */
+	if (db->db_buf == NULL) {
+		mutex_exit(&db->db_mtx);
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+		mutex_enter(&db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	ASSERT(db->db_buf != NULL);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
+	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+	dbuf_check_blkptr(dn, db);
+	DB_DNODE_EXIT(db);
+
+	/* Provide the pending dirty record to child dbufs */
+	db->db_data_pending = dr;
+
+	mutex_exit(&db->db_mtx);
+	dbuf_write(dr, db->db_buf, tx);
+
+	zio = dr->dr_zio;
+	mutex_enter(&dr->dt.di.dr_mtx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
+	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+	mutex_exit(&dr->dt.di.dr_mtx);
+	zio_nowait(zio);
+}
+
+/*
+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical the we not allow the compiler to inline this function in to
+ * dbuf_sync_list() thereby drastically bloating the stack usage.
+ */
+noinline static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+	arc_buf_t **datap = &dr->dt.dl.dr_data;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn;
+	objset_t *os;
+	uint64_t txg = tx->tx_txg;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * To be synced, we must be dirtied.  But we
+	 * might have been freed after the dirty.
+	 */
+	if (db->db_state == DB_UNCACHED) {
+		/* This buffer has been freed since it was dirtied */
+		ASSERT(db->db.db_data == NULL);
+	} else if (db->db_state == DB_FILL) {
+		/* This buffer was freed and is now being re-filled */
+		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+	} else {
+		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+	}
+	DBUF_VERIFY(db);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	/*
+	 * If this is a bonus buffer, simply copy the bonus data into the
+	 * dnode.  It will be written out when the dnode is synced (and it
+	 * will be synced, since it must have been dirty for dbuf_sync to
+	 * be called).
+	 */
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		dbuf_dirty_record_t **drp;
+
+		ASSERT(*datap != NULL);
+		ASSERT0(db->db_level);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		DB_DNODE_EXIT(db);
+
+		if (*datap != db->db.db_data) {
+			zio_buf_free(*datap, DN_MAX_BONUSLEN);
+			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+		}
+		db->db_data_pending = NULL;
+		drp = &db->db_last_dirty;
+		while (*drp != dr)
+			drp = &(*drp)->dr_next;
+		ASSERT(dr->dr_next == NULL);
+		ASSERT(dr->dr_dbuf == db);
+		*drp = dr->dr_next;
+		if (dr->dr_dbuf->db_level != 0) {
+			mutex_destroy(&dr->dt.di.dr_mtx);
+			list_destroy(&dr->dt.di.dr_children);
+		}
+		kmem_free(dr, sizeof (dbuf_dirty_record_t));
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+		return;
+	}
+
+	os = dn->dn_objset;
+
+	/*
+	 * This function may have dropped the db_mtx lock allowing a dmu_sync
+	 * operation to sneak in. As a result, we need to ensure that we
+	 * don't check the dr_override_state until we have returned from
+	 * dbuf_check_blkptr.
+	 */
+	dbuf_check_blkptr(dn, db);
+
+	/*
+	 * If this buffer is in the middle of an immediate write,
+	 * wait for the synchronous IO to complete.
+	 */
+	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+		cv_wait(&db->db_changed, &db->db_mtx);
+		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+	}
+
+	if (db->db_state != DB_NOFILL &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
+	    refcount_count(&db->db_holds) > 1 &&
+	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+	    *datap == db->db_buf) {
+		/*
+		 * If this buffer is currently "in use" (i.e., there
+		 * are active holds and db_data still references it),
+		 * then make a copy before we start the write so that
+		 * any modifications from the open txg will not leak
+		 * into this write.
+		 *
+		 * NOTE: this copy does not need to be made for
+		 * objects only modified in the syncing context (e.g.
+		 * DNONE_DNODE blocks).
+		 */
+		int blksz = arc_buf_size(*datap);
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+		bcopy(db->db.db_data, (*datap)->b_data, blksz);
+	}
+	db->db_data_pending = dr;
+
+	mutex_exit(&db->db_mtx);
+
+	dbuf_write(dr, *datap, tx);
+
+	ASSERT(!list_link_active(&dr->dr_dirty_node));
+	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+		DB_DNODE_EXIT(db);
+	} else {
+		/*
+		 * Although zio_nowait() does not "wait for an IO", it does
+		 * initiate the IO. If this is an empty write it seems plausible
+		 * that the IO could actually be completed before the nowait
+		 * returns. We need to DB_DNODE_EXIT() first in case
+		 * zio_nowait() invalidates the dbuf.
+		 */
+		DB_DNODE_EXIT(db);
+		zio_nowait(dr->dr_zio);
+	}
+}
+
+void
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list))) {
+		if (dr->dr_zio != NULL) {
+			/*
+			 * If we find an already initialized zio then we
+			 * are processing the meta-dnode, and we have finished.
+			 * The dbufs for all dnodes are put back on the list
+			 * during processing, so that we can zio_wait()
+			 * these IOs after initiating all child IOs.
+			 */
+			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+			    DMU_META_DNODE_OBJECT);
+			break;
+		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
+		list_remove(list, dr);
+		if (dr->dr_dbuf->db_level > 0)
+			dbuf_sync_indirect(dr, tx);
+		else
+			dbuf_sync_leaf(dr, tx);
+	}
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	spa_t *spa = zio->io_spa;
+	int64_t delta;
+	uint64_t fill = 0;
+	int i;
+
+	ASSERT3P(db->db_blkptr, ==, bp);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+	zio->io_prev_space_delta = delta;
+
+	if (bp->blk_birth != 0) {
+		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_type) ||
+		    (db->db_blkid == DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+		    BP_IS_EMBEDDED(bp));
+		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+	}
+
+	mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		    db->db_blkptr == &dn->dn_phys->dn_spill);
+	}
+#endif
+
+	if (db->db_level == 0) {
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    db->db_blkid != DMU_SPILL_BLKID)
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		mutex_exit(&dn->dn_mtx);
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			dnode_phys_t *dnp = db->db.db_data;
+			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
+			    i--, dnp++) {
+				if (dnp->dn_type != DMU_OT_NONE)
+					fill++;
+			}
+		} else {
+			if (BP_IS_HOLE(bp)) {
+				fill = 0;
+			} else {
+				fill = 1;
+			}
+		}
+	} else {
+		blkptr_t *ibp = db->db.db_data;
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+			if (BP_IS_HOLE(ibp))
+				continue;
+			fill += BP_GET_FILL(ibp);
+		}
+	}
+	DB_DNODE_EXIT(db);
+
+	if (!BP_IS_EMBEDDED(bp))
+		bp->blk_fill = fill;
+
+	mutex_exit(&db->db_mtx);
+}
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times).  This
+ * allows the DMU to monitor the progress of each logical i/o.  For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block.  There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	dmu_buf_impl_t *db = arg;
+	objset_t *os = db->db_objset;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	dbuf_dirty_record_t *dr;
+	int delta = 0;
+
+	dr = db->db_data_pending;
+	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+	/*
+	 * The callback will be called io_phys_children times.  Retire one
+	 * portion of our dirty space each time we are called.  Any rounding
+	 * error will be cleaned up by dsl_pool_sync()'s call to
+	 * dsl_pool_undirty_space().
+	 */
+	delta = dr->dr_accounted / zio->io_phys_children;
+	dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	blkptr_t *bp = db->db_blkptr;
+	objset_t *os = db->db_objset;
+	dmu_tx_t *tx = os->os_synctx;
+	dbuf_dirty_record_t **drp, *dr;
+
+	ASSERT0(zio->io_error);
+	ASSERT(db->db_blkptr == bp);
+
+	/*
+	 * For nopwrites and rewrites we ensure that the bp matches our
+	 * original and bypass all the accounting.
+	 */
+	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+		ASSERT(BP_EQUAL(bp, bp_orig));
+	} else {
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+		dsl_dataset_block_born(ds, bp, tx);
+	}
+
+	mutex_enter(&db->db_mtx);
+
+	DBUF_VERIFY(db);
+
+	drp = &db->db_last_dirty;
+	while ((dr = *drp) != db->db_data_pending)
+		drp = &dr->dr_next;
+	ASSERT(!list_link_active(&dr->dr_dirty_node));
+	ASSERT(dr->dr_dbuf == db);
+	ASSERT(dr->dr_next == NULL);
+	*drp = dr->dr_next;
+
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		    db->db_blkptr == &dn->dn_phys->dn_spill);
+		DB_DNODE_EXIT(db);
+	}
+#endif
+
+	if (db->db_level == 0) {
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+		if (db->db_state != DB_NOFILL) {
+			if (dr->dt.dl.dr_data != db->db_buf)
+				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
+				    db));
+			else if (!arc_released(db->db_buf))
+				arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		}
+	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
+			    SPA_BLKPTRSHIFT);
+			ASSERT3U(db->db_blkid, <=,
+			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+			if (!arc_released(db->db_buf))
+				arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		}
+		DB_DNODE_EXIT(db);
+		mutex_destroy(&dr->dt.di.dr_mtx);
+		list_destroy(&dr->dt.di.dr_children);
+	}
+	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+	cv_broadcast(&db->db_changed);
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+	db->db_data_pending = NULL;
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+	dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+	dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+	dbuf_dirty_record_t *dr = zio->io_private;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+	dbuf_dirty_record_t *dr = zio->io_private;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+	mutex_enter(&db->db_mtx);
+	if (!BP_EQUAL(zio->io_bp, obp)) {
+		if (!BP_IS_HOLE(obp))
+			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+		arc_release(dr->dt.dl.dr_data, db);
+	}
+	mutex_exit(&db->db_mtx);
+
+	dbuf_write_done(zio, NULL, db);
+}
+
+/* Issue I/O to commit a dirty buffer to disk. */
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn;
+	objset_t *os;
+	dmu_buf_impl_t *parent = db->db_parent;
+	uint64_t txg = tx->tx_txg;
+	zbookmark_phys_t zb;
+	zio_prop_t zp;
+	zio_t *zio;
+	int wp_flag = 0;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	os = dn->dn_objset;
+
+	if (db->db_state != DB_NOFILL) {
+		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+			/*
+			 * Private object buffers are released here rather
+			 * than in dbuf_dirty() since they are only modified
+			 * in the syncing context and we don't want the
+			 * overhead of making multiple copies of the data.
+			 */
+			if (BP_IS_HOLE(db->db_blkptr)) {
+				arc_buf_thaw(data);
+			} else {
+				dbuf_release_bp(db);
+			}
+		}
+	}
+
+	if (parent != dn->dn_dbuf) {
+		/* Our parent is an indirect block. */
+		/* We have a dirty parent that has been scheduled for write. */
+		ASSERT(parent && parent->db_data_pending);
+		/* Our parent's buffer is one level closer to the dnode. */
+		ASSERT(db->db_level == parent->db_level-1);
+		/*
+		 * We're about to modify our parent's db_data by modifying
+		 * our block pointer, so the parent must be released.
+		 */
+		ASSERT(arc_released(parent->db_buf));
+		zio = parent->db_data_pending->dr_zio;
+	} else {
+		/* Our parent is the dnode itself. */
+		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+		    db->db_blkid != DMU_SPILL_BLKID) ||
+		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+		if (db->db_blkid != DMU_SPILL_BLKID)
+			ASSERT3P(db->db_blkptr, ==,
+			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		zio = dn->dn_zio;
+	}
+
+	ASSERT(db->db_level == 0 || data == db->db_buf);
+	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+	ASSERT(zio);
+
+	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	if (db->db_blkid == DMU_SPILL_BLKID)
+		wp_flag = WP_SPILL;
+	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+	DB_DNODE_EXIT(db);
+
+	if (db->db_level == 0 &&
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * The BP for this block has been provided by open context
+		 * (by dmu_sync() or dmu_buf_write_embedded()).
+		 */
+		void *contents = (data != NULL) ? data->b_data : NULL;
+
+		dr->dr_zio = zio_write(zio, os->os_spa, txg,
+		    db->db_blkptr, contents, db->db.db_size, &zp,
+		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
+		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		mutex_enter(&db->db_mtx);
+		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+		mutex_exit(&db->db_mtx);
+	} else if (db->db_state == DB_NOFILL) {
+		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+		dr->dr_zio = zio_write(zio, os->os_spa, txg,
+		    db->db_blkptr, NULL, db->db.db_size, &zp,
+		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE,
+		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+	} else {
+		ASSERT(arc_released(data));
+		dr->dr_zio = arc_write(zio, os->os_spa, txg,
+		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
+		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
+		    dbuf_write_physdone, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	}
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dbuf_find);
+EXPORT_SYMBOL(dbuf_is_metadata);
+EXPORT_SYMBOL(dbuf_evict);
+EXPORT_SYMBOL(dbuf_loan_arcbuf);
+EXPORT_SYMBOL(dbuf_whichblock);
+EXPORT_SYMBOL(dbuf_read);
+EXPORT_SYMBOL(dbuf_unoverride);
+EXPORT_SYMBOL(dbuf_free_range);
+EXPORT_SYMBOL(dbuf_new_size);
+EXPORT_SYMBOL(dbuf_release_bp);
+EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_not_fill);
+EXPORT_SYMBOL(dmu_buf_will_fill);
+EXPORT_SYMBOL(dmu_buf_fill_done);
+EXPORT_SYMBOL(dmu_buf_rele);
+EXPORT_SYMBOL(dbuf_assign_arcbuf);
+EXPORT_SYMBOL(dbuf_clear);
+EXPORT_SYMBOL(dbuf_prefetch);
+EXPORT_SYMBOL(dbuf_hold_impl);
+EXPORT_SYMBOL(dbuf_hold);
+EXPORT_SYMBOL(dbuf_hold_level);
+EXPORT_SYMBOL(dbuf_create_bonus);
+EXPORT_SYMBOL(dbuf_spill_set_blksz);
+EXPORT_SYMBOL(dbuf_rm_spill);
+EXPORT_SYMBOL(dbuf_add_ref);
+EXPORT_SYMBOL(dbuf_rele);
+EXPORT_SYMBOL(dbuf_rele_and_unlock);
+EXPORT_SYMBOL(dbuf_refcount);
+EXPORT_SYMBOL(dbuf_sync_list);
+EXPORT_SYMBOL(dmu_buf_set_user);
+EXPORT_SYMBOL(dmu_buf_set_user_ie);
+EXPORT_SYMBOL(dmu_buf_get_user);
+EXPORT_SYMBOL(dmu_buf_freeable);
+EXPORT_SYMBOL(dmu_buf_get_blkptr);
+#endif
diff --git a/module/zfs/dbuf.c.rej b/module/zfs/dbuf.c.rej
new file mode 100644
index 000000000000..d70ba573e845
--- /dev/null
+++ b/module/zfs/dbuf.c.rej
@@ -0,0 +1,159 @@
+--- module/zfs/dbuf.c
++++ module/zfs/dbuf.c
+@@ -1785,7 +1815,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ 	} else if (level < nlevels-1) {
+ 		/* this block is referenced from an indirect block */
+ 		int err = dbuf_hold_impl(dn, level+1,
+-		    blkid >> epbs, fail_sparse, NULL, parentp);
++		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
+ 		if (err)
+ 			return (err);
+ 		err = dbuf_read(*parentp, NULL,
+@@ -2057,35 +2172,104 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
+ 	if (dnode_block_freed(dn, blkid))
+ 		return;
+ 
+-	/* dbuf_find() returns with db_mtx held */
+-	if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) {
++	/*
++	 * This dnode hasn't been written to disk yet, so there's nothing to
++	 * prefetch.
++	 */
++	nlevels = dn->dn_phys->dn_nlevels;
++	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
++		return;
++
++	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
++	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
++		return;
++
++	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
++	    level, blkid);
++	if (db != NULL) {
++		mutex_exit(&db->db_mtx);
+ 		/*
+-		 * This dbuf is already in the cache.  We assume that
+-		 * it is already CACHED, or else about to be either
+-		 * read or filled.
++		 * This dbuf already exists.  It is either CACHED, or
++		 * (we assume) about to be read or filled.
+ 		 */
+-		mutex_exit(&db->db_mtx);
+ 		return;
+ 	}
+ 
+-	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
+-		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+-			arc_flags_t aflags =
+-			    ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+-			zbookmark_phys_t zb;
++	/*
++	 * Find the closest ancestor (indirect block) of the target block
++	 * that is present in the cache.  In this indirect block, we will
++	 * find the bp that is at curlevel, curblkid.
++	 */
++	curlevel = level;
++	curblkid = blkid;
++	while (curlevel < nlevels - 1) {
++		int parent_level = curlevel + 1;
++		uint64_t parent_blkid = curblkid >> epbs;
++		dmu_buf_impl_t *db;
++
++		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
++		    FALSE, TRUE, FTAG, &db) == 0) {
++			blkptr_t *bpp = db->db_buf->b_data;
++			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
++			dbuf_rele(db, FTAG);
++			break;
++		}
+ 
+-			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+-			    dn->dn_object, 0, blkid);
++		curlevel = parent_level;
++		curblkid = parent_blkid;
++	}
+ 
+-			(void) arc_read(NULL, dn->dn_objset->os_spa,
+-			    bp, NULL, NULL, prio,
+-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+-			    &aflags, &zb);
+-		}
+-		if (db)
+-			dbuf_rele(db, NULL);
++	if (curlevel == nlevels - 1) {
++		/* No cached indirect blocks found. */
++		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
++		bp = dn->dn_phys->dn_blkptr[curblkid];
+ 	}
++	if (BP_IS_HOLE(&bp))
++		return;
++
++	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
++
++	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
++	    ZIO_FLAG_CANFAIL);
++
++	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
++	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
++	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
++	    dn->dn_object, level, blkid);
++	dpa->dpa_curlevel = curlevel;
++	dpa->dpa_prio = prio;
++	dpa->dpa_aflags = aflags;
++	dpa->dpa_spa = dn->dn_objset->os_spa;
++	dpa->dpa_epbs = epbs;
++	dpa->dpa_zio = pio;
++
++	/*
++	 * If we have the indirect just above us, no need to do the asynchronous
++	 * prefetch chain; we'll just run the last step ourselves.  If we're at
++	 * a higher level, though, we want to issue the prefetches for all the
++	 * indirect blocks asynchronously, so we can go on with whatever we were
++	 * doing.
++	 */
++	if (curlevel == level) {
++		ASSERT3U(curblkid, ==, blkid);
++		dbuf_issue_final_prefetch(dpa, &bp);
++		kmem_free(dpa, sizeof (*dpa));
++	} else {
++		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
++		zbookmark_phys_t zb;
++
++		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
++		    dn->dn_object, curlevel, curblkid);
++		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
++		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
++		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
++		    &iter_aflags, &zb);
++	}
++	/*
++	 * We use pio here instead of dpa_zio since it's possible that
++	 * dpa may have already been freed.
++	 */
++	zio_nowait(pio);
+ }
+ 
+ /*
+@@ -2112,6 +2297,9 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ 		blkptr_t *bp = NULL;
+ 		int err;
+ 
++		if (fail_uncached)
++			return (SET_ERROR(ENOENT));
++
+ 		ASSERT3P(parent, ==, NULL);
+ 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ 		if (fail_sparse) {
+@@ -2128,6 +2316,11 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+ 		db = dbuf_create(dn, level, blkid, parent, bp);
+ 	}
+ 
++	if (fail_uncached && db->db_state != DB_CACHED) {
++		mutex_exit(&db->db_mtx);
++		return (SET_ERROR(ENOENT));
++	}
++
+ 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ 		arc_buf_add_ref(db->db_buf, db);
+ 		if (db->db_buf->b_data == NULL) {
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 5e2a1db601b4..fc8a4a3b9b1b 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -138,7 +138,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
@@ -421,7 +421,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 		if (db == NULL) {
@@ -522,17 +522,16 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 }
 
 /*
- * Issue prefetch i/os for the given blocks.
+ * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
+ * indirect blocks prefeteched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
  *
- * Note: The assumption is that we *know* these blocks will be needed
- * almost immediately.  Therefore, the prefetch i/os will be issued at
- * ZIO_PRIORITY_SYNC_READ
- *
- * Note: indirect blocks and other metadata will be read synchronously,
- * causing this function to block if they are not already cached.
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asychronously read in.
  */
 void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
@@ -548,8 +547,9 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+		blkid = dbuf_whichblock(dn, level,
+		    object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -1457,7 +1457,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
 	DB_DNODE_ENTER(dbuf);
 	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
diff --git a/module/zfs/dmu.c.orig b/module/zfs/dmu.c.orig
new file mode 100644
index 000000000000..5e2a1db601b4
--- /dev/null
+++ b/module/zfs/dmu.c.orig
@@ -0,0 +1,2182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
+	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
+	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
+	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
+	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
+	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
+	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+	{	byteswap_uint8_array,	"uint8"		},
+	{	byteswap_uint16_array,	"uint16"	},
+	{	byteswap_uint32_array,	"uint32"	},
+	{	byteswap_uint64_array,	"uint64"	},
+	{	zap_byteswap,		"zap"		},
+	{	dnode_buf_byteswap,	"dnode"		},
+	{	dmu_objset_byteswap,	"objset"	},
+	{	zfs_znode_byteswap,	"znode"		},
+	{	zfs_oldacl_byteswap,	"oldacl"	},
+	{	zfs_acl_byteswap,	"acl"		}
+};
+
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	blkid = dbuf_whichblock(dn, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid, tag);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+
+	if (db == NULL) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+		err = dbuf_read(db, NULL, db_flags);
+		if (err != 0) {
+			dbuf_rele(db, tag);
+			*dbp = NULL;
+		}
+	}
+
+	return (err);
+}
+
+int
+dmu_bonus_max(void)
+{
+	return (DN_MAX_BONUSLEN);
+}
+
+int
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (dn->dn_bonus != db) {
+		error = SET_ERROR(EINVAL);
+	} else if (newsize < 0 || newsize > db_fake->db_size) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		dnode_setbonuslen(dn, newsize, tx);
+		error = 0;
+	}
+
+	DB_DNODE_EXIT(db);
+	return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (!DMU_OT_IS_VALID(type)) {
+		error = SET_ERROR(EINVAL);
+	} else if (dn->dn_bonus != db) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		dnode_setbonus_type(dn, type, tx);
+		error = 0;
+	}
+
+	DB_DNODE_EXIT(db);
+	return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	dmu_object_type_t type;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	type = dn->dn_bonustype;
+	DB_DNODE_EXIT(db);
+
+	return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	dbuf_rm_spill(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_rm_spill(dn, tx);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+	return (error);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+	dnode_t *dn;
+	dmu_buf_impl_t *db;
+	int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error)
+		return (error);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_bonus == NULL) {
+		rw_exit(&dn->dn_struct_rwlock);
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		if (dn->dn_bonus == NULL)
+			dbuf_create_bonus(dn);
+	}
+	db = dn->dn_bonus;
+
+	/* as long as the bonus buf is held, the dnode will be held */
+	if (refcount_add(&db->db_holds, tag) == 1) {
+		VERIFY(dnode_add_ref(dn, db));
+		atomic_inc_32(&dn->dn_dbufs_count);
+	}
+
+	/*
+	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
+	 * a dnode hold for every dbuf.
+	 */
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+
+	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
+
+	*dbp = &db->db;
+	return (0);
+}
+
+/*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = NULL;
+	int err;
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_exit(&dn->dn_struct_rwlock);
+
+	ASSERT(db != NULL);
+	err = dbuf_read(db, NULL, flags);
+	if (err == 0)
+		*dbp = &db->db;
+	else
+		dbuf_rele(db, tag);
+	return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+		err = SET_ERROR(EINVAL);
+	} else {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+		if (!dn->dn_have_spill) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			err = dmu_spill_hold_by_dnode(dn,
+			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+		}
+
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	DB_DNODE_EXIT(db);
+	return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+static int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+	dmu_buf_t **dbp;
+	uint64_t blkid, nblks, i;
+	uint32_t dbuf_flags;
+	int err;
+	zio_t *zio;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
+	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+		dbuf_flags |= DB_RF_NOPREFETCH;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
+		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	} else {
+		if (offset + length > dn->dn_datablksz) {
+			zfs_panic_recover("zfs: accessing past end of object "
+			    "%llx/%llx (size=%u access=%llu+%llu)",
+			    (longlong_t)dn->dn_objset->
+			    os_dsl_dataset->ds_object,
+			    (longlong_t)dn->dn_object, dn->dn_datablksz,
+			    (longlong_t)offset, (longlong_t)length);
+			rw_exit(&dn->dn_struct_rwlock);
+			return (SET_ERROR(EIO));
+		}
+		nblks = 1;
+	}
+	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+	blkid = dbuf_whichblock(dn, offset);
+	for (i = 0; i < nblks; i++) {
+		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+		if (db == NULL) {
+			rw_exit(&dn->dn_struct_rwlock);
+			dmu_buf_rele_array(dbp, nblks, tag);
+			zio_nowait(zio);
+			return (SET_ERROR(EIO));
+		}
+		/* initiate async i/o */
+		if (read) {
+			(void) dbuf_read(db, zio, dbuf_flags);
+		}
+		dbp[i] = &db->db;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* wait for async i/o */
+	err = zio_wait(zio);
+	if (err) {
+		dmu_buf_rele_array(dbp, nblks, tag);
+		return (err);
+	}
+
+	/* wait for other io to complete */
+	if (read) {
+		for (i = 0; i < nblks; i++) {
+			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+			mutex_enter(&db->db_mtx);
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL)
+				cv_wait(&db->db_changed, &db->db_mtx);
+			if (db->db_state == DB_UNCACHED)
+				err = SET_ERROR(EIO);
+			mutex_exit(&db->db_mtx);
+			if (err) {
+				dmu_buf_rele_array(dbp, nblks, tag);
+				return (err);
+			}
+		}
+	}
+
+	*numbufsp = nblks;
+	*dbpp = dbp;
+	return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+	    numbufsp, dbpp, DMU_READ_PREFETCH);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+	int i;
+	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+	if (numbufs == 0)
+		return;
+
+	for (i = 0; i < numbufs; i++) {
+		if (dbp[i])
+			dbuf_rele(dbp[i], tag);
+	}
+
+	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately.  Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	int nblks, err;
+
+	if (zfs_prefetch_disable)
+		return;
+
+	if (len == 0) {  /* they're interested in the bonus buffer */
+		dn = DMU_META_DNODE(os);
+
+		if (object == 0 || object >= DN_MAX_OBJECT)
+			return;
+
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
+		rw_exit(&dn->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * XXX - Note, if the dnode for the requested object is not
+	 * already cached, we will do a *synchronous* read in the
+	 * dnode_hold() call.  The same is true for any indirects.
+	 */
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0)
+		return;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_datablkshift) {
+		int blkshift = dn->dn_datablkshift;
+		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
+	} else {
+		nblks = (offset < dn->dn_datablksz);
+	}
+
+	if (nblks != 0) {
+		int i;
+
+		blkid = dbuf_whichblock(dn, offset);
+		for (i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
+	}
+
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Get the next "chunk" of file data to free.  We traverse the file from
+ * the end so that the file gets shorter over time (if we crashes in the
+ * middle, this will leave us in a better state).  We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length").  On return, *start will be the first
+ * offset that should be freed.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
+{
+	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+	/* bytes of data covered by a level-1 indirect block */
+	uint64_t iblkrange =
+	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+	uint64_t blks;
+
+	ASSERT3U(minimum, <=, *start);
+
+	if (*start - minimum <= iblkrange * maxblks) {
+		*start = minimum;
+		return (0);
+	}
+	ASSERT(ISP2(iblkrange));
+
+	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+		int err;
+
+		/*
+		 * dnode_next_offset(BACKWARDS) will find an allocated L1
+		 * indirect block at or before the input offset.  We must
+		 * decrement *start so that it is at the end of the region
+		 * to search.
+		 */
+		(*start)--;
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
+
+		/* if there are no indirect blocks before start, we are done */
+		if (err == ESRCH) {
+			*start = minimum;
+			break;
+		} else if (err != 0) {
+			return (err);
+		}
+
+		/* set start to the beginning of this L1 indirect */
+		*start = P2ALIGN(*start, iblkrange);
+	}
+	if (*start < minimum)
+		*start = minimum;
+	return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+    uint64_t length)
+{
+	uint64_t object_size;
+	int err;
+
+	if (dn == NULL)
+		return (SET_ERROR(EINVAL));
+
+	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+	if (offset >= object_size)
+		return (0);
+
+	if (length == DMU_OBJECT_END || offset + length > object_size)
+		length = object_size - offset;
+
+	while (length != 0) {
+		uint64_t chunk_end, chunk_begin;
+		dmu_tx_t *tx;
+
+		chunk_end = chunk_begin = offset + length;
+
+		/* move chunk_begin backwards to the beginning of this chunk */
+		err = get_next_chunk(dn, &chunk_begin, offset);
+		if (err)
+			return (err);
+		ASSERT3U(chunk_begin, >=, offset);
+		ASSERT3U(chunk_begin, <=, chunk_end);
+
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_free(tx, dn->dn_object,
+		    chunk_begin, chunk_end - chunk_begin);
+		err = dmu_tx_assign(tx, TXG_WAIT);
+		if (err) {
+			dmu_tx_abort(tx);
+			return (err);
+		}
+		dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
+		dmu_tx_commit(tx);
+
+		length -= chunk_end - chunk_begin;
+	}
+	return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = dmu_free_long_range_impl(os, dn, offset, length);
+
+	/*
+	 * It is important to zero out the maxblkid when freeing the entire
+	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+	 * will take the fast path, and (b) dnode_reallocate() can verify
+	 * that the entire file has been freed.
+	 */
+	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+		dn->dn_maxblkid = 0;
+
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+int
+dmu_free_long_object(objset_t *os, uint64_t object)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+	if (err != 0)
+		return (err);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, object);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+		dmu_tx_commit(tx);
+	} else {
+		dmu_tx_abort(tx);
+	}
+
+	return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+	return (0);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
+{
+	dnode_t *dn;
+	dmu_buf_t **dbp;
+	int numbufs, err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	/*
+	 * Deal with odd block sizes, where there can't be data past the first
+	 * block.  If we ever do the tail block optimization, we will need to
+	 * handle that here as well.
+	 */
+	if (dn->dn_maxblkid == 0) {
+		uint64_t newsz = offset > dn->dn_datablksz ? 0 :
+		    MIN(size, dn->dn_datablksz - offset);
+		bzero((char *)buf + newsz, size - newsz);
+		size = newsz;
+	}
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		int i;
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+		 */
+		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+		    TRUE, FTAG, &numbufs, &dbp, flags);
+		if (err)
+			break;
+
+		for (i = 0; i < numbufs; i++) {
+			uint64_t tocpy;
+			int64_t bufoff;
+			dmu_buf_t *db = dbp[i];
+
+			ASSERT(size > 0);
+
+			bufoff = offset - db->db_offset;
+			tocpy = MIN(db->db_size - bufoff, size);
+
+			(void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
+
+			offset += tocpy;
+			size -= tocpy;
+			buf = (char *)buf + tocpy;
+		}
+		dmu_buf_rele_array(dbp, numbufs, FTAG);
+	}
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    const void *buf, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	if (size == 0)
+		return;
+
+	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp));
+
+	for (i = 0; i < numbufs; i++) {
+		uint64_t tocpy;
+		int64_t bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = offset - db->db_offset;
+		tocpy = MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+		buf = (char *)buf + tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+
+	if (size == 0)
+		return;
+
+	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp));
+
+	for (i = 0; i < numbufs; i++) {
+		dmu_buf_t *db = dbp[i];
+
+		dmu_buf_will_not_fill(db, tx);
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+
+	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+	VERIFY0(dmu_buf_hold_noread(os, object, offset,
+	    FTAG, &db));
+
+	dmu_buf_write_embedded(db,
+	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+	    uncompressed_size, compressed_size, byteorder, tx);
+
+	dmu_buf_rele(db, FTAG);
+}
+
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+typedef struct xuio_stats {
+	/* loaned yet not returned arc_buf */
+	kstat_named_t xuiostat_onloan_rbuf;
+	kstat_named_t xuiostat_onloan_wbuf;
+	/* whether a copy is made when loaning out a read buffer */
+	kstat_named_t xuiostat_rbuf_copied;
+	kstat_named_t xuiostat_rbuf_nocopy;
+	/* whether a copy is made when assigning a write buffer */
+	kstat_named_t xuiostat_wbuf_copied;
+	kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+	{ "onloan_read_buf",	KSTAT_DATA_UINT64 },
+	{ "onloan_write_buf",	KSTAT_DATA_UINT64 },
+	{ "read_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "read_buf_nocopy",	KSTAT_DATA_UINT64 },
+	{ "write_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "write_buf_nocopy",	KSTAT_DATA_UINT64 }
+};
+
+#define	XUIOSTAT_INCR(stat, val)        \
+	atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1)
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+	dmu_xuio_t *priv;
+	uio_t *uio = &xuio->xu_uio;
+
+	uio->uio_iovcnt = nblk;
+	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+	priv->cnt = nblk;
+	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+	priv->iovp = (iovec_t *)uio->uio_iov;
+	XUIO_XUZC_PRIV(xuio) = priv;
+
+	if (XUIO_XUZC_RW(xuio) == UIO_READ)
+		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+	else
+		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+	return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	int nblk = priv->cnt;
+
+	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+	kmem_free(priv, sizeof (dmu_xuio_t));
+
+	if (XUIO_XUZC_RW(xuio) == UIO_READ)
+		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+	else
+		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+	struct iovec *iov;
+	uio_t *uio = &xuio->xu_uio;
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	int i = priv->next++;
+
+	ASSERT(i < priv->cnt);
+	ASSERT(off + n <= arc_buf_size(abuf));
+	iov = (iovec_t *)uio->uio_iov + i;
+	iov->iov_base = (char *)abuf->b_data + off;
+	iov->iov_len = n;
+	priv->bufs[i] = abuf;
+	return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+	ASSERT(i < priv->cnt);
+	return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+	ASSERT(i < priv->cnt);
+	priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (xuio_ksp != NULL) {
+		xuio_ksp->ks_data = &xuio_stats;
+		kstat_install(xuio_ksp);
+	}
+}
+
+static void
+xuio_stat_fini(void)
+{
+	if (xuio_ksp != NULL) {
+		kstat_delete(xuio_ksp);
+		xuio_ksp = NULL;
+	}
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
+#ifdef _KERNEL
+
+/*
+ * Copy up to size bytes between arg_buf and req based on the data direction
+ * described by the req.  If an entire req's data cannot be transfered in one
+ * pass, you should pass in @req_offset to indicate where to continue. The
+ * return value is the number of bytes successfully copied to arg_buf.
+ */
+static int
+dmu_bio_copy(void *arg_buf, int size, struct bio *bio, size_t bio_offset)
+{
+	struct bio_vec bv, *bvp = &bv;
+	bvec_iterator_t iter;
+	char *bv_buf;
+	int tocpy, bv_len, bv_offset;
+	int offset = 0;
+
+	bio_for_each_segment4(bv, bvp, bio, iter) {
+
+		/*
+		 * Fully consumed the passed arg_buf. We use goto here because
+		 * rq_for_each_segment is a double loop
+		 */
+		ASSERT3S(offset, <=, size);
+		if (size == offset)
+			goto out;
+
+		/* Skip already copied bvp */
+		if (bio_offset >= bvp->bv_len) {
+			bio_offset -= bvp->bv_len;
+			continue;
+		}
+
+		bv_len = bvp->bv_len - bio_offset;
+		bv_offset = bvp->bv_offset + bio_offset;
+		bio_offset = 0;
+
+		tocpy = MIN(bv_len, size - offset);
+		ASSERT3S(tocpy, >=, 0);
+
+		bv_buf = page_address(bvp->bv_page) + bv_offset;
+		ASSERT3P(bv_buf, !=, NULL);
+
+		if (bio_data_dir(bio) == WRITE)
+			memcpy(arg_buf + offset, bv_buf, tocpy);
+		else
+			memcpy(bv_buf, arg_buf + offset, tocpy);
+
+		offset += tocpy;
+	}
+out:
+	return (offset);
+}
+
+int
+dmu_read_bio(objset_t *os, uint64_t object, struct bio *bio)
+{
+	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	dmu_buf_t **dbp;
+	int numbufs, i, err;
+	size_t bio_offset;
+
+	/*
+	 * NB: we could do this block-at-a-time, but it's nice
+	 * to be reading in parallel.
+	 */
+	err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG,
+	    &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	bio_offset = 0;
+	for (i = 0; i < numbufs; i++) {
+		uint64_t tocpy;
+		int64_t bufoff;
+		int didcpy;
+		dmu_buf_t *db = dbp[i];
+
+		bufoff = offset - db->db_offset;
+		ASSERT3S(bufoff, >=, 0);
+
+		tocpy = MIN(db->db_size - bufoff, size);
+		if (tocpy == 0)
+			break;
+
+		didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+		    bio_offset);
+
+		if (didcpy < tocpy)
+			err = EIO;
+
+		if (err)
+			break;
+
+		size -= tocpy;
+		offset += didcpy;
+		bio_offset += didcpy;
+		err = 0;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+	return (err);
+}
+
+int
+dmu_write_bio(objset_t *os, uint64_t object, struct bio *bio, dmu_tx_t *tx)
+{
+	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	dmu_buf_t **dbp;
+	int numbufs, i, err;
+	size_t bio_offset;
+
+	if (size == 0)
+		return (0);
+
+	err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
+	    &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	bio_offset = 0;
+	for (i = 0; i < numbufs; i++) {
+		uint64_t tocpy;
+		int64_t bufoff;
+		int didcpy;
+		dmu_buf_t *db = dbp[i];
+
+		bufoff = offset - db->db_offset;
+		ASSERT3S(bufoff, >=, 0);
+
+		tocpy = MIN(db->db_size - bufoff, size);
+		if (tocpy == 0)
+			break;
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		didcpy = dmu_bio_copy(db->db_data + bufoff, tocpy, bio,
+		    bio_offset);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		if (didcpy < tocpy)
+			err = EIO;
+
+		if (err)
+			break;
+
+		size -= tocpy;
+		offset += didcpy;
+		bio_offset += didcpy;
+		err = 0;
+	}
+
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
+
+static int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i, err;
+	xuio_t *xuio = NULL;
+
+	/*
+	 * NB: we could do this block-at-a-time, but it's nice
+	 * to be reading in parallel.
+	 */
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	    TRUE, FTAG, &numbufs, &dbp, 0);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		uint64_t tocpy;
+		int64_t bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = uio->uio_loffset - db->db_offset;
+		tocpy = MIN(db->db_size - bufoff, size);
+
+		if (xuio) {
+			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+			arc_buf_t *dbuf_abuf = dbi->db_buf;
+			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+			if (!err) {
+				uio->uio_resid -= tocpy;
+				uio->uio_loffset += tocpy;
+			}
+
+			if (abuf == dbuf_abuf)
+				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+			else
+				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+		} else {
+			err = uiomove((char *)db->db_data + bufoff, tocpy,
+			    UIO_READ, uio);
+		}
+		if (err)
+			break;
+
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+	return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_read_uio_dnode(dn, uio, size);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_read_uio_dnode(dn, uio, size);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+static int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs;
+	int err = 0;
+	int i;
+
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		uint64_t tocpy;
+		int64_t bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
+
+		bufoff = uio->uio_loffset - db->db_offset;
+		tocpy = MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		/*
+		 * XXX uiomove could block forever (eg.nfs-backed
+		 * pages).  There needs to be a uiolockdown() function
+		 * to lock the pages in memory, so that uiomove won't
+		 * block.
+		 */
+		err = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_WRITE, uio);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		if (err)
+			break;
+
+		size -= tocpy;
+	}
+
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_write_uio_dnode(dn, uio, size, tx);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+#endif /* _KERNEL */
+
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+
+	return (arc_loan_buf(db->db_objset->os_spa, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+	arc_return_buf(buf, FTAG);
+	VERIFY(arc_buf_remove_ref(buf, FTAG));
+}
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+	dnode_t *dn;
+	dmu_buf_impl_t *db;
+	uint32_t blksz = (uint32_t)arc_buf_size(buf);
+	uint64_t blkid;
+
+	DB_DNODE_ENTER(dbuf);
+	dn = DB_DNODE(dbuf);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	blkid = dbuf_whichblock(dn, offset);
+	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
+	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(dbuf);
+
+	/*
+	 * We can only assign if the offset is aligned, the arc buf is the
+	 * same size as the dbuf, and the dbuf is not metadata.  It
+	 * can't be metadata because the loaned arc buf comes from the
+	 * user-data kmem area.
+	 */
+	if (offset == db->db.db_offset && blksz == db->db.db_size &&
+	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+		dbuf_assign_arcbuf(db, buf, tx);
+		dbuf_rele(db, FTAG);
+	} else {
+		objset_t *os;
+		uint64_t object;
+
+		DB_DNODE_ENTER(dbuf);
+		dn = DB_DNODE(dbuf);
+		os = dn->dn_objset;
+		object = dn->dn_object;
+		DB_DNODE_EXIT(dbuf);
+
+		dbuf_rele(db, FTAG);
+		dmu_write(os, object, offset, blksz, buf->b_data, tx);
+		dmu_return_arcbuf(buf);
+		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+	}
+}
+
+typedef struct {
+	dbuf_dirty_record_t	*dsa_dr;
+	dmu_sync_cb_t		*dsa_done;
+	zgd_t			*dsa_zgd;
+	dmu_tx_t		*dsa_tx;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+	dmu_sync_arg_t *dsa = varg;
+	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio->io_error == 0) {
+		if (BP_IS_HOLE(bp)) {
+			/*
+			 * A block of zeros may compress to a hole, but the
+			 * block size still needs to be known for replay.
+			 */
+			BP_SET_LSIZE(bp, db->db_size);
+		} else if (!BP_IS_EMBEDDED(bp)) {
+			ASSERT(BP_GET_LEVEL(bp) == 0);
+			bp->blk_fill = 1;
+		}
+	}
+}
+
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+	dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+	dmu_sync_arg_t *dsa = varg;
+	dbuf_dirty_record_t *dr = dsa->dsa_dr;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	mutex_enter(&db->db_mtx);
+	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+	if (zio->io_error == 0) {
+		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+		if (dr->dt.dl.dr_nopwrite) {
+			ASSERTV(blkptr_t *bp = zio->io_bp);
+			ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+			ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig));
+
+			ASSERT(BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+			ASSERT(zio_checksum_table[chksum].ci_dedup);
+		}
+		dr->dt.dl.dr_overridden_by = *zio->io_bp;
+		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+
+		/*
+		 * Old style holes are filled with all zeros, whereas
+		 * new-style holes maintain their lsize, type, level,
+		 * and birth time (see zio_write_compress). While we
+		 * need to reset the BP_SET_LSIZE() call that happened
+		 * in dmu_sync_ready for old style holes, we do *not*
+		 * want to wipe out the information contained in new
+		 * style holes. Thus, only zero out the block pointer if
+		 * it's an old style hole.
+		 */
+		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
+			BP_ZERO(&dr->dt.dl.dr_overridden_by);
+	} else {
+		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	}
+	cv_broadcast(&db->db_changed);
+	mutex_exit(&db->db_mtx);
+
+	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+	kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	dmu_sync_arg_t *dsa = zio->io_private;
+	ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
+
+	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+		/*
+		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+		 * then there is nothing to do here. Otherwise, free the
+		 * newly allocated block in this txg.
+		 */
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+			ASSERT(BP_EQUAL(bp, bp_orig));
+		} else {
+			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		}
+	}
+
+	dmu_tx_commit(dsa->dsa_tx);
+
+	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+	kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+    zio_prop_t *zp, zbookmark_phys_t *zb)
+{
+	dmu_sync_arg_t *dsa;
+	dmu_tx_t *tx;
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+		dmu_tx_abort(tx);
+		/* Make zl_get_data do txg_waited_synced() */
+		return (SET_ERROR(EIO));
+	}
+
+	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	dsa->dsa_dr = NULL;
+	dsa->dsa_done = done;
+	dsa->dsa_zgd = zgd;
+	dsa->dsa_tx = tx;
+
+	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL|ZIO_FLAG_FASTWRITE, zb));
+
+	return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ *	EEXIST: this txg has already been synced, so there's nothing to do.
+ *		The caller should not log the write.
+ *
+ *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ *		The caller should not log the write.
+ *
+ *	EALREADY: this block is already in the process of being synced.
+ *		The caller should track its progress (somehow).
+ *
+ *	EIO: could not do the I/O.
+ *		The caller should do a txg_wait_synced().
+ *
+ *	0: the I/O has been initiated.
+ *		The caller should log this blkptr in the done callback.
+ *		It is possible that the I/O will fail, in which case
+ *		the error will be reported to the done callback and
+ *		propagated to pio from zio_done().
+ */
+int
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
+{
+	blkptr_t *bp = zgd->zgd_bp;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+	objset_t *os = db->db_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	dbuf_dirty_record_t *dr;
+	dmu_sync_arg_t *dsa;
+	zbookmark_phys_t zb;
+	zio_prop_t zp;
+	dnode_t *dn;
+
+	ASSERT(pio != NULL);
+	ASSERT(txg != 0);
+
+	SET_BOOKMARK(&zb, ds->ds_object,
+	    db->db.db_object, db->db_level, db->db_blkid);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_EXIT(db);
+
+	/*
+	 * If we're frozen (running ziltest), we always need to generate a bp.
+	 */
+	if (txg > spa_freeze_txg(os->os_spa))
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+
+	/*
+	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+	 * and us.  If we determine that this txg is not yet syncing,
+	 * but it begins to sync a moment later, that's OK because the
+	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
+	 */
+	mutex_enter(&db->db_mtx);
+
+	if (txg <= spa_last_synced_txg(os->os_spa)) {
+		/*
+		 * This txg has already synced.  There's nothing to do.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(EEXIST));
+	}
+
+	if (txg <= spa_syncing_txg(os->os_spa)) {
+		/*
+		 * This txg is currently syncing, so we can't mess with
+		 * the dirty record anymore; just write a new log block.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+	}
+
+	dr = db->db_last_dirty;
+	while (dr && dr->dr_txg != txg)
+		dr = dr->dr_next;
+
+	if (dr == NULL) {
+		/*
+		 * There's no dr for this dbuf, so it must have been freed.
+		 * There's no need to log writes to freed blocks, so we're done.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(ENOENT));
+	}
+
+	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+	/*
+	 * Assume the on-disk data is X, the current syncing data (in
+	 * txg - 1) is Y, and the current in-memory data is Z (currently
+	 * in dmu_sync).
+	 *
+	 * We usually want to perform a nopwrite if X and Z are the
+	 * same.  However, if Y is different (i.e. the BP is going to
+	 * change before this write takes effect), then a nopwrite will
+	 * be incorrect - we would override with X, which could have
+	 * been freed when Y was written.
+	 *
+	 * (Note that this is not a concern when we are nop-writing from
+	 * syncing context, because X and Y must be identical, because
+	 * all previous txgs have been synced.)
+	 *
+	 * Therefore, we disable nopwrite if the current BP could change
+	 * before this TXG.  There are two ways it could change: by
+	 * being dirty (dr_next is non-NULL), or by being freed
+	 * (dnode_block_freed()).  This behavior is verified by
+	 * zio_done(), which VERIFYs that the override BP is identical
+	 * to the on-disk BP.
+	 */
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+		zp.zp_nopwrite = B_FALSE;
+	DB_DNODE_EXIT(db);
+
+	ASSERT(dr->dr_txg == txg);
+	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * We have already issued a sync write for this buffer,
+		 * or this buffer has already been synced.  It could not
+		 * have been dirtied since, or we would have cleared the state.
+		 */
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(EALREADY));
+	}
+
+	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+	mutex_exit(&db->db_mtx);
+
+	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	dsa->dsa_dr = dr;
+	dsa->dsa_done = done;
+	dsa->dsa_zgd = zgd;
+	dsa->dsa_tx = NULL;
+
+	zio_nowait(arc_write(pio, os->os_spa, txg,
+	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+	    NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL, &zb));
+
+	return (0);
+}
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+	dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	err = dnode_set_blksz(dn, size, ibs, tx);
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+	dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/*
+	 * Send streams include each object's checksum function.  This
+	 * check ensures that the receiving system can understand the
+	 * checksum function transmitted.
+	 */
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
+	dn->dn_checksum = checksum;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+	dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	/*
+	 * Send streams include each object's compression function.  This
+	 * check ensures that the receiving system can understand the
+	 * compression function transmitted.
+	 */
+	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
+	dn->dn_compress = compress;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+}
+
+int zfs_mdcomp_disable = 0;
+
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+	    (wp & WP_SPILL));
+	enum zio_checksum checksum = os->os_checksum;
+	enum zio_compress compress = os->os_compress;
+	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+	boolean_t dedup = B_FALSE;
+	boolean_t nopwrite = B_FALSE;
+	boolean_t dedup_verify = os->os_dedup_verify;
+	int copies = os->os_copies;
+
+	/*
+	 * We maintain different write policies for each of the following
+	 * types of data:
+	 *	 1. metadata
+	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+	 *	 3. all other level 0 blocks
+	 */
+	if (ismd) {
+		if (zfs_mdcomp_disable) {
+			compress = ZIO_COMPRESS_EMPTY;
+		} else {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			compress = zio_compress_select(os->os_spa,
+			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+		}
+
+		/*
+		 * Metadata always gets checksummed.  If the data
+		 * checksum is multi-bit correctable, and it's not a
+		 * ZBT-style checksum, then it's suitable for metadata
+		 * as well.  Otherwise, the metadata checksum defaults
+		 * to fletcher4.
+		 */
+		if (zio_checksum_table[checksum].ci_correctable < 1 ||
+		    zio_checksum_table[checksum].ci_eck)
+			checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+		    (os->os_redundant_metadata ==
+		    ZFS_REDUNDANT_METADATA_MOST &&
+		    (level >= zfs_redundant_metadata_most_ditto_level ||
+		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+			copies++;
+	} else if (wp & WP_NOFILL) {
+		ASSERT(level == 0);
+
+		/*
+		 * If we're writing preallocated blocks, we aren't actually
+		 * writing them so don't set any policy properties.  These
+		 * blocks are currently only used by an external subsystem
+		 * outside of zfs (i.e. dump) and not written by the zio
+		 * pipeline.
+		 */
+		compress = ZIO_COMPRESS_OFF;
+		checksum = ZIO_CHECKSUM_OFF;
+	} else {
+		compress = zio_compress_select(os->os_spa, dn->dn_compress,
+		    compress);
+
+		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+		    zio_checksum_select(dn->dn_checksum, checksum) :
+		    dedup_checksum;
+
+		/*
+		 * Determine dedup setting.  If we are in dmu_sync(),
+		 * we won't actually dedup now because that's all
+		 * done in syncing context; but we do want to use the
+		 * dedup checkum.  If the checksum is not strong
+		 * enough to ensure unique signatures, force
+		 * dedup_verify.
+		 */
+		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+			if (!zio_checksum_table[checksum].ci_dedup)
+				dedup_verify = B_TRUE;
+		}
+
+		/*
+		 * Enable nopwrite if we have a cryptographically secure
+		 * checksum that has no known collisions (i.e. SHA-256)
+		 * and compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually exclusive.
+		 */
+		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+	}
+
+	zp->zp_checksum = checksum;
+	zp->zp_compress = compress;
+	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+	zp->zp_level = level;
+	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+	zp->zp_dedup = dedup;
+	zp->zp_dedup_verify = dedup && dedup_verify;
+	zp->zp_nopwrite = nopwrite;
+}
+
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+	dnode_t *dn;
+	int i, err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	/*
+	 * Sync any current changes before
+	 * we go trundling through the block pointers.
+	 */
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (list_link_active(&dn->dn_dirty_link[i]))
+			break;
+	}
+	if (i != TXG_SIZE) {
+		dnode_rele(dn, FTAG);
+		txg_wait_synced(dmu_objset_pool(os), 0);
+		err = dnode_hold(os, object, FTAG, &dn);
+		if (err)
+			return (err);
+	}
+
+	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+void
+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+	int i;
+
+	doi->doi_data_block_size = dn->dn_datablksz;
+	doi->doi_metadata_block_size = dn->dn_indblkshift ?
+	    1ULL << dn->dn_indblkshift : 0;
+	doi->doi_type = dn->dn_type;
+	doi->doi_bonus_type = dn->dn_bonustype;
+	doi->doi_bonus_size = dn->dn_bonuslen;
+	doi->doi_indirection = dn->dn_nlevels;
+	doi->doi_checksum = dn->dn_checksum;
+	doi->doi_compress = dn->dn_compress;
+	doi->doi_nblkptr = dn->dn_nblkptr;
+	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+	doi->doi_fill_count = 0;
+	for (i = 0; i < dnp->dn_nblkptr; i++)
+		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	mutex_enter(&dn->dn_mtx);
+
+	__dmu_object_info_from_dnode(dn, doi);
+
+	mutex_exit(&dn->dn_mtx);
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, object, FTAG, &dn);
+
+	if (err)
+		return (err);
+
+	if (doi != NULL)
+		dmu_object_info_from_dnode(dn, doi);
+
+	dnode_rele(dn, FTAG);
+	return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	DB_DNODE_ENTER(db);
+	dmu_object_info_from_dnode(DB_DNODE(db), doi);
+	DB_DNODE_EXIT(db);
+}
+
+/*
+ * Faster still when you only care about the size.
+ * This is specifically optimized for zfs_getattr().
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+    u_longlong_t *nblk512)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	*blksize = dn->dn_datablksz;
+	/* add 1 for dnode space */
+	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+	    SPA_MINBLOCKSHIFT) + 1;
+	DB_DNODE_EXIT(db);
+}
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+	uint64_t *buf = vbuf;
+	size_t count = size >> 3;
+	int i;
+
+	ASSERT((size & 7) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+	uint32_t *buf = vbuf;
+	size_t count = size >> 2;
+	int i;
+
+	ASSERT((size & 3) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+	uint16_t *buf = vbuf;
+	size_t count = size >> 1;
+	int i;
+
+	ASSERT((size & 1) == 0);
+
+	for (i = 0; i < count; i++)
+		buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+	zfs_dbgmsg_init();
+	sa_cache_init();
+	xuio_stat_init();
+	dmu_objset_init();
+	dnode_init();
+	dbuf_init();
+	zfetch_init();
+	dmu_tx_init();
+	l2arc_init();
+	arc_init();
+}
+
+void
+dmu_fini(void)
+{
+	arc_fini(); /* arc depends on l2arc, so arc must go first */
+	l2arc_fini();
+	dmu_tx_fini();
+	zfetch_fini();
+	dbuf_fini();
+	dnode_fini();
+	dmu_objset_fini();
+	xuio_stat_fini();
+	sa_cache_fini();
+	zfs_dbgmsg_fini();
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dmu_bonus_hold);
+EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
+EXPORT_SYMBOL(dmu_buf_rele_array);
+EXPORT_SYMBOL(dmu_prefetch);
+EXPORT_SYMBOL(dmu_free_range);
+EXPORT_SYMBOL(dmu_free_long_range);
+EXPORT_SYMBOL(dmu_free_long_object);
+EXPORT_SYMBOL(dmu_read);
+EXPORT_SYMBOL(dmu_write);
+EXPORT_SYMBOL(dmu_prealloc);
+EXPORT_SYMBOL(dmu_object_info);
+EXPORT_SYMBOL(dmu_object_info_from_dnode);
+EXPORT_SYMBOL(dmu_object_info_from_db);
+EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_set_blocksize);
+EXPORT_SYMBOL(dmu_object_set_checksum);
+EXPORT_SYMBOL(dmu_object_set_compress);
+EXPORT_SYMBOL(dmu_write_policy);
+EXPORT_SYMBOL(dmu_sync);
+EXPORT_SYMBOL(dmu_request_arcbuf);
+EXPORT_SYMBOL(dmu_return_arcbuf);
+EXPORT_SYMBOL(dmu_assign_arcbuf);
+EXPORT_SYMBOL(dmu_buf_hold);
+EXPORT_SYMBOL(dmu_ot);
+
+module_param(zfs_mdcomp_disable, int, 0644);
+MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression");
+
+module_param(zfs_nopwrite_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
+
+#endif
diff --git a/module/zfs/dmu.c.rej b/module/zfs/dmu.c.rej
new file mode 100644
index 000000000000..d23c22caa66e
--- /dev/null
+++ b/module/zfs/dmu.c.rej
@@ -0,0 +1,33 @@
+--- module/zfs/dmu.c
++++ module/zfs/dmu.c
+@@ -564,18 +564,24 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+ 		return;
+ 
+ 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+-	if (dn->dn_datablkshift) {
+-		int blkshift = dn->dn_datablkshift;
+-		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+-		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
++	/*
++	 * offset + len - 1 is the last byte we want to prefetch for, and offset
++	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
++	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
++	 * offset)  is the first.  Then the number we need to prefetch is the
++	 * last - first + 1.
++	 */
++	if (level > 0 || dn->dn_datablkshift != 0) {
++		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
++		    dbuf_whichblock(dn, level, offset) + 1;
+ 	} else {
+ 		nblks = (offset < dn->dn_datablksz);
+ 	}
+ 
+ 	if (nblks != 0) {
+-		blkid = dbuf_whichblock(dn, offset);
++		blkid = dbuf_whichblock(dn, level, offset);
+ 		for (int i = 0; i < nblks; i++)
+-			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
++			dbuf_prefetch(dn, level, blkid + i, pri, 0);
+ 	}
+ 
+ 	rw_exit(&dn->dn_struct_rwlock);
diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c
index 91415d0d2dcb..7665d1ca591d 100644
--- a/module/zfs/dmu_diff.c
+++ b/module/zfs/dmu_diff.c
@@ -115,7 +115,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	if (issig(JUSTLOOKING) && issig(FORREAL))
 		return (SET_ERROR(EINTR));
 
-	if (zb->zb_object != DMU_META_DNODE_OBJECT)
+	if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
 		return (0);
 
 	if (BP_IS_HOLE(bp)) {
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index c0e8d7b7fed3..127a6757f85b 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -148,6 +148,11 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 	return (0);
 }
 
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index b8ecc24b3af7..c4e9645aa459 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -466,58 +466,115 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
 	return (B_FALSE);
 }
 
-#define	BP_SPAN(dnp, level) \
-	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
-	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This is the callback function to traverse_dataset that acts as the worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+	struct send_thread_arg *sta = arg;
+	struct send_block_record *record;
+	uint64_t record_size;
+	int err = 0;
 
-/* ARGSUSED */
+	if (sta->cancel)
+		return (SET_ERROR(EINTR));
+
+	if (bp == NULL) {
+		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+		return (0);
+	} else if (zb->zb_level < 0) {
+		return (0);
+	}
+
+	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+	record->eos_marker = B_FALSE;
+	record->bp = *bp;
+	record->zb = *zb;
+	record->indblkshift = dnp->dn_indblkshift;
+	record->datablkszsec = dnp->dn_datablkszsec;
+	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	bqueue_enqueue(&sta->q, record, record_size);
+
+	return (err);
+}
+
+/*
+ * This function kicks off the traverse_dataset.  It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished.  If there is no
+ * dataset to traverse, the thread immediately pushes End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+	struct send_thread_arg *st_arg = arg;
+	int err;
+	struct send_block_record *data;
+
+	if (st_arg->ds != NULL) {
+		err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
+		    st_arg->flags, send_cb, arg);
+		if (err != EINTR)
+			st_arg->error_code = err;
+	}
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	data->eos_marker = B_TRUE;
+	bqueue_enqueue(&st_arg->q, data, 1);
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
 static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
 {
-	dmu_sendarg_t *dsp = arg;
+	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+	const blkptr_t *bp = &data->bp;
+	const zbookmark_phys_t *zb = &data->zb;
+	uint8_t indblkshift = data->indblkshift;
+	uint16_t dblkszsec = data->datablkszsec;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 	int err = 0;
 
-	if (issig(JUSTLOOKING) && issig(FORREAL))
-		return (SET_ERROR(EINTR));
+	ASSERT3U(zb->zb_level, >=, 0);
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		return (0);
-	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
-		/*
-		 * If we are sending a non-snapshot (which is allowed on
-		 * read-only pools), it may have a ZIL, which must be ignored.
-		 */
-		return (0);
 	} else if (BP_IS_HOLE(bp) &&
 	    zb->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
-		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
+		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
 	} else if (BP_IS_HOLE(bp)) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
-		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+		uint64_t offset = zb->zb_blkid * span;
+		err = dump_free(dsa, zb->zb_object, offset, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
-		dnode_phys_t *blk;
-		int i;
 		int blksz = BP_GET_LSIZE(bp);
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 
+		ASSERT0(zb->zb_level);
+
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
-		blk = abuf->b_data;
-		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
-			uint64_t dnobj = (zb->zb_blkid <<
-			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
-			err = dump_dnode(dsp, dnobj, blk+i);
+		dnode_phys_t *blk = abuf->b_data;
+		uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+		for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			err = dump_dnode(dsa, dnobj + i, blk + i);
 			if (err != 0)
 				break;
 		}
@@ -566,20 +623,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
 		offset = zb->zb_blkid * blksz;
 
-		if (!(dsp->dsa_featureflags &
+		if (!(dsa->dsa_featureflags &
 		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 		    blksz > SPA_OLD_MAXBLOCKSIZE) {
 			char *buf = abuf->b_data;
 			while (blksz > 0 && err == 0) {
 				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
-				err = dump_write(dsp, type, zb->zb_object,
+				err = dump_write(dsa, type, zb->zb_object,
 				    offset, n, NULL, buf);
 				offset += n;
 				buf += n;
 				blksz -= n;
 			}
 		} else {
-			err = dump_write(dsp, type, zb->zb_object,
+			err = dump_write(dsa, type, zb->zb_object,
 			    offset, blksz, bp, abuf->b_data);
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
@@ -590,11 +647,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 /*
- * Releases dp using the specified tag.
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+	struct send_block_record *tmp = bqueue_dequeue(bq);
+	kmem_free(data, sizeof (*data));
+	return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
  */
 static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
-    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
     boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 {
 	objset_t *os;
@@ -603,8 +673,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	int err;
 	uint64_t fromtxg = 0;
 	uint64_t featureflags = 0;
+	struct send_thread_arg to_arg;
 
-	err = dmu_objset_from_ds(ds, &os);
+	err = dmu_objset_from_ds(to_ds, &os);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
@@ -671,16 +742,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	dsp->dsa_proc = curproc;
 	dsp->dsa_os = os;
 	dsp->dsa_off = off;
-	dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
+	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
-	dsp->dsa_incremental = (fromzb != NULL);
+	dsp->dsa_incremental = (ancestor_zb != NULL);
 	dsp->dsa_featureflags = featureflags;
 
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_insert_head(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_insert_head(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
-	dsl_dataset_long_hold(ds, FTAG);
+	dsl_dataset_long_hold(to_ds, FTAG);
 	dsl_pool_rele(dp, tag);
 
 	if (dump_record(dsp, NULL, 0) != 0) {
@@ -688,8 +759,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 		goto out;
 	}
 
-	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
-	    backup_cb, dsp);
+	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+	    offsetof(struct send_block_record, ln));
+	to_arg.error_code = 0;
+	to_arg.cancel = B_FALSE;
+	to_arg.ds = to_ds;
+	to_arg.fromtxg = fromtxg;
+	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
+	    TS_RUN, minclsyspri);
+
+	struct send_block_record *to_data;
+	to_data = bqueue_dequeue(&to_arg.q);
+
+	while (!to_data->eos_marker && err == 0) {
+		err = do_dump(dsp, to_data);
+		to_data = get_next_record(&to_arg.q, to_data);
+		if (issig(JUSTLOOKING) && issig(FORREAL))
+			err = EINTR;
+	}
+
+	if (err != 0) {
+		to_arg.cancel = B_TRUE;
+		while (!to_data->eos_marker) {
+			to_data = get_next_record(&to_arg.q, to_data);
+		}
+	}
+	kmem_free(to_data, sizeof (*to_data));
+
+	bqueue_destroy(&to_arg.q);
+
+	if (err == 0 && to_arg.error_code != 0)
+		err = to_arg.error_code;
+
+	if (err != 0)
+		goto out;
 
 	if (dsp->dsa_pending_op != PENDING_NONE)
 		if (dump_record(dsp, NULL, 0) != 0)
@@ -706,20 +810,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
-	if (dump_record(dsp, NULL, 0) != 0) {
+	if (dump_record(dsp, NULL, 0) != 0)
 		err = dsp->dsa_err;
-		goto out;
-	}
 
 out:
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_remove(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_remove(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
-	dsl_dataset_long_rele(ds, FTAG);
+	dsl_dataset_long_rele(to_ds, FTAG);
 
 	return (err);
 }
@@ -1139,7 +1241,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
-		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/* Open the parent of tofs */
@@ -1313,21 +1416,57 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
 	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 }
 
-struct restorearg {
+struct receive_record_arg {
+	dmu_replay_record_t header;
+	void *payload; /* Pointer to a buffer containing the payload */
+	/*
+	 * If the record is a write, pointer to the arc_buf_t containing the
+	 * payload.
+	 */
+	arc_buf_t *write_buf;
+	int payload_size;
+	boolean_t eos_marker; /* Marks the end of the stream */
+	bqueue_node_t node;
+};
+
+struct receive_writer_arg {
 	objset_t *os;
-	int err;
 	boolean_t byteswap;
-	vnode_t *vp;
-	uint64_t voff;
-	int bufsize; /* amount of memory allocated for buf */
+	bqueue_t q;
+	/*
+	 * These three args are used to signal to the main thread that we're
+	 * done.
+	 */
+	kmutex_t mutex;
+	kcondvar_t cv;
+	boolean_t done;
+	int err;
+	/* A map from guid to dataset to help handle dedup'd streams. */
+	avl_tree_t *guid_to_ds_map;
+};
 
-	dmu_replay_record_t *drr;
-	dmu_replay_record_t *next_drr;
-	char *buf;
+struct receive_arg  {
+	objset_t *os;
+	vnode_t *vp; /* The vnode to read the stream from */
+	uint64_t voff; /* The current offset in the stream */
+	/*
+	 * A record that has had its payload read in, but hasn't yet been handed
+	 * off to the worker thread.
+	 */
+	struct receive_record_arg *rrd;
+	/* A record that has had its header read in, but not its payload. */
+	struct receive_record_arg *next_rrd;
 	zio_cksum_t cksum;
 	zio_cksum_t prev_cksum;
+	int err;
+	boolean_t byteswap;
+	/* Sorted list of objects not to issue prefetches for. */
+	list_t ignore_obj_list;
+};
 
-	avl_tree_t *guid_to_ds_map;
+struct receive_ign_obj_node {
+	list_node_t node;
+	uint64_t object;
 };
 
 typedef struct guid_map_entry {
@@ -1366,13 +1505,12 @@ free_guid_map_onexit(void *arg)
 }
 
 static int
-restore_read(struct restorearg *ra, int len, void *buf)
+receive_read(struct receive_arg *ra, int len, void *buf)
 {
 	int done = 0;
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
-	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
@@ -1507,12 +1645,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
 
-	err = dmu_object_info(ra->os, drro->drr_object, &doi);
+	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
 		return (SET_ERROR(EINVAL));
@@ -1531,14 +1669,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 
 		if (drro->drr_blksz != doi.doi_data_block_size ||
 		    nblkptr < doi.doi_nblkptr) {
-			err = dmu_free_long_range(ra->os, drro->drr_object,
+			err = dmu_free_long_range(rwa->os, drro->drr_object,
 			    0, DMU_OBJECT_END);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
@@ -1548,7 +1686,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 
 	if (object == DMU_NEW_OBJECT) {
 		/* currently free, want to be allocated */
-		err = dmu_object_claim(ra->os, drro->drr_object,
+		err = dmu_object_claim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	} else if (drro->drr_type != doi.doi_type ||
@@ -1556,7 +1694,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* currently allocated, but with different properties */
-		err = dmu_object_reclaim(ra->os, drro->drr_object,
+		err = dmu_object_reclaim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	}
@@ -1565,20 +1703,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 		return (SET_ERROR(EINVAL));
 	}
 
-	dmu_object_set_checksum(ra->os, drro->drr_object,
+	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
-	dmu_object_set_compress(ra->os, drro->drr_object,
+	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	if (data != NULL) {
 		dmu_buf_t *db;
 
-		VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
+		VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
-		if (ra->byteswap) {
+		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1602,13 +1740,13 @@ restore_freeobjects(struct restorearg *ra,
 
 	for (obj = drrfo->drr_firstobj;
 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
+	    (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
-		if (dmu_object_info(ra->os, obj, NULL) != 0)
+		if (dmu_object_info(rwa->os, obj, NULL) != 0)
 			continue;
 
-		err = dmu_free_long_object(ra->os, obj);
+		err = dmu_free_long_object(rwa->os, obj);
 		if (err != 0)
 			return (err);
 	}
@@ -1626,10 +1764,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
@@ -1638,7 +1776,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	if (ra->byteswap) {
+	if (rwa->byteswap) {
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(drrw->drr_type);
 		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
@@ -1661,7 +1799,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
  * data from the stream to fulfill this write.
  */
 static int
-restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
+receive_write_byref(struct receive_writer_arg *rwa,
+    struct drr_write_byref *drrwbr)
 {
 	dmu_tx_t *tx;
 	int err;
@@ -1680,14 +1819,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 	 */
 	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
 		gmesrch.guid = drrwbr->drr_refguid;
-		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
 		    &where)) == NULL) {
 			return (SET_ERROR(EINVAL));
 		}
 		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
 			return (SET_ERROR(EINVAL));
 	} else {
-		ref_os = ra->os;
+		ref_os = rwa->os;
 	}
 
 	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
@@ -1695,7 +1834,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 	if (err != 0)
 		return (err);
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
@@ -1704,7 +1843,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	dmu_write(ra->os, drrwbr->drr_object,
+	dmu_write(rwa->os, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
 	dmu_buf_rele(dbp, FTAG);
 	dmu_tx_commit(tx);
@@ -1712,7 +1851,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 }
 
 static int
-restore_write_embedded(struct restorearg *ra,
+receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwnp, void *data)
 {
 	dmu_tx_t *tx;
@@ -1729,7 +1868,7 @@ restore_write_embedded(struct restorearg *ra,
 	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 		return (EINVAL);
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwnp->drr_object,
 	    drrwnp->drr_offset, drrwnp->drr_length);
@@ -1739,36 +1878,37 @@ restore_write_embedded(struct restorearg *ra,
 		return (err);
 	}
 
-	dmu_write_embedded(ra->os, drrwnp->drr_object,
+	dmu_write_embedded(rwa->os, drrwnp->drr_object,
 	    drrwnp->drr_offset, data, drrwnp->drr_etype,
 	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
-	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 static int
-restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+    void *data)
 {
 	dmu_tx_t *tx;
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
+	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
@@ -1803,11 +1943,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf)
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	err = dmu_free_long_range(ra->os, drrf->drr_object,
+	err = dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
+
 	return (err);
 }
 
@@ -1822,7 +1963,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 }
 
 static void
-restore_cksum(struct restorearg *ra, int len, void *buf)
+receive_cksum(struct receive_arg *ra, int len, void *buf)
 {
 	if (ra->byteswap) {
 		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
@@ -1973,6 +2114,118 @@ restore_process_record(struct restorearg *ra)
 }
 
 /*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+    struct receive_record_arg *rrd)
+{
+	int err;
+
+	switch (rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &rrd->header.drr_u.drr_object;
+		err = receive_object(rwa, drro, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &rrd->header.drr_u.drr_freeobjects;
+		return (receive_freeobjects(rwa, drrfo));
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+		err = receive_write(rwa, drrw, rrd->write_buf);
+		/* if receive_write() is successful, it consumes the arc_buf */
+		if (err != 0)
+			dmu_return_arcbuf(rrd->write_buf);
+		rrd->write_buf = NULL;
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwbr =
+		    &rrd->header.drr_u.drr_write_byref;
+		return (receive_write_byref(rwa, drrwbr));
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &rrd->header.drr_u.drr_write_embedded;
+		err = receive_write_embedded(rwa, drrwe, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+		return (receive_free(rwa, drrf));
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		err = receive_spill(rwa, drrs, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record  When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+	struct receive_writer_arg *rwa = arg;
+	struct receive_record_arg *rrd;
+	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+	    rrd = bqueue_dequeue(&rwa->q)) {
+		/*
+		 * If there's an error, the main thread will stop putting things
+		 * on the queue, but we need to clear everything in it before we
+		 * can exit.
+		 */
+		if (rwa->err == 0) {
+			rwa->err = receive_process_record(rwa, rrd);
+		} else if (rrd->write_buf != NULL) {
+			dmu_return_arcbuf(rrd->write_buf);
+			rrd->write_buf = NULL;
+			rrd->payload = NULL;
+		} else if (rrd->payload != NULL) {
+			kmem_free(rrd->payload, rrd->payload_size);
+			rrd->payload = NULL;
+		}
+		kmem_free(rrd, sizeof (*rrd));
+	}
+	kmem_free(rrd, sizeof (*rrd));
+	mutex_enter(&rwa->mutex);
+	rwa->done = B_TRUE;
+	cv_signal(&rwa->cv);
+	mutex_exit(&rwa->mutex);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool.  There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks.  It will then push the records
+ * onto an internal blocking queue.  The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU.  This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
@@ -2021,48 +2274,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
 		}
 
 		if (*action_handlep == 0) {
-			ra.guid_to_ds_map =
+			rwa.guid_to_ds_map =
 			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-			avl_create(ra.guid_to_ds_map, guid_compare,
+			avl_create(rwa.guid_to_ds_map, guid_compare,
 			    sizeof (guid_map_entry_t),
 			    offsetof(guid_map_entry_t, avlnode));
 			err = zfs_onexit_add_cb(minor,
-			    free_guid_map_onexit, ra.guid_to_ds_map,
+			    free_guid_map_onexit, rwa.guid_to_ds_map,
 			    action_handlep);
 			if (ra.err != 0)
 				goto out;
 		} else {
 			err = zfs_onexit_cb_data(minor, *action_handlep,
-			    (void **)&ra.guid_to_ds_map);
+			    (void **)&rwa.guid_to_ds_map);
 			if (ra.err != 0)
 				goto out;
 		}
 
-		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+		drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
 	}
 
-	err = restore_read_payload_and_next_header(&ra, 0, NULL);
-	if (err != 0)
+	err = receive_read_payload_and_next_header(&ra, 0, NULL);
+	if (err)
 		goto out;
-	for (;;) {
-		void *tmp;
 
+	(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+	    offsetof(struct receive_record_arg, node));
+	cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+	rwa.os = ra.os;
+	rwa.byteswap = drc->drc_byteswap;
+
+	(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc,
+	    TS_RUN, minclsyspri);
+	/*
+	 * We're reading rwa.err without locks, which is safe since we are the
+	 * only reader, and the worker thread is the only writer.  It's ok if we
+	 * miss a write for an iteration or two of the loop, since the writer
+	 * thread will keep freeing records we send it until we send it an eos
+	 * marker.
+	 *
+	 * We can leave this loop in 3 ways:  First, if rwa.err is
+	 * non-zero.  In that case, the writer thread will free the rrd we just
+	 * pushed.  Second, if  we're interrupted; in that case, either it's the
+	 * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+	 * has been handed off to the writer thread who will free it.  Finally,
+	 * if receive_read_record fails or we're at the end of the stream, then
+	 * we free ra.rrd and exit.
+	 */
+	while (rwa.err == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
-		tmp = ra.next_drr;
-		ra.next_drr = ra.drr;
-		ra.drr = tmp;
+		ASSERT3P(ra.rrd, ==, NULL);
+		ra.rrd = ra.next_rrd;
+		ra.next_rrd = NULL;
+		/* Allocates and loads header into ra.next_rrd */
+		err = receive_read_record(&ra);
 
-		/* process ra.drr, read in ra.next_drr */
-		err = restore_process_record(&ra);
-		if (err != 0)
-			break;
-		if (ra.drr->drr_type == DRR_END)
+		if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+			kmem_free(ra.rrd, sizeof (*ra.rrd));
+			ra.rrd = NULL;
 			break;
+		}
+
+		bqueue_enqueue(&rwa.q, ra.rrd,
+		    sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+		ra.rrd = NULL;
+	}
+	if (ra.next_rrd == NULL)
+		ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+	ra.next_rrd->eos_marker = B_TRUE;
+	bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+	mutex_enter(&rwa.mutex);
+	while (!rwa.done) {
+		cv_wait(&rwa.cv, &rwa.mutex);
 	}
+	mutex_exit(&rwa.mutex);
+
+	cv_destroy(&rwa.cv);
+	mutex_destroy(&rwa.mutex);
+	bqueue_destroy(&rwa.q);
+	if (err == 0)
+		err = rwa.err;
 
 out:
 	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
diff --git a/module/zfs/dmu_send.c.orig b/module/zfs/dmu_send.c.orig
new file mode 100644
index 000000000000..b8ecc24b3af7
--- /dev/null
+++ b/module/zfs/dmu_send.c.orig
@@ -0,0 +1,2342 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+
+static char *dmu_recv_tag = "dmu_recv_tag";
+static const char *recv_clone_name = "%recv";
+
+typedef struct dump_bytes_io {
+	dmu_sendarg_t	*dbi_dsp;
+	void		*dbi_buf;
+	int		dbi_len;
+} dump_bytes_io_t;
+
+static void
+dump_bytes_cb(void *arg)
+{
+	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
+	dmu_sendarg_t *dsp = dbi->dbi_dsp;
+	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
+	ssize_t resid; /* have to get resid to get detailed errno */
+	ASSERT0(dbi->dbi_len % 8);
+
+	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
+	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
+	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
+
+	mutex_enter(&ds->ds_sendstream_lock);
+	*dsp->dsa_off += dbi->dbi_len;
+	mutex_exit(&ds->ds_sendstream_lock);
+}
+
+static int
+dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
+{
+	dump_bytes_io_t dbi;
+
+	dbi.dbi_dsp = dsp;
+	dbi.dbi_buf = buf;
+	dbi.dbi_len = len;
+
+#if defined(HAVE_LARGE_STACKS)
+	dump_bytes_cb(&dbi);
+#else
+	/*
+	 * The vn_rdwr() call is performed in a taskq to ensure that there is
+	 * always enough stack space to write safely to the target filesystem.
+	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
+	 * them and they are used in vdev_file.c for a similar purpose.
+	 */
+	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
+	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
+#endif /* HAVE_LARGE_STACKS */
+
+	return (dsp->dsa_err);
+}
+
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	fletcher_4_incremental_native(dsp->dsa_drr,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    &dsp->dsa_zc);
+	if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
+		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+		    drr_checksum.drr_checksum));
+		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+	}
+	fletcher_4_incremental_native(&dsp->dsa_drr->
+	    drr_u.drr_checksum.drr_checksum,
+	    sizeof (zio_cksum_t), &dsp->dsa_zc);
+	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+		return (SET_ERROR(EINTR));
+	if (payload_len != 0) {
+		fletcher_4_incremental_native(payload, payload_len,
+		    &dsp->dsa_zc);
+		if (dump_bytes(dsp, payload, payload_len) != 0)
+			return (SET_ERROR(EINTR));
+	}
+	return (0);
+}
+
+static int
+dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    uint64_t length)
+{
+	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
+
+	/*
+	 * When we receive a free record, dbuf_free_range() assumes
+	 * that the receiving system doesn't have any dbufs in the range
+	 * being freed.  This is always true because there is a one-record
+	 * constraint: we only send one WRITE record for any given
+	 * object+offset.  We know that the one-record constraint is
+	 * true because we always send data in increasing order by
+	 * object,offset.
+	 *
+	 * If the increasing-order constraint ever changes, we should find
+	 * another way to assert that the one-record constraint is still
+	 * satisfied.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+
+	/*
+	 * If we are doing a non-incremental send, then there can't
+	 * be any data in the dataset we're receiving into.  Therefore
+	 * a free record would simply be a no-op.  Save space by not
+	 * sending it to begin with.
+	 */
+	if (!dsp->dsa_incremental)
+		return (0);
+
+	if (length != -1ULL && offset + length < offset)
+		length = -1ULL;
+
+	/*
+	 * If there is a pending op, but it's not PENDING_FREE, push it out,
+	 * since free block aggregation can only be done for blocks of the
+	 * same type (i.e., DRR_FREE records can only be aggregated with
+	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
+	 * aggregated with other DRR_FREEOBJECTS records.
+	 */
+	if (dsp->dsa_pending_op != PENDING_NONE &&
+	    dsp->dsa_pending_op != PENDING_FREE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	if (dsp->dsa_pending_op == PENDING_FREE) {
+		/*
+		 * There should never be a PENDING_FREE if length is -1
+		 * (because dump_dnode is the only place where this
+		 * function is called with a -1, and only after flushing
+		 * any pending record).
+		 */
+		ASSERT(length != -1ULL);
+		/*
+		 * Check to see whether this free block can be aggregated
+		 * with pending one.
+		 */
+		if (drrf->drr_object == object && drrf->drr_offset +
+		    drrf->drr_length == offset) {
+			drrf->drr_length += length;
+			return (0);
+		} else {
+			/* not a continuation.  Push out pending record */
+			if (dump_record(dsp, NULL, 0) != 0)
+				return (SET_ERROR(EINTR));
+			dsp->dsa_pending_op = PENDING_NONE;
+		}
+	}
+	/* create a FREE record and make it pending */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_FREE;
+	drrf->drr_object = object;
+	drrf->drr_offset = offset;
+	drrf->drr_length = length;
+	drrf->drr_toguid = dsp->dsa_toguid;
+	if (length == -1ULL) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+	} else {
+		dsp->dsa_pending_op = PENDING_FREE;
+	}
+
+	return (0);
+}
+
+static int
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
+    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
+{
+	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
+
+	/*
+	 * We send data in increasing object, offset order.
+	 * See comment in dump_free() for details.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+	dsp->dsa_last_data_object = object;
+	dsp->dsa_last_data_offset = offset + blksz - 1;
+
+	/*
+	 * If there is any kind of pending aggregation (currently either
+	 * a grouping of free objects or free blocks), push it out to
+	 * the stream, since aggregation can't be done across operations
+	 * of different types.
+	 */
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+	/* write a WRITE record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE;
+	drrw->drr_object = object;
+	drrw->drr_type = type;
+	drrw->drr_offset = offset;
+	drrw->drr_length = blksz;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+		/*
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BP's, so (like
+		 * fletcher4-checkummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
+		 */
+		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+	} else {
+		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+		drrw->drr_key.ddk_cksum = bp->blk_cksum;
+	}
+
+	if (dump_record(dsp, data, blksz) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    int blksz, const blkptr_t *bp)
+{
+	char buf[BPE_PAYLOAD_SIZE];
+	struct drr_write_embedded *drrw =
+	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (EINTR);
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	ASSERT(BP_IS_EMBEDDED(bp));
+
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+	drrw->drr_object = object;
+	drrw->drr_offset = offset;
+	drrw->drr_length = blksz;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	drrw->drr_compression = BP_GET_COMPRESS(bp);
+	drrw->drr_etype = BPE_GET_ETYPE(bp);
+	drrw->drr_lsize = BPE_GET_LSIZE(bp);
+	drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+	decode_embedded_bp_compressed(bp, buf);
+
+	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+		return (EINTR);
+	return (0);
+}
+
+static int
+dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
+{
+	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	/* write a SPILL record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_SPILL;
+	drrs->drr_object = object;
+	drrs->drr_length = blksz;
+	drrs->drr_toguid = dsp->dsa_toguid;
+
+	if (dump_record(dsp, data, blksz) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
+dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
+{
+	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
+
+	/* See comment in dump_free(). */
+	if (!dsp->dsa_incremental)
+		return (0);
+
+	/*
+	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+	 * push it out, since free block aggregation can only be done for
+	 * blocks of the same type (i.e., DRR_FREE records can only be
+	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
+	 * can only be aggregated with other DRR_FREEOBJECTS records.
+	 */
+	if (dsp->dsa_pending_op != PENDING_NONE &&
+	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
+		/*
+		 * See whether this free object array can be aggregated
+		 * with pending one
+		 */
+		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+			drrfo->drr_numobjs += numobjs;
+			return (0);
+		} else {
+			/* can't be aggregated.  Push out pending record */
+			if (dump_record(dsp, NULL, 0) != 0)
+				return (SET_ERROR(EINTR));
+			dsp->dsa_pending_op = PENDING_NONE;
+		}
+	}
+
+	/* write a FREEOBJECTS record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
+	drrfo->drr_firstobj = firstobj;
+	drrfo->drr_numobjs = numobjs;
+	drrfo->drr_toguid = dsp->dsa_toguid;
+
+	dsp->dsa_pending_op = PENDING_FREEOBJECTS;
+
+	return (0);
+}
+
+static int
+dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
+{
+	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+
+	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+		return (dump_freeobjects(dsp, object, 1));
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	/* write an OBJECT record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_OBJECT;
+	drro->drr_object = object;
+	drro->drr_type = dnp->dn_type;
+	drro->drr_bonustype = dnp->dn_bonustype;
+	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	drro->drr_bonuslen = dnp->dn_bonuslen;
+	drro->drr_checksumtype = dnp->dn_checksum;
+	drro->drr_compress = dnp->dn_compress;
+	drro->drr_toguid = dsp->dsa_toguid;
+
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
+	if (dump_record(dsp, DN_BONUS(dnp),
+	    P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
+		return (SET_ERROR(EINTR));
+	}
+
+	/* Free anything past the end of the file. */
+	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
+		return (SET_ERROR(EINTR));
+	if (dsp->dsa_err != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+	if (!BP_IS_EMBEDDED(bp))
+		return (B_FALSE);
+
+	/*
+	 * Compression function must be legacy, or explicitly enabled.
+	 */
+	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+		return (B_FALSE);
+
+	/*
+	 * Embed type must be explicitly enabled.
+	 */
+	switch (BPE_GET_ETYPE(bp)) {
+	case BP_EMBEDDED_TYPE_DATA:
+		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+			return (B_TRUE);
+		break;
+	default:
+		return (B_FALSE);
+	}
+	return (B_FALSE);
+}
+
+#define	BP_SPAN(dnp, level) \
+	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
+/* ARGSUSED */
+static int
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	dmu_sendarg_t *dsp = arg;
+	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+	int err = 0;
+
+	if (issig(JUSTLOOKING) && issig(FORREAL))
+		return (SET_ERROR(EINTR));
+
+	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+		return (0);
+	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
+		/*
+		 * If we are sending a non-snapshot (which is allowed on
+		 * read-only pools), it may have a ZIL, which must be ignored.
+		 */
+		return (0);
+	} else if (BP_IS_HOLE(bp) &&
+	    zb->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
+	} else if (BP_IS_HOLE(bp)) {
+		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
+		return (0);
+	} else if (type == DMU_OT_DNODE) {
+		dnode_phys_t *blk;
+		int i;
+		int blksz = BP_GET_LSIZE(bp);
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		arc_buf_t *abuf;
+
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0)
+			return (SET_ERROR(EIO));
+
+		blk = abuf->b_data;
+		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			uint64_t dnobj = (zb->zb_blkid <<
+			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+			err = dump_dnode(dsp, dnobj, blk+i);
+			if (err != 0)
+				break;
+		}
+		(void) arc_buf_remove_ref(abuf, &abuf);
+	} else if (type == DMU_OT_SA) {
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		arc_buf_t *abuf;
+		int blksz = BP_GET_LSIZE(bp);
+
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0)
+			return (SET_ERROR(EIO));
+
+		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
+		(void) arc_buf_remove_ref(abuf, &abuf);
+	} else if (backup_do_embed(dsp, bp)) {
+		/* it's an embedded level-0 block of a regular object */
+		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+		err = dump_write_embedded(dsp, zb->zb_object,
+		    zb->zb_blkid * blksz, blksz, bp);
+	} else { /* it's a level-0 block of a regular object */
+		uint64_t offset;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		arc_buf_t *abuf;
+		int blksz = BP_GET_LSIZE(bp);
+
+		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		ASSERT0(zb->zb_level);
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0) {
+			if (zfs_send_corrupt_data) {
+				uint64_t *ptr;
+				/* Send a block filled with 0x"zfs badd bloc" */
+				abuf = arc_buf_alloc(spa, blksz, &abuf,
+				    ARC_BUFC_DATA);
+				for (ptr = abuf->b_data;
+				    (char *)ptr < (char *)abuf->b_data + blksz;
+				    ptr++)
+					*ptr = 0x2f5baddb10cULL;
+			} else {
+				return (SET_ERROR(EIO));
+			}
+		}
+
+		offset = zb->zb_blkid * blksz;
+
+		if (!(dsp->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsp, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsp, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
+		(void) arc_buf_remove_ref(abuf, &abuf);
+	}
+
+	ASSERT(err == 0 || err == EINTR);
+	return (err);
+}
+
+/*
+ * Releases dp using the specified tag.
+ */
+static int
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
+{
+	objset_t *os;
+	dmu_replay_record_t *drr;
+	dmu_sendarg_t *dsp;
+	int err;
+	uint64_t fromtxg = 0;
+	uint64_t featureflags = 0;
+
+	err = dmu_objset_from_ds(ds, &os);
+	if (err != 0) {
+		dsl_pool_rele(dp, tag);
+		return (err);
+	}
+
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
+	    DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+	if (dmu_objset_type(os) == DMU_OST_ZFS) {
+		uint64_t version;
+		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
+			kmem_free(drr, sizeof (dmu_replay_record_t));
+			dsl_pool_rele(dp, tag);
+			return (SET_ERROR(EINVAL));
+		}
+		if (version >= ZPL_VERSION_SA) {
+			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+		}
+	}
+#endif
+
+	if (large_block_ok && ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+	if (embedok &&
+	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+	} else {
+		embedok = B_FALSE;
+	}
+
+	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+	    featureflags);
+
+	drr->drr_u.drr_begin.drr_creation_time =
+	    dsl_dataset_phys(ds)->ds_creation_time;
+	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
+	if (is_clone)
+		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
+	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+
+	if (fromzb != NULL) {
+		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
+		fromtxg = fromzb->zbm_creation_txg;
+	}
+	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+	if (!ds->ds_is_snapshot) {
+		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+		    sizeof (drr->drr_u.drr_begin.drr_toname));
+	}
+
+	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
+
+	dsp->dsa_drr = drr;
+	dsp->dsa_vp = vp;
+	dsp->dsa_outfd = outfd;
+	dsp->dsa_proc = curproc;
+	dsp->dsa_os = os;
+	dsp->dsa_off = off;
+	dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
+	dsp->dsa_pending_op = PENDING_NONE;
+	dsp->dsa_incremental = (fromzb != NULL);
+	dsp->dsa_featureflags = featureflags;
+
+	mutex_enter(&ds->ds_sendstream_lock);
+	list_insert_head(&ds->ds_sendstreams, dsp);
+	mutex_exit(&ds->ds_sendstream_lock);
+
+	dsl_dataset_long_hold(ds, FTAG);
+	dsl_pool_rele(dp, tag);
+
+	if (dump_record(dsp, NULL, 0) != 0) {
+		err = dsp->dsa_err;
+		goto out;
+	}
+
+	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
+	    backup_cb, dsp);
+
+	if (dsp->dsa_pending_op != PENDING_NONE)
+		if (dump_record(dsp, NULL, 0) != 0)
+			err = SET_ERROR(EINTR);
+
+	if (err != 0) {
+		if (err == EINTR && dsp->dsa_err != 0)
+			err = dsp->dsa_err;
+		goto out;
+	}
+
+	bzero(drr, sizeof (dmu_replay_record_t));
+	drr->drr_type = DRR_END;
+	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
+	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
+
+	if (dump_record(dsp, NULL, 0) != 0) {
+		err = dsp->dsa_err;
+		goto out;
+	}
+
+out:
+	mutex_enter(&ds->ds_sendstream_lock);
+	list_remove(&ds->ds_sendstreams, dsp);
+	mutex_exit(&ds->ds_sendstream_lock);
+
+	kmem_free(drr, sizeof (dmu_replay_record_t));
+	kmem_free(dsp, sizeof (dmu_sendarg_t));
+
+	dsl_dataset_long_rele(ds, FTAG);
+
+	return (err);
+}
+
+int
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, vnode_t *vp, offset_t *off)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	dsl_dataset_t *fromds = NULL;
+	int err;
+
+	err = dsl_pool_hold(pool, FTAG, &dp);
+	if (err != 0)
+		return (err);
+
+	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	if (fromsnap != 0) {
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone;
+
+		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+		if (!dsl_dataset_is_before(ds, fromds, 0))
+			err = SET_ERROR(EXDEV);
+		zb.zbm_creation_time =
+		    dsl_dataset_phys(fromds)->ds_creation_time;
+		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+		is_clone = (fromds->ds_dir != ds->ds_dir);
+		dsl_dataset_rele(fromds, FTAG);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, vp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, vp, off);
+	}
+	dsl_dataset_rele(ds, FTAG);
+	return (err);
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
+    int outfd, vnode_t *vp, offset_t *off)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int err;
+	boolean_t owned = B_FALSE;
+
+	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
+		return (SET_ERROR(EINVAL));
+
+	err = dsl_pool_hold(tosnap, FTAG, &dp);
+	if (err != 0)
+		return (err);
+
+	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
+		/*
+		 * We are sending a filesystem or volume.  Ensure
+		 * that it doesn't change by owning the dataset.
+		 */
+		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+		owned = B_TRUE;
+	} else {
+		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+	}
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	if (fromsnap != NULL) {
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone = B_FALSE;
+		int fsnamelen = strchr(tosnap, '@') - tosnap;
+
+		/*
+		 * If the fromsnap is in a different filesystem, then
+		 * mark the send stream as a clone.
+		 */
+		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+		    (fromsnap[fsnamelen] != '@' &&
+		    fromsnap[fsnamelen] != '#')) {
+			is_clone = B_TRUE;
+		}
+
+		if (strchr(fromsnap, '@')) {
+			dsl_dataset_t *fromds;
+			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+			if (err == 0) {
+				if (!dsl_dataset_is_before(ds, fromds, 0))
+					err = SET_ERROR(EXDEV);
+				zb.zbm_creation_time =
+				    dsl_dataset_phys(fromds)->ds_creation_time;
+				zb.zbm_creation_txg =
+				    dsl_dataset_phys(fromds)->ds_creation_txg;
+				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+				is_clone = (ds->ds_dir != fromds->ds_dir);
+				dsl_dataset_rele(fromds, FTAG);
+			}
+		} else {
+			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
+		}
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, vp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, vp, off);
+	}
+	if (owned)
+		dsl_dataset_disown(ds, FTAG);
+	else
+		dsl_dataset_rele(ds, FTAG);
+	return (err);
+}
+
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
+    uint64_t *sizep)
+{
+	int err;
+	/*
+	 * Assume that space (both on-disk and in-stream) is dominated by
+	 * data.  We will adjust for indirect blocks and the copies property,
+	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+	 */
+
+	/*
+	 * Subtract out approximate space used by indirect blocks.
+	 * Assume most space is used by data blocks (non-indirect, non-dnode).
+	 * Assume all blocks are recordsize.  Assume ditto blocks and
+	 * internal fragmentation counter out compression.
+	 *
+	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+	 * block, which we observe in practice.
+	 */
+	uint64_t recordsize;
+	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+	if (err != 0)
+		return (err);
+	size -= size / recordsize * sizeof (blkptr_t);
+
+	/* Add in the space for the record associated with each block. */
+	size += size / recordsize * sizeof (dmu_replay_record_t);
+
+	*sizep = size;
+
+	return (0);
+}
+
+int
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
+{
+	int err;
+	uint64_t size;
+
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+	/* tosnap must be a snapshot */
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	/* fromsnap, if provided, must be a snapshot */
+	if (fromds != NULL && !fromds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
+	 * or the origin's fs.
+	 */
+	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
+		return (SET_ERROR(EXDEV));
+
+	/* Get uncompressed size estimate of changed data. */
+	if (fromds == NULL) {
+		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	} else {
+		uint64_t used, comp;
+		err = dsl_dataset_space_written(fromds, ds,
+		    &used, &comp, &size);
+		if (err != 0)
+			return (err);
+	}
+
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
+}
+
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed size
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	uint64_t *spaceptr = arg;
+	if (bp != NULL && !BP_IS_HOLE(bp)) {
+		*spaceptr += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+/*
+ * Given a desination snapshot and a TXG, calculate the approximate size of a
+ * send stream sent from that TXG. from_txg may be zero, indicating that the
+ * whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+    uint64_t *sizep)
+{
+	int err;
+	uint64_t size = 0;
+
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+	/* tosnap must be a snapshot */
+	if (!dsl_dataset_is_snapshot(ds))
+		return (SET_ERROR(EINVAL));
+
+	/* verify that from_txg is before the provided snapshot was taken */
+	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+		return (SET_ERROR(EXDEV));
+	}
+	/*
+	 * traverse the blocks of the snapshot with birth times after
+	 * from_txg, summing their uncompressed size
+	 */
+	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+	    dmu_calculate_send_traversal, &size);
+	if (err)
+		return (err);
+
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
+}
+
+typedef struct dmu_recv_begin_arg {
+	const char *drba_origin;
+	dmu_recv_cookie_t *drba_cookie;
+	cred_t *drba_cred;
+	uint64_t drba_snapobj;
+} dmu_recv_begin_arg_t;
+
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+    uint64_t fromguid)
+{
+	uint64_t val;
+	int error;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* temporary clone name must not exist */
+	error = zap_lookup(dp->dp_meta_objset,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
+	    8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EBUSY : error);
+
+	/* new snapshot name must not exist */
+	error = zap_lookup(dp->dp_meta_objset,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EEXIST : error);
+
+	/*
+	 * Check snapshot limit before receiving. We'll recheck again at the
+	 * end, but might as well abort before receiving if we're already over
+	 * the limit.
+	 *
+	 * Note that we do not check the file system limit with
+	 * dsl_dir_fscount_check because the temporary %clones don't count
+	 * against that limit.
+	 */
+	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+	    NULL, drba->drba_cred);
+	if (error != 0)
+		return (error);
+
+	if (fromguid != 0) {
+		dsl_dataset_t *snap;
+		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+		/* Find snapshot in this dir that matches fromguid. */
+		while (obj != 0) {
+			error = dsl_dataset_hold_obj(dp, obj, FTAG,
+			    &snap);
+			if (error != 0)
+				return (SET_ERROR(ENODEV));
+			if (snap->ds_dir != ds->ds_dir) {
+				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ENODEV));
+			}
+			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+				break;
+			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+			dsl_dataset_rele(snap, FTAG);
+		}
+		if (obj == 0)
+			return (SET_ERROR(ENODEV));
+
+		if (drba->drba_cookie->drc_force) {
+			drba->drba_snapobj = obj;
+		} else {
+			/*
+			 * If we are not forcing, there must be no
+			 * changes since fromsnap.
+			 */
+			if (dsl_dataset_modified_since_snap(ds, snap)) {
+				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ETXTBSY));
+			}
+			drba->drba_snapobj = ds->ds_prev->ds_object;
+		}
+
+		dsl_dataset_rele(snap, FTAG);
+	} else {
+		/* if full, then must be forced */
+		if (!drba->drba_cookie->drc_force)
+			return (SET_ERROR(EEXIST));
+		/* start from $ORIGIN@$ORIGIN, if supported */
+		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+		    dp->dp_origin_snap->ds_object : 0;
+	}
+
+	return (0);
+
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	uint64_t fromguid = drrb->drr_fromguid;
+	int flags = drrb->drr_flags;
+	int error;
+	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+	dsl_dataset_t *ds;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES ||
+	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+		return (SET_ERROR(EINVAL));
+
+	/* Verify pool version supports SA if SA_SPILL feature set */
+	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+	 * record to a plan WRITE record, so the pool must have the
+	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+		return (SET_ERROR(ENOTSUP));
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
+		/* target fs already exists; recv into temp clone */
+
+		/* Can't recv a clone into an existing fs */
+		if (flags & DRR_FLAG_CLONE) {
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
+
+		error = recv_begin_check_existing_impl(drba, ds, fromguid);
+		dsl_dataset_rele(ds, FTAG);
+	} else if (error == ENOENT) {
+		/* target fs does not exist; must be a full backup or clone */
+		char buf[MAXNAMELEN];
+
+		/*
+		 * If it's a non-clone incremental, we are missing the
+		 * target fs, so fail the recv.
+		 */
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+			return (SET_ERROR(ENOENT));
+
+		/* Open the parent of tofs */
+		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
+		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Check filesystem and snapshot limits before receiving. We'll
+		 * recheck snapshot limits again at the end (we create the
+		 * filesystems and increment those counts during begin_sync).
+		 */
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+
+		if (drba->drba_origin != NULL) {
+			dsl_dataset_t *origin;
+			error = dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin);
+			if (error != 0) {
+				dsl_dataset_rele(ds, FTAG);
+				return (error);
+			}
+			if (!origin->ds_is_snapshot) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(EINVAL));
+			}
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(ENODEV));
+			}
+			dsl_dataset_rele(origin, FTAG);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		error = 0;
+	}
+	return (error);
+}
+
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	dsl_dataset_t *ds, *newds;
+	uint64_t dsobj;
+	int error;
+	uint64_t crflags;
+
+	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
+	    DS_FLAG_CI_DATASET : 0;
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
+		/* create temporary clone */
+		dsl_dataset_t *snap = NULL;
+		if (drba->drba_snapobj != 0) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    drba->drba_snapobj, FTAG, &snap));
+		}
+		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+		    snap, crflags, drba->drba_cred, tx);
+		dsl_dataset_rele(snap, FTAG);
+		dsl_dataset_rele(ds, FTAG);
+	} else {
+		dsl_dir_t *dd;
+		const char *tail;
+		dsl_dataset_t *origin = NULL;
+
+		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+		if (drba->drba_origin != NULL) {
+			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin));
+		}
+
+		/* Create new dataset. */
+		dsobj = dsl_dataset_create_sync(dd,
+		    strrchr(tofs, '/') + 1,
+		    origin, crflags, drba->drba_cred, tx);
+		if (origin != NULL)
+			dsl_dataset_rele(origin, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+
+	dmu_buf_will_dirty(newds->ds_dbuf, tx);
+	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	/*
+	 * If we actually created a non-clone, we need to create the
+	 * objset in our new dataset.
+	 */
+	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
+		(void) dmu_objset_create_impl(dp->dp_spa,
+		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+	}
+
+	drba->drba_cookie->drc_ds = newds;
+
+	spa_history_log_internal_ds(newds, "receive", tx, "");
+}
+
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
+    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
+{
+	dmu_recv_begin_arg_t drba = { 0 };
+	dmu_replay_record_t *drr;
+
+	bzero(drc, sizeof (dmu_recv_cookie_t));
+	drc->drc_drrb = drrb;
+	drc->drc_tosnap = tosnap;
+	drc->drc_tofs = tofs;
+	drc->drc_force = force;
+	drc->drc_cred = CRED();
+
+	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+		drc->drc_byteswap = B_TRUE;
+	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
+		return (SET_ERROR(EINVAL));
+
+	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
+	drr->drr_type = DRR_BEGIN;
+	drr->drr_u.drr_begin = *drc->drc_drrb;
+	if (drc->drc_byteswap) {
+		fletcher_4_incremental_byteswap(drr,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	} else {
+		fletcher_4_incremental_native(drr,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	}
+	kmem_free(drr, sizeof (dmu_replay_record_t));
+
+	if (drc->drc_byteswap) {
+		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
+		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
+		drrb->drr_type = BSWAP_32(drrb->drr_type);
+		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+	}
+
+	drba.drba_origin = origin;
+	drba.drba_cookie = drc;
+	drba.drba_cred = CRED();
+
+	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
+	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+}
+
+struct restorearg {
+	objset_t *os;
+	int err;
+	boolean_t byteswap;
+	vnode_t *vp;
+	uint64_t voff;
+	int bufsize; /* amount of memory allocated for buf */
+
+	dmu_replay_record_t *drr;
+	dmu_replay_record_t *next_drr;
+	char *buf;
+	zio_cksum_t cksum;
+	zio_cksum_t prev_cksum;
+
+	avl_tree_t *guid_to_ds_map;
+};
+
+typedef struct guid_map_entry {
+	uint64_t	guid;
+	dsl_dataset_t	*gme_ds;
+	avl_node_t	avlnode;
+} guid_map_entry_t;
+
+static int
+guid_compare(const void *arg1, const void *arg2)
+{
+	const guid_map_entry_t *gmep1 = arg1;
+	const guid_map_entry_t *gmep2 = arg2;
+
+	if (gmep1->guid < gmep2->guid)
+		return (-1);
+	else if (gmep1->guid > gmep2->guid)
+		return (1);
+	return (0);
+}
+
+static void
+free_guid_map_onexit(void *arg)
+{
+	avl_tree_t *ca = arg;
+	void *cookie = NULL;
+	guid_map_entry_t *gmep;
+
+	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+		dsl_dataset_long_rele(gmep->gme_ds, gmep);
+		dsl_dataset_rele(gmep->gme_ds, gmep);
+		kmem_free(gmep, sizeof (guid_map_entry_t));
+	}
+	avl_destroy(ca);
+	kmem_free(ca, sizeof (avl_tree_t));
+}
+
+static int
+restore_read(struct restorearg *ra, int len, void *buf)
+{
+	int done = 0;
+
+	/* some things will require 8-byte alignment, so everything must */
+	ASSERT0(len % 8);
+	ASSERT3U(len, <=, ra->bufsize);
+
+	while (done < len) {
+		ssize_t resid;
+
+		ra->err = vn_rdwr(UIO_READ, ra->vp,
+		    (char *)buf + done, len - done,
+		    ra->voff, UIO_SYSSPACE, FAPPEND,
+		    RLIM64_INFINITY, CRED(), &resid);
+
+		if (resid == len - done)
+			ra->err = SET_ERROR(EINVAL);
+		ra->voff += len - done - resid;
+		done = len - resid;
+		if (ra->err != 0)
+			return (ra->err);
+	}
+
+	ASSERT3U(done, ==, len);
+	return (0);
+}
+
+noinline static void
+byteswap_record(dmu_replay_record_t *drr)
+{
+#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+	drr->drr_type = BSWAP_32(drr->drr_type);
+	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
+	switch (drr->drr_type) {
+	case DRR_BEGIN:
+		DO64(drr_begin.drr_magic);
+		DO64(drr_begin.drr_versioninfo);
+		DO64(drr_begin.drr_creation_time);
+		DO32(drr_begin.drr_type);
+		DO32(drr_begin.drr_flags);
+		DO64(drr_begin.drr_toguid);
+		DO64(drr_begin.drr_fromguid);
+		break;
+	case DRR_OBJECT:
+		DO64(drr_object.drr_object);
+		DO32(drr_object.drr_type);
+		DO32(drr_object.drr_bonustype);
+		DO32(drr_object.drr_blksz);
+		DO32(drr_object.drr_bonuslen);
+		DO64(drr_object.drr_toguid);
+		break;
+	case DRR_FREEOBJECTS:
+		DO64(drr_freeobjects.drr_firstobj);
+		DO64(drr_freeobjects.drr_numobjs);
+		DO64(drr_freeobjects.drr_toguid);
+		break;
+	case DRR_WRITE:
+		DO64(drr_write.drr_object);
+		DO32(drr_write.drr_type);
+		DO64(drr_write.drr_offset);
+		DO64(drr_write.drr_length);
+		DO64(drr_write.drr_toguid);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
+		DO64(drr_write.drr_key.ddk_prop);
+		break;
+	case DRR_WRITE_BYREF:
+		DO64(drr_write_byref.drr_object);
+		DO64(drr_write_byref.drr_offset);
+		DO64(drr_write_byref.drr_length);
+		DO64(drr_write_byref.drr_toguid);
+		DO64(drr_write_byref.drr_refguid);
+		DO64(drr_write_byref.drr_refobject);
+		DO64(drr_write_byref.drr_refoffset);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+		    drr_key.ddk_cksum);
+		DO64(drr_write_byref.drr_key.ddk_prop);
+		break;
+	case DRR_WRITE_EMBEDDED:
+		DO64(drr_write_embedded.drr_object);
+		DO64(drr_write_embedded.drr_offset);
+		DO64(drr_write_embedded.drr_length);
+		DO64(drr_write_embedded.drr_toguid);
+		DO32(drr_write_embedded.drr_lsize);
+		DO32(drr_write_embedded.drr_psize);
+		break;
+	case DRR_FREE:
+		DO64(drr_free.drr_object);
+		DO64(drr_free.drr_offset);
+		DO64(drr_free.drr_length);
+		DO64(drr_free.drr_toguid);
+		break;
+	case DRR_SPILL:
+		DO64(drr_spill.drr_object);
+		DO64(drr_spill.drr_length);
+		DO64(drr_spill.drr_toguid);
+		break;
+	case DRR_END:
+		DO64(drr_end.drr_toguid);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
+		break;
+	default:
+		break;
+	}
+
+	if (drr->drr_type != DRR_BEGIN) {
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+	}
+
+#undef DO64
+#undef DO32
+}
+
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+	if (bonus_type == DMU_OT_SA) {
+		return (1);
+	} else {
+		return (1 +
+		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+	}
+}
+
+noinline static int
+restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
+{
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	uint64_t object;
+	int err;
+
+	if (drro->drr_type == DMU_OT_NONE ||
+	    !DMU_OT_IS_VALID(drro->drr_type) ||
+	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
+	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
+	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
+	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	err = dmu_object_info(ra->os, drro->drr_object, &doi);
+
+	if (err != 0 && err != ENOENT)
+		return (SET_ERROR(EINVAL));
+	object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
+
+	/*
+	 * If we are losing blkptrs or changing the block size this must
+	 * be a new file instance.  We must clear out the previous file
+	 * contents before we can change this type of metadata in the dnode.
+	 */
+	if (err == 0) {
+		int nblkptr;
+
+		nblkptr = deduce_nblkptr(drro->drr_bonustype,
+		    drro->drr_bonuslen);
+
+		if (drro->drr_blksz != doi.doi_data_block_size ||
+		    nblkptr < doi.doi_nblkptr) {
+			err = dmu_free_long_range(ra->os, drro->drr_object,
+			    0, DMU_OBJECT_END);
+			if (err != 0)
+				return (SET_ERROR(EINVAL));
+		}
+	}
+
+	tx = dmu_tx_create(ra->os);
+	dmu_tx_hold_bonus(tx, object);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	if (object == DMU_NEW_OBJECT) {
+		/* currently free, want to be allocated */
+		err = dmu_object_claim(ra->os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	} else if (drro->drr_type != doi.doi_type ||
+	    drro->drr_blksz != doi.doi_data_block_size ||
+	    drro->drr_bonustype != doi.doi_bonus_type ||
+	    drro->drr_bonuslen != doi.doi_bonus_size) {
+		/* currently allocated, but with different properties */
+		err = dmu_object_reclaim(ra->os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	}
+	if (err != 0) {
+		dmu_tx_commit(tx);
+		return (SET_ERROR(EINVAL));
+	}
+
+	dmu_object_set_checksum(ra->os, drro->drr_object,
+	    drro->drr_checksumtype, tx);
+	dmu_object_set_compress(ra->os, drro->drr_object,
+	    drro->drr_compress, tx);
+
+	if (data != NULL) {
+		dmu_buf_t *db;
+
+		VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
+		dmu_buf_will_dirty(db, tx);
+
+		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+		bcopy(data, db->db_data, drro->drr_bonuslen);
+		if (ra->byteswap) {
+			dmu_object_byteswap_t byteswap =
+			    DMU_OT_BYTESWAP(drro->drr_bonustype);
+			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
+			    drro->drr_bonuslen);
+		}
+		dmu_buf_rele(db, FTAG);
+	}
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+restore_freeobjects(struct restorearg *ra,
+    struct drr_freeobjects *drrfo)
+{
+	uint64_t obj;
+
+	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+		return (SET_ERROR(EINVAL));
+
+	for (obj = drrfo->drr_firstobj;
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
+	    (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
+		int err;
+
+		if (dmu_object_info(ra->os, obj, NULL) != 0)
+			continue;
+
+		err = dmu_free_long_object(ra->os, obj);
+		if (err != 0)
+			return (err);
+	}
+	return (0);
+}
+
+noinline static int
+restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
+{
+	dmu_tx_t *tx;
+	dmu_buf_t *bonus;
+	int err;
+
+	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+	    !DMU_OT_IS_VALID(drrw->drr_type))
+		return (SET_ERROR(EINVAL));
+
+	if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	tx = dmu_tx_create(ra->os);
+
+	dmu_tx_hold_write(tx, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	if (ra->byteswap) {
+		dmu_object_byteswap_t byteswap =
+		    DMU_OT_BYTESWAP(drrw->drr_type);
+		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+		    drrw->drr_length);
+	}
+
+	if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
+		return (SET_ERROR(EINVAL));
+	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+	dmu_tx_commit(tx);
+	dmu_buf_rele(bonus, FTAG);
+	return (0);
+}
+
+/*
+ * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream.  This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
+{
+	dmu_tx_t *tx;
+	int err;
+	guid_map_entry_t gmesrch;
+	guid_map_entry_t *gmep;
+	avl_index_t where;
+	objset_t *ref_os = NULL;
+	dmu_buf_t *dbp;
+
+	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * If the GUID of the referenced dataset is different from the
+	 * GUID of the target dataset, find the referenced dataset.
+	 */
+	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+		gmesrch.guid = drrwbr->drr_refguid;
+		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+		    &where)) == NULL) {
+			return (SET_ERROR(EINVAL));
+		}
+		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+			return (SET_ERROR(EINVAL));
+	} else {
+		ref_os = ra->os;
+	}
+
+	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+	if (err != 0)
+		return (err);
+
+	tx = dmu_tx_create(ra->os);
+
+	dmu_tx_hold_write(tx, drrwbr->drr_object,
+	    drrwbr->drr_offset, drrwbr->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_write(ra->os, drrwbr->drr_object,
+	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+	dmu_buf_rele(dbp, FTAG);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+static int
+restore_write_embedded(struct restorearg *ra,
+    struct drr_write_embedded *drrwnp, void *data)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+		return (EINVAL);
+
+	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+		return (EINVAL);
+
+	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+		return (EINVAL);
+	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+		return (EINVAL);
+
+	tx = dmu_tx_create(ra->os);
+
+	dmu_tx_hold_write(tx, drrwnp->drr_object,
+	    drrwnp->drr_offset, drrwnp->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	dmu_write_embedded(ra->os, drrwnp->drr_object,
+	    drrwnp->drr_offset, data, drrwnp->drr_etype,
+	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+static int
+restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+{
+	dmu_tx_t *tx;
+	dmu_buf_t *db, *db_spill;
+	int err;
+
+	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
+		return (SET_ERROR(EINVAL));
+
+	if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
+	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+		dmu_buf_rele(db, FTAG);
+		return (err);
+	}
+
+	tx = dmu_tx_create(ra->os);
+
+	dmu_tx_hold_spill(tx, db->db_object);
+
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_buf_rele(db, FTAG);
+		dmu_buf_rele(db_spill, FTAG);
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_buf_will_dirty(db_spill, tx);
+
+	if (db_spill->db_size < drrs->drr_length)
+		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+		    drrs->drr_length, tx));
+	bcopy(data, db_spill->db_data, drrs->drr_length);
+
+	dmu_buf_rele(db, FTAG);
+	dmu_buf_rele(db_spill, FTAG);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+restore_free(struct restorearg *ra, struct drr_free *drrf)
+{
+	int err;
+
+	if (drrf->drr_length != -1ULL &&
+	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	err = dmu_free_long_range(ra->os, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length);
+	return (err);
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+	char name[MAXNAMELEN];
+	dsl_dataset_name(drc->drc_ds, name);
+	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	(void) dsl_destroy_head(name);
+}
+
+static void
+restore_cksum(struct restorearg *ra, int len, void *buf)
+{
+	if (ra->byteswap) {
+		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+	} else {
+		fletcher_4_incremental_native(buf, len, &ra->cksum);
+	}
+}
+
+/*
+ * If len != 0, read payload into buf.
+ * Read next record's header into ra->next_drr.
+ * Verify checksum of payload and next record.
+ */
+static int
+restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+{
+	int err;
+	zio_cksum_t cksum_orig;
+	zio_cksum_t *cksump;
+
+	if (len != 0) {
+		ASSERT3U(len, <=, ra->bufsize);
+		err = restore_read(ra, len, buf);
+		if (err != 0)
+			return (err);
+		restore_cksum(ra, len, buf);
+	}
+
+	ra->prev_cksum = ra->cksum;
+
+	err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
+	if (err != 0)
+		return (err);
+	if (ra->next_drr->drr_type == DRR_BEGIN)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Note: checksum is of everything up to but not including the
+	 * checksum itself.
+	 */
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	restore_cksum(ra,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ra->next_drr);
+
+	cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
+	cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
+
+	if (ra->byteswap)
+		byteswap_record(ra->next_drr);
+
+	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
+		return (SET_ERROR(ECKSUM));
+
+	restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+	return (0);
+}
+
+static int
+restore_process_record(struct restorearg *ra)
+{
+	int err;
+
+	switch (ra->drr->drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &ra->drr->drr_u.drr_object;
+		err = restore_read_payload_and_next_header(ra,
+		    P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
+		if (err != 0)
+			return (err);
+		return (restore_object(ra, drro, ra->buf));
+	}
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &ra->drr->drr_u.drr_freeobjects;
+		err = restore_read_payload_and_next_header(ra, 0, NULL);
+		if (err != 0)
+			return (err);
+		return (restore_freeobjects(ra, drrfo));
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &ra->drr->drr_u.drr_write;
+		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+		    drrw->drr_length);
+
+		err = restore_read_payload_and_next_header(ra,
+		    drrw->drr_length, abuf->b_data);
+		if (err != 0)
+			return (err);
+		err = restore_write(ra, drrw, abuf);
+		/* if restore_write() is successful, it consumes the arc_buf */
+		if (err != 0)
+			dmu_return_arcbuf(abuf);
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwbr =
+		    &ra->drr->drr_u.drr_write_byref;
+		err = restore_read_payload_and_next_header(ra, 0, NULL);
+		if (err != 0)
+			return (err);
+		return (restore_write_byref(ra, drrwbr));
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &ra->drr->drr_u.drr_write_embedded;
+		err = restore_read_payload_and_next_header(ra,
+		    P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
+		if (err != 0)
+			return (err);
+		return (restore_write_embedded(ra, drrwe, ra->buf));
+	}
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &ra->drr->drr_u.drr_free;
+		err = restore_read_payload_and_next_header(ra, 0, NULL);
+		if (err != 0)
+			return (err);
+		return (restore_free(ra, drrf));
+	}
+	case DRR_END:
+	{
+		struct drr_end *drre = &ra->drr->drr_u.drr_end;
+		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+			return (SET_ERROR(EINVAL));
+		return (0);
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
+		err = restore_read_payload_and_next_header(ra,
+		    drrs->drr_length, ra->buf);
+		if (err != 0)
+			return (err);
+		return (restore_spill(ra, drrs, ra->buf));
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+}
+
+/*
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
+int
+dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep)
+{
+	int err = 0;
+	struct restorearg ra = { 0 };
+	int featureflags;
+
+	ra.byteswap = drc->drc_byteswap;
+	ra.cksum = drc->drc_cksum;
+	ra.vp = vp;
+	ra.voff = *voffp;
+	ra.bufsize = SPA_MAXBLOCKSIZE;
+	ra.drr = vmem_alloc(sizeof (*ra.drr), KM_SLEEP);
+	ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
+	ra.next_drr = vmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
+
+	/* these were verified in dmu_recv_begin */
+	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+	    DMU_SUBSTREAM);
+	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
+
+	/*
+	 * Open the objset we are modifying.
+	 */
+	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
+
+	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
+	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
+	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
+		minor_t minor;
+
+		if (cleanup_fd == -1) {
+			ra.err = SET_ERROR(EBADF);
+			goto out;
+		}
+		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+		if (ra.err != 0) {
+			cleanup_fd = -1;
+			goto out;
+		}
+
+		if (*action_handlep == 0) {
+			ra.guid_to_ds_map =
+			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+			avl_create(ra.guid_to_ds_map, guid_compare,
+			    sizeof (guid_map_entry_t),
+			    offsetof(guid_map_entry_t, avlnode));
+			err = zfs_onexit_add_cb(minor,
+			    free_guid_map_onexit, ra.guid_to_ds_map,
+			    action_handlep);
+			if (ra.err != 0)
+				goto out;
+		} else {
+			err = zfs_onexit_cb_data(minor, *action_handlep,
+			    (void **)&ra.guid_to_ds_map);
+			if (ra.err != 0)
+				goto out;
+		}
+
+		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+	}
+
+	err = restore_read_payload_and_next_header(&ra, 0, NULL);
+	if (err != 0)
+		goto out;
+	for (;;) {
+		void *tmp;
+
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			err = SET_ERROR(EINTR);
+			break;
+		}
+
+		tmp = ra.next_drr;
+		ra.next_drr = ra.drr;
+		ra.drr = tmp;
+
+		/* process ra.drr, read in ra.next_drr */
+		err = restore_process_record(&ra);
+		if (err != 0)
+			break;
+		if (ra.drr->drr_type == DRR_END)
+			break;
+	}
+
+out:
+	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+		zfs_onexit_fd_rele(cleanup_fd);
+
+	if (err != 0) {
+		/*
+		 * destroy what we created, so we don't leave it in the
+		 * inconsistent restoring state.
+		 */
+		dmu_recv_cleanup_ds(drc);
+	}
+
+	vmem_free(ra.drr, sizeof (*ra.drr));
+	vmem_free(ra.buf, ra.bufsize);
+	vmem_free(ra.next_drr, sizeof (*ra.next_drr));
+	*voffp = ra.voff;
+	return (err);
+}
+
+static int
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int error;
+
+	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
+
+		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+		if (error != 0)
+			return (error);
+		if (drc->drc_force) {
+			/*
+			 * We will destroy any snapshots in tofs (i.e. before
+			 * origin_head) that are after the origin (which is
+			 * the snap before drc_ds, because drc_ds can not
+			 * have any snaps of its own).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				error = dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap);
+				if (error != 0)
+					break;
+				if (snap->ds_dir != origin_head->ds_dir)
+					error = SET_ERROR(EINVAL);
+				if (error == 0)  {
+					error = dsl_destroy_snapshot_check_impl(
+					    snap, B_FALSE);
+				}
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_dataset_rele(snap, FTAG);
+				if (error != 0)
+					break;
+			}
+			if (error != 0) {
+				dsl_dataset_rele(origin_head, FTAG);
+				return (error);
+			}
+		}
+		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+		    origin_head, drc->drc_force, drc->drc_owner, tx);
+		if (error != 0) {
+			dsl_dataset_rele(origin_head, FTAG);
+			return (error);
+		}
+		error = dsl_dataset_snapshot_check_impl(origin_head,
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+		dsl_dataset_rele(origin_head, FTAG);
+		if (error != 0)
+			return (error);
+
+		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+	} else {
+		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+	}
+	return (error);
+}
+
+static void
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+
+	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+	    tx, "snap=%s", drc->drc_tosnap);
+
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
+
+		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+		    &origin_head));
+
+		if (drc->drc_force) {
+			/*
+			 * Destroy any snapshots of drc_tofs (origin_head)
+			 * after the origin (the snap before drc_ds).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap));
+				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_destroy_snapshot_sync_impl(snap,
+				    B_FALSE, tx);
+				dsl_dataset_rele(snap, FTAG);
+			}
+		}
+		VERIFY3P(drc->drc_ds->ds_prev, ==,
+		    origin_head->ds_prev);
+
+		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+		    origin_head, tx);
+		dsl_dataset_snapshot_sync_impl(origin_head,
+		    drc->drc_tosnap, tx);
+
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+		dsl_dataset_phys(origin_head)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		dsl_dataset_rele(origin_head, FTAG);
+		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+		if (drc->drc_owner != NULL)
+			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
+	} else {
+		dsl_dataset_t *ds = drc->drc_ds;
+
+		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
+
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		dsl_dataset_phys(ds->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+	}
+	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
+	/*
+	 * Release the hold from dmu_recv_begin.  This must be done before
+	 * we return to open context, so that when we free the dataset's dnode,
+	 * we can evict its bonus buffer.
+	 */
+	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	drc->drc_ds = NULL;
+}
+
+static int
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *snapds;
+	guid_map_entry_t *gmep;
+	int err;
+
+	ASSERT(guid_map != NULL);
+
+	err = dsl_pool_hold(name, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
+	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
+	if (err == 0) {
+		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
+		gmep->gme_ds = snapds;
+		avl_add(guid_map, gmep);
+		dsl_dataset_long_hold(snapds, gmep);
+	} else {
+		kmem_free(gmep, sizeof (*gmep));
+	}
+
+	dsl_pool_rele(dp, FTAG);
+	return (err);
+}
+
+static int dmu_recv_end_modified_blocks = 3;
+
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+{
+	int error;
+
+#ifdef _KERNEL
+	char *name;
+
+	/*
+	 * We will be destroying the ds; make sure its origin is unmounted if
+	 * necessary.
+	 */
+	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	dsl_dataset_name(drc->drc_ds, name);
+	zfs_destroy_unmount_origin(name);
+	kmem_free(name, MAXNAMELEN);
+#endif
+
+	error = dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
+
+	if (error != 0)
+		dmu_recv_cleanup_ds(drc);
+	return (error);
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+	int error;
+
+	error = dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
+
+	if (error != 0) {
+		dmu_recv_cleanup_ds(drc);
+	} else if (drc->drc_guid_to_ds_map != NULL) {
+		(void) add_ds_to_guidmap(drc->drc_tofs,
+		    drc->drc_guid_to_ds_map,
+		    drc->drc_newsnapobj);
+	}
+	return (error);
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
+	drc->drc_owner = owner;
+
+	if (drc->drc_newfs)
+		return (dmu_recv_new_end(drc));
+	else
+		return (dmu_recv_existing_end(drc));
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+	return (os->os_dsl_dataset != NULL &&
+	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
+
+#if defined(_KERNEL)
+module_param(zfs_send_corrupt_data, int, 0644);
+MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
+#endif
diff --git a/module/zfs/dmu_send.c.rej b/module/zfs/dmu_send.c.rej
new file mode 100644
index 000000000000..6082c7391624
--- /dev/null
+++ b/module/zfs/dmu_send.c.rej
@@ -0,0 +1,484 @@
+--- module/zfs/dmu_send.c
++++ module/zfs/dmu_send.c
+@@ -52,13 +52,38 @@
+ #include <sys/blkptr.h>
+ #include <sys/dsl_bookmark.h>
+ #include <sys/zfeature.h>
++#include <sys/bqueue.h>
+ 
+ /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+ int zfs_send_corrupt_data = B_FALSE;
++int zfs_send_queue_length = 16 * 1024 * 1024;
++int zfs_recv_queue_length = 16 * 1024 * 1024;
+ 
+ static char *dmu_recv_tag = "dmu_recv_tag";
+ static const char *recv_clone_name = "%recv";
+ 
++#define	BP_SPAN(datablkszsec, indblkshift, level) \
++	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
++	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
++
++struct send_thread_arg {
++	bqueue_t	q;
++	dsl_dataset_t	*ds;		/* Dataset to traverse */
++	uint64_t	fromtxg;	/* Traverse from this txg */
++	int		flags;		/* flags to pass to traverse_dataset */
++	int		error_code;
++	boolean_t	cancel;
++};
++
++struct send_block_record {
++	boolean_t		eos_marker; /* Marks the end of the stream */
++	blkptr_t		bp;
++	zbookmark_phys_t	zb;
++	uint8_t			indblkshift;
++	uint16_t		datablkszsec;
++	bqueue_node_t		ln;
++};
++
+ static int
+ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
+ {
+@@ -557,20 +639,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ 		    &aflags, zb) != 0)
+ 			return (SET_ERROR(EIO));
+ 
+-		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
++		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+ 		(void) arc_buf_remove_ref(abuf, &abuf);
+-	} else if (backup_do_embed(dsp, bp)) {
++	} else if (backup_do_embed(dsa, bp)) {
+ 		/* it's an embedded level-0 block of a regular object */
+-		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+-		err = dump_write_embedded(dsp, zb->zb_object,
++		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
++		ASSERT0(zb->zb_level);
++		err = dump_write_embedded(dsa, zb->zb_object,
+ 		    zb->zb_blkid * blksz, blksz, bp);
+-	} else { /* it's a level-0 block of a regular object */
++	} else {
++		/* it's a level-0 block of a regular object */
+ 		arc_flags_t aflags = ARC_FLAG_WAIT;
+ 		arc_buf_t *abuf;
+-		int blksz = BP_GET_LSIZE(bp);
++		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ 		uint64_t offset;
+ 
+-		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ 		ASSERT0(zb->zb_level);
+ 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+@@ -669,35 +766,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+ 	}
+ #endif
+ 
+-	if (large_block_ok && ds->ds_large_blocks)
++	if (large_block_ok && to_ds->ds_large_blocks)
+ 		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ 	if (embedok &&
+ 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ 		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ 			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+-	} else {
+-		embedok = B_FALSE;
+ 	}
+ 
+ 	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+ 	    featureflags);
+ 
+ 	drr->drr_u.drr_begin.drr_creation_time =
+-	    dsl_dataset_phys(ds)->ds_creation_time;
++	    dsl_dataset_phys(to_ds)->ds_creation_time;
+ 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
+ 	if (is_clone)
+ 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
+-	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
+-	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
++	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
++	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
+ 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+ 
+-	if (fromzb != NULL) {
+-		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
+-		fromtxg = fromzb->zbm_creation_txg;
++	if (ancestor_zb != NULL) {
++		drr->drr_u.drr_begin.drr_fromguid =
++		    ancestor_zb->zbm_guid;
++		fromtxg = ancestor_zb->zbm_creation_txg;
+ 	}
+-	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
+-	if (!ds->ds_is_snapshot) {
++	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
++	if (!to_ds->ds_is_snapshot) {
+ 		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+ 		    sizeof (drr->drr_u.drr_begin.drr_toname));
+ 	}
+@@ -1608,7 +1771,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+ }
+ 
+ static int
+-restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
++receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
++    void *data)
+ {
+ 	dmu_object_info_t doi;
+ 	dmu_tx_t *tx;
+@@ -1707,7 +1871,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
+ 
+ /* ARGSUSED */
+ static int
+-restore_freeobjects(struct restorearg *ra,
++receive_freeobjects(struct receive_writer_arg *rwa,
+     struct drr_freeobjects *drrfo)
+ {
+ 	uint64_t obj;
+@@ -1731,7 +1895,8 @@ restore_freeobjects(struct restorearg *ra,
+ }
+ 
+ static int
+-restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
++receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
++    arc_buf_t *abuf)
+ {
+ 	dmu_tx_t *tx;
+ 	int err;
+@@ -1760,7 +1925,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
+ 	}
+ 
+ 	dmu_buf_t *bonus;
+-	if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
++	if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
+ 		return (SET_ERROR(EINVAL));
+ 	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+ 	dmu_tx_commit(tx);
+@@ -1912,7 +2079,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+ 
+ /* ARGSUSED */
+ static int
+-restore_free(struct restorearg *ra, struct drr_free *drrf)
++receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
+ {
+ 	int err;
+ 
+@@ -1950,30 +2118,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf)
+ }
+ 
+ /*
+- * If len != 0, read payload into buf.
+- * Read next record's header into ra->next_drr.
++ * Read the payload into a buffer of size len, and update the current record's
++ * payload field.
++ * Allocate ra->next_rrd and read the next record's header into
++ * ra->next_rrd->header.
+  * Verify checksum of payload and next record.
+  */
+ static int
+-restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
++receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
+ {
+ 	int err;
+ 
+ 	if (len != 0) {
+-		ASSERT3U(len, <=, ra->bufsize);
+-		err = restore_read(ra, len, buf);
++		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
++		ra->rrd->payload = buf;
++		ra->rrd->payload_size = len;
++		err = receive_read(ra, len, ra->rrd->payload);
+ 		if (err != 0)
+ 			return (err);
+-		restore_cksum(ra, len, buf);
++		receive_cksum(ra, len, ra->rrd->payload);
+ 	}
+ 
+ 	ra->prev_cksum = ra->cksum;
+ 
+-	err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
+-	if (err != 0)
++	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
++	err = receive_read(ra, sizeof (ra->next_rrd->header),
++	    &ra->next_rrd->header);
++	if (err != 0) {
++		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
++		ra->next_rrd = NULL;
+ 		return (err);
+-	if (ra->next_drr->drr_type == DRR_BEGIN)
++	}
++	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
++		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
++		ra->next_rrd = NULL;
+ 		return (SET_ERROR(EINVAL));
++	}
+ 
+ 	/*
+ 	 * Note: checksum is of everything up to but not including the
+@@ -1981,107 +2161,180 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+ 	 */
+ 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+-	restore_cksum(ra,
++	receive_cksum(ra,
+ 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+-	    ra->next_drr);
++	    &ra->next_rrd->header);
+ 
+-	zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
+-	zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
++	zio_cksum_t cksum_orig =
++	    ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
++	zio_cksum_t *cksump =
++	    &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ 
+ 	if (ra->byteswap)
+-		byteswap_record(ra->next_drr);
++		byteswap_record(&ra->next_rrd->header);
+ 
+ 	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+-	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
++	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
++		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
++		ra->next_rrd = NULL;
+ 		return (SET_ERROR(ECKSUM));
++	}
+ 
+-	restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
++	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+ 
+ 	return (0);
+ }
+ 
++/*
++ * Issue the prefetch reads for any necessary indirect blocks.
++ *
++ * We use the object ignore list to tell us whether or not to issue prefetches
++ * for a given object.  We do this for both correctness (in case the blocksize
++ * of an object has changed) and performance (if the object doesn't exist, don't
++ * needlessly try to issue prefetches).  We also trim the list as we go through
++ * the stream to prevent it from growing to an unbounded size.
++ *
++ * The object numbers within will always be in sorted order, and any write
++ * records we see will also be in sorted order, but they're not sorted with
++ * respect to each other (i.e. we can get several object records before
++ * receiving each object's write records).  As a result, once we've reached a
++ * given object number, we can safely remove any reference to lower object
++ * numbers in the ignore list. In practice, we receive up to 32 object records
++ * before receiving write records, so the list can have up to 32 nodes in it.
++ */
++/* ARGSUSED */
++static void
++receive_read_prefetch(struct receive_arg *ra,
++    uint64_t object, uint64_t offset, uint64_t length)
++{
++	struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
++	while (node != NULL && node->object < object) {
++		VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
++		kmem_free(node, sizeof (*node));
++		node = list_head(&ra->ignore_obj_list);
++	}
++	if (node == NULL || node->object > object) {
++		dmu_prefetch(ra->os, object, 1, offset, length,
++		    ZIO_PRIORITY_SYNC_READ);
++	}
++}
++
++/*
++ * Read records off the stream, issuing any necessary prefetches.
++ */
+ static int
+-restore_process_record(struct restorearg *ra)
++receive_read_record(struct receive_arg *ra)
+ {
+ 	int err;
+ 
+-	switch (ra->drr->drr_type) {
++	switch (ra->rrd->header.drr_type) {
+ 	case DRR_OBJECT:
+ 	{
+-		struct drr_object *drro = &ra->drr->drr_u.drr_object;
+-		err = restore_read_payload_and_next_header(ra,
+-		    P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
+-		if (err != 0)
++		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
++		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
++		void *buf = kmem_zalloc(size, KM_SLEEP);
++		dmu_object_info_t doi;
++		err = receive_read_payload_and_next_header(ra, size, buf);
++		if (err != 0) {
++			kmem_free(buf, size);
+ 			return (err);
+-		return (restore_object(ra, drro, ra->buf));
++		}
++		err = dmu_object_info(ra->os, drro->drr_object, &doi);
++		/*
++		 * See receive_read_prefetch for an explanation why we're
++		 * storing this object in the ignore_obj_list.
++		 */
++		if (err == ENOENT ||
++		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
++			struct receive_ign_obj_node *node =
++			    kmem_zalloc(sizeof (*node),
++			    KM_SLEEP);
++			node->object = drro->drr_object;
++#ifdef ZFS_DEBUG
++			struct receive_ign_obj_node *last_object =
++			    list_tail(&ra->ignore_obj_list);
++			uint64_t last_objnum = (last_object != NULL ?
++			    last_object->object : 0);
++			ASSERT3U(node->object, >, last_objnum);
++#endif
++			list_insert_tail(&ra->ignore_obj_list, node);
++			err = 0;
++		}
++		return (err);
+ 	}
+ 	case DRR_FREEOBJECTS:
+ 	{
+-		struct drr_freeobjects *drrfo =
+-		    &ra->drr->drr_u.drr_freeobjects;
+-		err = restore_read_payload_and_next_header(ra, 0, NULL);
+-		if (err != 0)
+-			return (err);
+-		return (restore_freeobjects(ra, drrfo));
++		err = receive_read_payload_and_next_header(ra, 0, NULL);
++		return (err);
+ 	}
+ 	case DRR_WRITE:
+ 	{
+-		struct drr_write *drrw = &ra->drr->drr_u.drr_write;
++		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
+ 		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ 		    drrw->drr_length);
+ 
+-		err = restore_read_payload_and_next_header(ra,
++		err = receive_read_payload_and_next_header(ra,
+ 		    drrw->drr_length, abuf->b_data);
+-		if (err != 0)
+-			return (err);
+-		err = restore_write(ra, drrw, abuf);
+-		/* if restore_write() is successful, it consumes the arc_buf */
+-		if (err != 0)
++		if (err != 0) {
+ 			dmu_return_arcbuf(abuf);
++			return (err);
++		}
++		ra->rrd->write_buf = abuf;
++		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
++		    drrw->drr_length);
+ 		return (err);
+ 	}
+ 	case DRR_WRITE_BYREF:
+ 	{
+-		struct drr_write_byref *drrwbr =
+-		    &ra->drr->drr_u.drr_write_byref;
+-		err = restore_read_payload_and_next_header(ra, 0, NULL);
+-		if (err != 0)
+-			return (err);
+-		return (restore_write_byref(ra, drrwbr));
++		struct drr_write_byref *drrwb =
++		    &ra->rrd->header.drr_u.drr_write_byref;
++		err = receive_read_payload_and_next_header(ra, 0, NULL);
++		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
++		    drrwb->drr_length);
++		return (err);
+ 	}
+ 	case DRR_WRITE_EMBEDDED:
+ 	{
+ 		struct drr_write_embedded *drrwe =
+-		    &ra->drr->drr_u.drr_write_embedded;
+-		err = restore_read_payload_and_next_header(ra,
+-		    P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
+-		if (err != 0)
++		    &ra->rrd->header.drr_u.drr_write_embedded;
++		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
++		void *buf = kmem_zalloc(size, KM_SLEEP);
++
++		err = receive_read_payload_and_next_header(ra, size, buf);
++		if (err != 0) {
++			kmem_free(buf, size);
+ 			return (err);
+-		return (restore_write_embedded(ra, drrwe, ra->buf));
++		}
++
++		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
++		    drrwe->drr_length);
++		return (err);
+ 	}
+ 	case DRR_FREE:
+ 	{
+-		struct drr_free *drrf = &ra->drr->drr_u.drr_free;
+-		err = restore_read_payload_and_next_header(ra, 0, NULL);
+-		if (err != 0)
+-			return (err);
+-		return (restore_free(ra, drrf));
++		/*
++		 * It might be beneficial to prefetch indirect blocks here, but
++		 * we don't really have the data to decide for sure.
++		 */
++		err = receive_read_payload_and_next_header(ra, 0, NULL);
++		return (err);
+ 	}
+ 	case DRR_END:
+ 	{
+-		struct drr_end *drre = &ra->drr->drr_u.drr_end;
++		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
+ 		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+ 			return (SET_ERROR(EINVAL));
+ 		return (0);
+ 	}
+ 	case DRR_SPILL:
+ 	{
+-		struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
+-		err = restore_read_payload_and_next_header(ra,
+-		    drrs->drr_length, ra->buf);
++		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
++		void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
++		err = receive_read_payload_and_next_header(ra, drrs->drr_length,
++		    buf);
+ 		if (err != 0)
+-			return (err);
+-		return (restore_spill(ra, drrs, ra->buf));
++			kmem_free(buf, drrs->drr_length);
++		return (err);
+ 	}
+ 	default:
+ 		return (SET_ERROR(EINVAL));
+@@ -2208,17 +2573,16 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+     int cleanup_fd, uint64_t *action_handlep)
+ {
+ 	int err = 0;
+-	struct restorearg ra = { 0 };
++	struct receive_arg ra = { 0 };
++	struct receive_writer_arg rwa = { 0 };
+ 	int featureflags;
+ 
+ 	ra.byteswap = drc->drc_byteswap;
+ 	ra.cksum = drc->drc_cksum;
+ 	ra.vp = vp;
+ 	ra.voff = *voffp;
+-	ra.bufsize = SPA_MAXBLOCKSIZE;
+-	ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
+-	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+-	ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
++	list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
++	    offsetof(struct receive_ign_obj_node, node));
+ 
+ 	/* these were verified in dmu_recv_begin */
+ 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+@@ -2348,10 +2756,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
+ 		dmu_recv_cleanup_ds(drc);
+ 	}
+ 
+-	kmem_free(ra.drr, sizeof (*ra.drr));
+-	kmem_free(ra.buf, ra.bufsize);
+-	kmem_free(ra.next_drr, sizeof (*ra.next_drr));
+ 	*voffp = ra.voff;
++	for (struct receive_ign_obj_node *n =
++	    list_remove_head(&ra.ignore_obj_list); n != NULL;
++	    n = list_remove_head(&ra.ignore_obj_list)) {
++		kmem_free(n, sizeof (*n));
++	}
++	list_destroy(&ra.ignore_obj_list);
+ 	return (err);
+ }
+ 
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 12d099bfd414..07164161bb94 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -157,7 +157,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
 		 * If we already visited this bp & everything below,
 		 * don't bother doing it again.
 		 */
-		if (zbookmark_is_before(dnp, zb, td->td_resume))
+		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
 			return (RESUME_SKIP_ALL);
 
 		/*
@@ -428,6 +428,17 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 	int j, err = 0;
 	zbookmark_phys_t czb;
 
+	if (td->td_flags & TRAVERSE_PRE) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
+
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
@@ -435,10 +446,21 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 			break;
 	}
 
-	if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
 	}
+
+	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
 	return (err);
 }
 
@@ -451,6 +473,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	ASSERT(pfd->pd_bytes_fetched >= 0);
+	if (bp == NULL)
+		return (0);
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
diff --git a/module/zfs/dmu_traverse.c.orig b/module/zfs/dmu_traverse.c.orig
new file mode 100644
index 000000000000..12d099bfd414
--- /dev/null
+++ b/module/zfs/dmu_traverse.c.orig
@@ -0,0 +1,672 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/callb.h>
+#include <sys/zfeature.h>
+
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
+
+typedef struct prefetch_data {
+	kmutex_t pd_mtx;
+	kcondvar_t pd_cv;
+	int32_t pd_bytes_fetched;
+	int pd_flags;
+	boolean_t pd_cancel;
+	boolean_t pd_exited;
+} prefetch_data_t;
+
+typedef struct traverse_data {
+	spa_t *td_spa;
+	uint64_t td_objset;
+	blkptr_t *td_rootbp;
+	uint64_t td_min_txg;
+	zbookmark_phys_t *td_resume;
+	int td_flags;
+	prefetch_data_t *td_pfd;
+	boolean_t td_paused;
+	uint64_t td_hole_birth_enabled_txg;
+	blkptr_cb_t *td_func;
+	void *td_arg;
+} traverse_data_t;
+
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+    uint64_t objset, uint64_t object);
+
+static int
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	traverse_data_t *td = arg;
+	zbookmark_phys_t zb;
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
+		return (0);
+
+	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+	return (0);
+}
+
+static int
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	traverse_data_t *td = arg;
+
+	if (lrc->lrc_txtype == TX_WRITE) {
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_phys_t zb;
+
+		if (BP_IS_HOLE(bp))
+			return (0);
+
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return (0);
+
+		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+		    td->td_arg);
+	}
+	return (0);
+}
+
+static void
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed; plus, in read-only mode, blocks that are already stable.
+	 */
+	if (claim_txg == 0 && spa_writeable(td->td_spa))
+		return;
+
+	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+	    claim_txg);
+
+	zil_free(zilog);
+}
+
+typedef enum resume_skip {
+	RESUME_SKIP_ALL,
+	RESUME_SKIP_NONE,
+	RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+    const zbookmark_phys_t *zb)
+{
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+		/*
+		 * If we already visited this bp & everything below,
+		 * don't bother doing it again.
+		 */
+		if (zbookmark_is_before(dnp, zb, td->td_resume))
+			return (RESUME_SKIP_ALL);
+
+		/*
+		 * If we found the block we're trying to resume from, zero
+		 * the bookmark out to indicate that we have resumed.
+		 */
+		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+			bzero(td->td_resume, sizeof (*zb));
+			if (td->td_flags & TRAVERSE_POST)
+				return (RESUME_SKIP_CHILDREN);
+		}
+	}
+	return (RESUME_SKIP_NONE);
+}
+
+static void
+traverse_prefetch_metadata(traverse_data_t *td,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
+		return;
+	/*
+	 * If we are in the process of resuming, don't prefetch, because
+	 * some children will not be needed (and in fact may have already
+	 * been freed).
+	 */
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+		return;
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+		return;
+	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
+		return;
+
+	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
+	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+}
+
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+static int
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	int err = 0;
+	arc_buf_t *buf = NULL;
+	prefetch_data_t *pd = td->td_pfd;
+
+	switch (resume_skip_check(td, dnp, zb)) {
+	case RESUME_SKIP_ALL:
+		return (0);
+	case RESUME_SKIP_CHILDREN:
+		goto post;
+	case RESUME_SKIP_NONE:
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	if (bp->blk_birth == 0) {
+		/*
+		 * Since this block has a birth time of 0 it must be a
+		 * hole created before the SPA_FEATURE_HOLE_BIRTH
+		 * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
+		 * was enabled before the min_txg for this traveral we
+		 * know the hole must have been created before the
+		 * min_txg for this traveral, so we can skip it. If
+		 * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
+		 * for this traveral we cannot tell if the hole was
+		 * created before or after the min_txg for this
+		 * traversal, so we cannot skip it.
+		 */
+		if (td->td_hole_birth_enabled_txg < td->td_min_txg)
+			return (0);
+	} else if (bp->blk_birth <= td->td_min_txg) {
+		return (0);
+	}
+
+	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+		uint64_t size = BP_GET_LSIZE(bp);
+		mutex_enter(&pd->pd_mtx);
+		ASSERT(pd->pd_bytes_fetched >= 0);
+		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
+			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+		pd->pd_bytes_fetched -= size;
+		cv_broadcast(&pd->pd_cv);
+		mutex_exit(&pd->pd_mtx);
+	}
+
+	if (BP_IS_HOLE(bp)) {
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+		if (err != 0)
+			goto post;
+		return (0);
+	}
+
+	if (td->td_flags & TRAVERSE_PRE) {
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			goto post;
+	}
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		uint32_t flags = ARC_FLAG_WAIT;
+		int32_t i;
+		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		zbookmark_phys_t *czb;
+
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err != 0)
+			goto post;
+
+		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+		for (i = 0; i < epb; i++) {
+			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			traverse_prefetch_metadata(td,
+			    &((blkptr_t *)buf->b_data)[i], czb);
+		}
+
+		/* recursively visitbp() blocks below this */
+		for (i = 0; i < epb; i++) {
+			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			err = traverse_visitbp(td, dnp,
+			    &((blkptr_t *)buf->b_data)[i], czb);
+			if (err != 0)
+				break;
+		}
+
+		kmem_free(czb, sizeof (zbookmark_phys_t));
+
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		uint32_t flags = ARC_FLAG_WAIT;
+		int32_t i;
+		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+		dnode_phys_t *cdnp;
+
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err != 0)
+			goto post;
+		cdnp = buf->b_data;
+
+		for (i = 0; i < epb; i++) {
+			prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
+			    zb->zb_blkid * epb + i);
+		}
+
+		/* recursively visitbp() blocks below this */
+		for (i = 0; i < epb; i++) {
+			err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
+			    zb->zb_blkid * epb + i);
+			if (err != 0)
+				break;
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		objset_phys_t *osp;
+		dnode_phys_t *mdnp, *gdnp, *udnp;
+
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err != 0)
+			goto post;
+
+		osp = buf->b_data;
+		mdnp = &osp->os_meta_dnode;
+		gdnp = &osp->os_groupused_dnode;
+		udnp = &osp->os_userused_dnode;
+
+		prefetch_dnode_metadata(td, mdnp, zb->zb_objset,
+		    DMU_META_DNODE_OBJECT);
+		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			prefetch_dnode_metadata(td, gdnp, zb->zb_objset,
+			    DMU_GROUPUSED_OBJECT);
+			prefetch_dnode_metadata(td, udnp, zb->zb_objset,
+			    DMU_USERUSED_OBJECT);
+		}
+
+		err = traverse_dnode(td, mdnp, zb->zb_objset,
+		    DMU_META_DNODE_OBJECT);
+		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			err = traverse_dnode(td, gdnp, zb->zb_objset,
+			    DMU_GROUPUSED_OBJECT);
+		}
+		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			err = traverse_dnode(td, udnp, zb->zb_objset,
+			    DMU_USERUSED_OBJECT);
+		}
+	}
+
+	if (buf)
+		(void) arc_buf_remove_ref(buf, &buf);
+
+post:
+	if (err == 0 && (td->td_flags & TRAVERSE_POST))
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+
+	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
+		/*
+		 * Ignore this disk error as requested by the HARD flag,
+		 * and continue traversal.
+		 */
+		err = 0;
+	}
+
+	/*
+	 * If we are stopping here, set td_resume.
+	 */
+	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+		td->td_resume->zb_objset = zb->zb_objset;
+		td->td_resume->zb_object = zb->zb_object;
+		td->td_resume->zb_level = 0;
+		/*
+		 * If we have stopped on an indirect block (e.g. due to
+		 * i/o error), we have not visited anything below it.
+		 * Set the bookmark to the first level-0 block that we need
+		 * to visit.  This way, the resuming code does not need to
+		 * deal with resuming from indirect blocks.
+		 */
+		td->td_resume->zb_blkid = zb->zb_blkid <<
+		    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+		td->td_paused = B_TRUE;
+	}
+
+	return (err);
+}
+
+static void
+prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+	int j;
+	zbookmark_phys_t czb;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+		traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+		traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
+	}
+}
+
+static int
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+	int j, err = 0;
+	zbookmark_phys_t czb;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
+		if (err != 0)
+			break;
+	}
+
+	if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
+	}
+	return (err);
+}
+
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	prefetch_data_t *pfd = arg;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	ASSERT(pfd->pd_bytes_fetched >= 0);
+	if (pfd->pd_cancel)
+		return (SET_ERROR(EINTR));
+
+	if (!prefetch_needed(pfd, bp))
+		return (0);
+
+	mutex_enter(&pfd->pd_mtx);
+	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
+		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
+	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
+	cv_broadcast(&pfd->pd_cv);
+	mutex_exit(&pfd->pd_mtx);
+
+	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
+
+	return (0);
+}
+
+static void
+traverse_prefetch_thread(void *arg)
+{
+	traverse_data_t *td_main = arg;
+	traverse_data_t td = *td_main;
+	zbookmark_phys_t czb;
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+
+	td.td_func = traverse_prefetcher;
+	td.td_arg = td_main->td_pfd;
+	td.td_pfd = NULL;
+
+	SET_BOOKMARK(&czb, td.td_objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
+
+	mutex_enter(&td_main->td_pfd->pd_mtx);
+	td_main->td_pfd->pd_exited = B_TRUE;
+	cv_broadcast(&td_main->td_pfd->pd_cv);
+	mutex_exit(&td_main->td_pfd->pd_mtx);
+	spl_fstrans_unmark(cookie);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	traverse_data_t *td;
+	prefetch_data_t *pd;
+	zbookmark_phys_t *czb;
+	int err;
+
+	ASSERT(ds == NULL || objset == ds->ds_object);
+	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
+	/*
+	 * The data prefetching mechanism (the prefetch thread) is incompatible
+	 * with resuming from a bookmark.
+	 */
+	ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
+
+	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
+	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
+	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+	td->td_spa = spa;
+	td->td_objset = objset;
+	td->td_rootbp = rootbp;
+	td->td_min_txg = txg_start;
+	td->td_resume = resume;
+	td->td_func = func;
+	td->td_arg = arg;
+	td->td_pfd = pd;
+	td->td_flags = flags;
+	td->td_paused = B_FALSE;
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+		VERIFY(spa_feature_enabled_txg(spa,
+		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
+	} else {
+		td->td_hole_birth_enabled_txg = 0;
+	}
+
+	pd->pd_flags = flags;
+	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
+
+	SET_BOOKMARK(czb, td->td_objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+	/* See comment on ZIL traversal in dsl_scan_visitds. */
+	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+		uint32_t flags = ARC_FLAG_WAIT;
+		objset_phys_t *osp;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, td->td_spa, rootbp,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
+		if (err != 0)
+			return (err);
+
+		osp = buf->b_data;
+		traverse_zil(td, &osp->os_zil_header);
+		(void) arc_buf_remove_ref(buf, &buf);
+	}
+
+	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
+	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
+	    td, TQ_NOQUEUE))
+		pd->pd_exited = B_TRUE;
+
+	err = traverse_visitbp(td, NULL, rootbp, czb);
+
+	mutex_enter(&pd->pd_mtx);
+	pd->pd_cancel = B_TRUE;
+	cv_broadcast(&pd->pd_cv);
+	while (!pd->pd_exited)
+		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+	mutex_exit(&pd->pd_mtx);
+
+	mutex_destroy(&pd->pd_mtx);
+	cv_destroy(&pd->pd_cv);
+
+	kmem_free(czb, sizeof (zbookmark_phys_t));
+	kmem_free(pd, sizeof (struct prefetch_data));
+	kmem_free(td, sizeof (struct traverse_data));
+
+	return (err);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+	    &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+	    blkptr, txg_start, resume, flags, func, arg));
+}
+
+/*
+ * NB: pool must not be changing on-disk (eg, from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	int err;
+	uint64_t obj;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	objset_t *mos = dp->dp_meta_objset;
+	boolean_t hard = (flags & TRAVERSE_HARD);
+
+	/* visit the MOS */
+	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+	    txg_start, NULL, flags, func, arg);
+	if (err != 0)
+		return (err);
+
+	/* visit each dataset */
+	for (obj = 1; err == 0;
+	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+		dmu_object_info_t doi;
+
+		err = dmu_object_info(mos, obj, &doi);
+		if (err != 0) {
+			if (hard)
+				continue;
+			break;
+		}
+
+		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
+			dsl_dataset_t *ds;
+			uint64_t txg = txg_start;
+
+			dsl_pool_config_enter(dp, FTAG);
+			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+			dsl_pool_config_exit(dp, FTAG);
+			if (err != 0) {
+				if (hard)
+					continue;
+				break;
+			}
+			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+			err = traverse_dataset(ds, txg, flags, func, arg);
+			dsl_dataset_rele(ds, FTAG);
+			if (err != 0)
+				break;
+		}
+	}
+	if (err == ESRCH)
+		err = 0;
+	return (err);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(traverse_dataset);
+EXPORT_SYMBOL(traverse_pool);
+
+module_param(zfs_pd_bytes_max, int, 0644);
+MODULE_PARM_DESC(zfs_pd_bytes_max, "Max number of bytes to prefetch");
+#endif
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 5ae429f70866..016defe79db8 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -332,7 +332,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+			err = dbuf_hold_impl(dn, 0, start,
+			    FALSE, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
 
 			if (err) {
@@ -533,7 +534,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
-		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+		    FALSE, FALSE, FTAG, &dbuf);
 		if (err) {
 			txh->txh_tx->tx_err = err;
 			break;
diff --git a/module/zfs/dmu_tx.c.orig b/module/zfs/dmu_tx.c.orig
new file mode 100644
index 000000000000..5ae429f70866
--- /dev/null
+++ b/module/zfs/dmu_tx.c.orig
@@ -0,0 +1,1692 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
+#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h> /* for fzap_default_block_shift */
+#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/varargs.h>
+#include <sys/trace_dmu.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+    uint64_t arg1, uint64_t arg2);
+
+dmu_tx_stats_t dmu_tx_stats = {
+	{ "dmu_tx_assigned",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_delay",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_error",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_suspended",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_group",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reserve",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reclaim",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
+	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dmu_tx_ksp;
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+	tx->tx_dir = dd;
+	if (dd != NULL)
+		tx->tx_pool = dd->dd_pool;
+	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+	    offsetof(dmu_tx_hold_t, txh_node));
+	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+	    offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
+#ifdef DEBUG_DMU_TX
+	refcount_create(&tx->tx_space_written);
+	refcount_create(&tx->tx_space_freed);
+#endif
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
+	tx->tx_objset = os;
+	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+	tx->tx_pool = dp;
+	tx->tx_txg = txg;
+	tx->tx_anyobj = TRUE;
+
+	return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+	dmu_tx_hold_t *txh;
+	dnode_t *dn = NULL;
+	int err;
+
+	if (object != DMU_NEW_OBJECT) {
+		err = dnode_hold(os, object, tx, &dn);
+		if (err) {
+			tx->tx_err = err;
+			return (NULL);
+		}
+
+		if (err == 0 && tx->tx_txg != 0) {
+			mutex_enter(&dn->dn_mtx);
+			/*
+			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+			 * problem, but there's no way for it to happen (for
+			 * now, at least).
+			 */
+			ASSERT(dn->dn_assigned_txg == 0);
+			dn->dn_assigned_txg = tx->tx_txg;
+			(void) refcount_add(&dn->dn_tx_holds, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
+	}
+
+	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+	txh->txh_tx = tx;
+	txh->txh_dnode = dn;
+#ifdef DEBUG_DMU_TX
+	txh->txh_type = type;
+	txh->txh_arg1 = arg1;
+	txh->txh_arg2 = arg2;
+#endif
+	list_insert_tail(&tx->tx_holds, txh);
+
+	return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
+{
+	/*
+	 * If we're syncing, they can manipulate any object anyhow, and
+	 * the hold on the dnode_t can cause problems.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		(void) dmu_tx_hold_object_impl(tx, os,
+		    object, THT_NEWOBJECT, 0, 0);
+	}
+}
+
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+	int err;
+	dmu_buf_impl_t *db;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold_level(dn, level, blkid, FTAG);
+	rw_exit(&dn->dn_struct_rwlock);
+	if (db == NULL)
+		return (SET_ERROR(EIO));
+	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+	dbuf_rele(db, FTAG);
+	return (err);
+}
+
+static void
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
+{
+	objset_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	dmu_buf_impl_t *parent = NULL;
+	blkptr_t *bp = NULL;
+	uint64_t space;
+
+	if (level >= dn->dn_nlevels || history[level] == blkid)
+		return;
+
+	history[level] = blkid;
+
+	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
+
+	if (db == NULL || db == dn->dn_dbuf) {
+		ASSERT(level != 0);
+		db = NULL;
+	} else {
+		ASSERT(DB_DNODE(db) == dn);
+		ASSERT(db->db_level == level);
+		ASSERT(db->db.db_size == space);
+		ASSERT(db->db_blkid == blkid);
+		bp = db->db_blkptr;
+		parent = db->db_parent;
+	}
+
+	freeable = (bp && (freeable ||
+	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
+
+	if (freeable)
+		txh->txh_space_tooverwrite += space;
+	else
+		txh->txh_space_towrite += space;
+	if (bp)
+		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+	dmu_tx_count_twig(txh, dn, parent, level + 1,
+	    blkid >> epbs, freeable, history);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+	dnode_t *dn = txh->txh_dnode;
+	uint64_t start, end, i;
+	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
+	int err = 0;
+	int l;
+
+	if (len == 0)
+		return;
+
+	min_bs = SPA_MINBLOCKSHIFT;
+	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
+	min_ibs = DN_MIN_INDBLKSHIFT;
+	max_ibs = DN_MAX_INDBLKSHIFT;
+
+	if (dn) {
+		uint64_t history[DN_MAX_LEVELS];
+		int nlvls = dn->dn_nlevels;
+		int delta;
+
+		/*
+		 * For i/o error checking, read the first and last level-0
+		 * blocks (if they are not aligned), and all the level-1 blocks.
+		 */
+		if (dn->dn_maxblkid == 0) {
+			delta = dn->dn_datablksz;
+			start = (off < dn->dn_datablksz) ? 0 : 1;
+			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
+			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
+				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+				if (err)
+					goto out;
+				delta -= off;
+			}
+		} else {
+			zio_t *zio = zio_root(dn->dn_objset->os_spa,
+			    NULL, NULL, ZIO_FLAG_CANFAIL);
+
+			/* first level-0 block */
+			start = off >> dn->dn_datablkshift;
+			if (P2PHASE(off, dn->dn_datablksz) ||
+			    len < dn->dn_datablksz) {
+				err = dmu_tx_check_ioerr(zio, dn, 0, start);
+				if (err)
+					goto out;
+			}
+
+			/* last level-0 block */
+			end = (off+len-1) >> dn->dn_datablkshift;
+			if (end != start && end <= dn->dn_maxblkid &&
+			    P2PHASE(off+len, dn->dn_datablksz)) {
+				err = dmu_tx_check_ioerr(zio, dn, 0, end);
+				if (err)
+					goto out;
+			}
+
+			/* level-1 blocks */
+			if (nlvls > 1) {
+				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+				for (i = (start>>shft)+1; i < end>>shft; i++) {
+					err = dmu_tx_check_ioerr(zio, dn, 1, i);
+					if (err)
+						goto out;
+				}
+			}
+
+			err = zio_wait(zio);
+			if (err)
+				goto out;
+			delta = P2NPHASE(off, dn->dn_datablksz);
+		}
+
+		min_ibs = max_ibs = dn->dn_indblkshift;
+		if (dn->dn_maxblkid > 0) {
+			/*
+			 * The blocksize can't change,
+			 * so we can make a more precise estimate.
+			 */
+			ASSERT(dn->dn_datablkshift != 0);
+			min_bs = max_bs = dn->dn_datablkshift;
+		} else {
+			/*
+			 * The blocksize can increase up to the recordsize,
+			 * or if it is already more than the recordsize,
+			 * up to the next power of 2.
+			 */
+			min_bs = highbit64(dn->dn_datablksz - 1);
+			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
+		}
+
+		/*
+		 * If this write is not off the end of the file
+		 * we need to account for overwrites/unref.
+		 */
+		if (start <= dn->dn_maxblkid) {
+			for (l = 0; l < DN_MAX_LEVELS; l++)
+				history[l] = -1ULL;
+		}
+		while (start <= dn->dn_maxblkid) {
+			dmu_buf_impl_t *db;
+
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+			rw_exit(&dn->dn_struct_rwlock);
+
+			if (err) {
+				txh->txh_tx->tx_err = err;
+				return;
+			}
+
+			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+			    history);
+			dbuf_rele(db, FTAG);
+			if (++start > end) {
+				/*
+				 * Account for new indirects appearing
+				 * before this IO gets assigned into a txg.
+				 */
+				bits = 64 - min_bs;
+				epbs = min_ibs - SPA_BLKPTRSHIFT;
+				for (bits -= epbs * (nlvls - 1);
+				    bits >= 0; bits -= epbs)
+					txh->txh_fudge += 1ULL << max_ibs;
+				goto out;
+			}
+			off += delta;
+			if (len >= delta)
+				len -= delta;
+			delta = dn->dn_datablksz;
+		}
+	}
+
+	/*
+	 * 'end' is the last thing we will access, not one past.
+	 * This way we won't overflow when accessing the last byte.
+	 */
+	start = P2ALIGN(off, 1ULL << max_bs);
+	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
+	txh->txh_space_towrite += end - start + 1;
+
+	start >>= min_bs;
+	end >>= min_bs;
+
+	epbs = min_ibs - SPA_BLKPTRSHIFT;
+
+	/*
+	 * The object contains at most 2^(64 - min_bs) blocks,
+	 * and each indirect level maps 2^epbs.
+	 */
+	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
+		start >>= epbs;
+		end >>= epbs;
+		ASSERT3U(end, >=, start);
+		txh->txh_space_towrite += (end - start + 1) << max_ibs;
+		if (start != 0) {
+			/*
+			 * We also need a new blkid=0 indirect block
+			 * to reference any existing file data.
+			 */
+			txh->txh_space_towrite += 1ULL << max_ibs;
+		}
+	}
+
+out:
+	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+	    2 * DMU_MAX_ACCESS)
+		err = SET_ERROR(EFBIG);
+
+	if (err)
+		txh->txh_tx->tx_err = err;
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+	dnode_t *dn = txh->txh_dnode;
+	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
+	uint64_t space = mdn->dn_datablksz +
+	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
+
+	if (dn && dn->dn_dbuf->db_blkptr &&
+	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
+		txh->txh_space_tooverwrite += space;
+		txh->txh_space_tounref += space;
+	} else {
+		txh->txh_space_towrite += space;
+		if (dn && dn->dn_dbuf->db_blkptr)
+			txh->txh_space_tounref += space;
+	}
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(len <= DMU_MAX_ACCESS);
+	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_WRITE, off, len);
+	if (txh == NULL)
+		return;
+
+	dmu_tx_count_write(txh, off, len);
+	dmu_tx_count_dnode(txh);
+}
+
+static void
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+	uint64_t blkid, nblks, lastblk;
+	uint64_t space = 0, unref = 0, skipped = 0;
+	dnode_t *dn = txh->txh_dnode;
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+	int epbs;
+	uint64_t l0span = 0, nl1blks = 0;
+
+	if (dn->dn_nlevels == 0)
+		return;
+
+	/*
+	 * The struct_rwlock protects us against dn_nlevels
+	 * changing, in case (against all odds) we manage to dirty &
+	 * sync out the changes after we check for being dirty.
+	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	if (dn->dn_maxblkid == 0) {
+		if (off == 0 && len >= dn->dn_datablksz) {
+			blkid = 0;
+			nblks = 1;
+		} else {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+	} else {
+		blkid = off >> dn->dn_datablkshift;
+		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
+
+		if (blkid > dn->dn_maxblkid) {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+		if (blkid + nblks > dn->dn_maxblkid)
+			nblks = dn->dn_maxblkid - blkid + 1;
+
+	}
+	l0span = nblks;    /* save for later use to calc level > 1 overhead */
+	if (dn->dn_nlevels == 1) {
+		int i;
+		for (i = 0; i < nblks; i++) {
+			blkptr_t *bp = dn->dn_phys->dn_blkptr;
+			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
+			bp += blkid + i;
+			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
+				dprintf_bp(bp, "can free old%s", "");
+				space += bp_get_dsize(spa, bp);
+			}
+			unref += BP_GET_ASIZE(bp);
+		}
+		nl1blks = 1;
+		nblks = 0;
+	}
+
+	lastblk = blkid + nblks - 1;
+	while (nblks) {
+		dmu_buf_impl_t *dbuf;
+		uint64_t ibyte, new_blkid;
+		int epb = 1 << epbs;
+		int err, i, blkoff, tochk;
+		blkptr_t *bp;
+
+		ibyte = blkid << dn->dn_datablkshift;
+		err = dnode_next_offset(dn,
+		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
+		new_blkid = ibyte >> dn->dn_datablkshift;
+		if (err == ESRCH) {
+			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+			break;
+		}
+		if (err) {
+			txh->txh_tx->tx_err = err;
+			break;
+		}
+		if (new_blkid > lastblk) {
+			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
+			break;
+		}
+
+		if (new_blkid > blkid) {
+			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
+			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
+			nblks -= new_blkid - blkid;
+			blkid = new_blkid;
+		}
+		blkoff = P2PHASE(blkid, epb);
+		tochk = MIN(epb - blkoff, nblks);
+
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+		if (err) {
+			txh->txh_tx->tx_err = err;
+			break;
+		}
+
+		txh->txh_memory_tohold += dbuf->db.db_size;
+
+		/*
+		 * We don't check memory_tohold against DMU_MAX_ACCESS because
+		 * memory_tohold is an over-estimation (especially the >L1
+		 * indirect blocks), so it could fail.  Callers should have
+		 * already verified that they will not be holding too much
+		 * memory.
+		 */
+
+		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+		if (err != 0) {
+			txh->txh_tx->tx_err = err;
+			dbuf_rele(dbuf, FTAG);
+			break;
+		}
+
+		bp = dbuf->db.db_data;
+		bp += blkoff;
+
+		for (i = 0; i < tochk; i++) {
+			if (dsl_dataset_block_freeable(ds, &bp[i],
+			    bp[i].blk_birth)) {
+				dprintf_bp(&bp[i], "can free old%s", "");
+				space += bp_get_dsize(spa, &bp[i]);
+			}
+			unref += BP_GET_ASIZE(bp);
+		}
+		dbuf_rele(dbuf, FTAG);
+
+		++nl1blks;
+		blkid += tochk;
+		nblks -= tochk;
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/*
+	 * Add in memory requirements of higher-level indirects.
+	 * This assumes a worst-possible scenario for dn_nlevels and a
+	 * worst-possible distribution of l1-blocks over the region to free.
+	 */
+	{
+		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
+		int level = 2;
+		/*
+		 * Here we don't use DN_MAX_LEVEL, but calculate it with the
+		 * given datablkshift and indblkshift. This makes the
+		 * difference between 19 and 8 on large files.
+		 */
+		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+		while (level++ < maxlevel) {
+			txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
+			    << dn->dn_indblkshift;
+			blkcnt = 1 + (blkcnt >> epbs);
+		}
+	}
+
+	/* account for new level 1 indirect blocks that might show up */
+	if (skipped > 0) {
+		txh->txh_fudge += skipped << dn->dn_indblkshift;
+		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
+		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+	}
+	txh->txh_space_tofree += space;
+	txh->txh_space_tounref += unref;
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+	dmu_tx_hold_t *txh;
+	dnode_t *dn;
+	int err;
+	zio_t *zio;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_FREE, off, len);
+	if (txh == NULL)
+		return;
+	dn = txh->txh_dnode;
+	dmu_tx_count_dnode(txh);
+
+	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+		return;
+	if (len == DMU_OBJECT_END)
+		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
+	dmu_tx_count_dnode(txh);
+
+	/*
+	 * For i/o error checking, we read the first and last level-0
+	 * blocks if they are not aligned, and all the level-1 blocks.
+	 *
+	 * Note:  dbuf_free_range() assumes that we have not instantiated
+	 * any level-0 dbufs that will be completely freed.  Therefore we must
+	 * exercise care to not read or count the first and last blocks
+	 * if they are blocksize-aligned.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		if (off != 0 || len < dn->dn_datablksz)
+			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+	} else {
+		/* first block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off, 1);
+		/* last block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off+len, 1);
+	}
+
+	/*
+	 * Check level-1 blocks.
+	 */
+	if (dn->dn_nlevels > 1) {
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		uint64_t start = off >> shift;
+		uint64_t end = (off + len) >> shift;
+		uint64_t i;
+
+		ASSERT(dn->dn_indblkshift != 0);
+
+		/*
+		 * dnode_reallocate() can result in an object with indirect
+		 * blocks having an odd data block size.  In this case,
+		 * just check the single block.
+		 */
+		if (dn->dn_datablkshift == 0)
+			start = end = 0;
+
+		zio = zio_root(tx->tx_pool->dp_spa,
+		    NULL, NULL, ZIO_FLAG_CANFAIL);
+		for (i = start; i <= end; i++) {
+			uint64_t ibyte = i << shift;
+			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (err == ESRCH || i > end)
+				break;
+			if (err) {
+				tx->tx_err = err;
+				return;
+			}
+
+			err = dmu_tx_check_ioerr(zio, dn, 1, i);
+			if (err) {
+				tx->tx_err = err;
+				return;
+			}
+		}
+		err = zio_wait(zio);
+		if (err) {
+			tx->tx_err = err;
+			return;
+		}
+	}
+
+	dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
+{
+	dmu_tx_hold_t *txh;
+	dnode_t *dn;
+	dsl_dataset_phys_t *ds_phys;
+	uint64_t nblocks;
+	int epbs, err;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_ZAP, add, (uintptr_t)name);
+	if (txh == NULL)
+		return;
+	dn = txh->txh_dnode;
+
+	dmu_tx_count_dnode(txh);
+
+	if (dn == NULL) {
+		/*
+		 * We will be able to fit a new object's entries into one leaf
+		 * block.  So there will be at most 2 blocks total,
+		 * including the header block.
+		 */
+		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
+		return;
+	}
+
+	ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
+
+	if (dn->dn_maxblkid == 0 && !add) {
+		blkptr_t *bp;
+
+		/*
+		 * If there is only one block  (i.e. this is a micro-zap)
+		 * and we are not adding anything, the accounting is simple.
+		 */
+		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+		if (err) {
+			tx->tx_err = err;
+			return;
+		}
+
+		/*
+		 * Use max block size here, since we don't know how much
+		 * the size will change between now and the dbuf dirty call.
+		 */
+		bp = &dn->dn_phys->dn_blkptr[0];
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    bp, bp->blk_birth))
+			txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
+		else
+			txh->txh_space_towrite += MZAP_MAX_BLKSZ;
+		if (!BP_IS_HOLE(bp))
+			txh->txh_space_tounref += MZAP_MAX_BLKSZ;
+		return;
+	}
+
+	if (dn->dn_maxblkid > 0 && name) {
+		/*
+		 * access the name in this fat-zap so that we'll check
+		 * for i/o errors to the leaf blocks, etc.
+		 */
+		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
+		    8, 0, NULL);
+		if (err == EIO) {
+			tx->tx_err = err;
+			return;
+		}
+	}
+
+	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
+	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
+
+	/*
+	 * If the modified blocks are scattered to the four winds,
+	 * we'll have to modify an indirect twig for each.
+	 */
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
+	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
+		if (ds_phys->ds_prev_snap_obj)
+			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+		else
+			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_BONUS, 0, 0);
+	if (txh)
+		dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg == 0);
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
+	if (txh)
+		txh->txh_space_towrite += space;
+}
+
+int
+dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *txh;
+	int holds = 0;
+
+	/*
+	 * By asserting that the tx is assigned, we're counting the
+	 * number of dn_tx_holds, which is the same as the number of
+	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
+	 * dn_tx_holds could be 0.
+	 */
+	ASSERT(tx->tx_txg != 0);
+
+	/* if (tx->tx_anyobj == TRUE) */
+		/* return (0); */
+
+	for (txh = list_head(&tx->tx_holds); txh;
+	    txh = list_next(&tx->tx_holds, txh)) {
+		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
+			holds++;
+	}
+
+	return (holds);
+}
+
+#ifdef DEBUG_DMU_TX
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+	dmu_tx_hold_t *txh;
+	int match_object = FALSE, match_offset = FALSE;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	ASSERT(dn != NULL);
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
+	ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+	if (tx->tx_anyobj) {
+		DB_DNODE_EXIT(db);
+		return;
+	}
+
+	/* XXX No checking on the meta dnode for now */
+	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+		DB_DNODE_EXIT(db);
+		return;
+	}
+
+	for (txh = list_head(&tx->tx_holds); txh;
+	    txh = list_next(&tx->tx_holds, txh)) {
+		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+			match_object = TRUE;
+		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+			int datablkshift = dn->dn_datablkshift ?
+			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+			int shift = datablkshift + epbs * db->db_level;
+			uint64_t beginblk = shift >= 64 ? 0 :
+			    (txh->txh_arg1 >> shift);
+			uint64_t endblk = shift >= 64 ? 0 :
+			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+			uint64_t blkid = db->db_blkid;
+
+			/* XXX txh_arg2 better not be zero... */
+
+			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+			    txh->txh_type, beginblk, endblk);
+
+			switch (txh->txh_type) {
+			case THT_WRITE:
+				if (blkid >= beginblk && blkid <= endblk)
+					match_offset = TRUE;
+				/*
+				 * We will let this hold work for the bonus
+				 * or spill buffer so that we don't need to
+				 * hold it when creating a new object.
+				 */
+				if (blkid == DMU_BONUS_BLKID ||
+				    blkid == DMU_SPILL_BLKID)
+					match_offset = TRUE;
+				/*
+				 * They might have to increase nlevels,
+				 * thus dirtying the new TLIBs.  Or the
+				 * might have to change the block size,
+				 * thus dirying the new lvl=0 blk=0.
+				 */
+				if (blkid == 0)
+					match_offset = TRUE;
+				break;
+			case THT_FREE:
+				/*
+				 * We will dirty all the level 1 blocks in
+				 * the free range and perhaps the first and
+				 * last level 0 block.
+				 */
+				if (blkid >= beginblk && (blkid <= endblk ||
+				    txh->txh_arg2 == DMU_OBJECT_END))
+					match_offset = TRUE;
+				break;
+			case THT_SPILL:
+				if (blkid == DMU_SPILL_BLKID)
+					match_offset = TRUE;
+				break;
+			case THT_BONUS:
+				if (blkid == DMU_BONUS_BLKID)
+					match_offset = TRUE;
+				break;
+			case THT_ZAP:
+				match_offset = TRUE;
+				break;
+			case THT_NEWOBJECT:
+				match_object = TRUE;
+				break;
+			default:
+				cmn_err(CE_PANIC, "bad txh_type %d",
+				    txh->txh_type);
+			}
+		}
+		if (match_object && match_offset) {
+			DB_DNODE_EXIT(db);
+			return;
+		}
+	}
+	DB_DNODE_EXIT(db);
+	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+	    (u_longlong_t)db->db.db_object, db->db_level,
+	    (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * If we can't do 10 iops, something is wrong.  Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting.  This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time.  This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +                                              (midpoint) *    +
+ *       |                                                  |    **     |
+ *   1ms +                                                  v ***       +
+ *       |             zfs_delay_scale ---------->     ********         |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                           ** +
+ *       |                                              (midpoint)  **  |
+ *       +                                                  |     **    +
+ *   1ms +                                                  v ****      +
+ *       +             zfs_delay_scale ---------->        *****         +
+ *       |                                             ****             |
+ *       +                                          ****                +
+ * 100us +                                        **                    +
+ *       +                                       *                      +
+ *       |                                      *                       |
+ *       +                                     *                        +
+ *  10us +                                     *                        +
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                              +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+	dsl_pool_t *dp = tx->tx_pool;
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	hrtime_t wakeup, min_tx_time, now;
+
+	if (dirty <= delay_min_bytes)
+		return;
+
+	/*
+	 * The caller has already waited until we are under the max.
+	 * We make them pass us the amount of dirty data so we don't
+	 * have to handle the case of it being >= the max, which could
+	 * cause a divide-by-zero if it's == the max.
+	 */
+	ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+	now = gethrtime();
+	min_tx_time = zfs_delay_scale *
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+	if (now > tx->tx_start + min_tx_time)
+		return;
+
+	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+	    uint64_t, min_tx_time);
+
+	mutex_enter(&dp->dp_lock);
+	wakeup = MAX(tx->tx_start + min_tx_time,
+	    dp->dp_last_wakeup + min_tx_time);
+	dp->dp_last_wakeup = wakeup;
+	mutex_exit(&dp->dp_lock);
+
+	zfs_sleep_until(wakeup);
+}
+
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+{
+	dmu_tx_hold_t *txh;
+	spa_t *spa = tx->tx_pool->dp_spa;
+	uint64_t memory, asize, fsize, usize;
+	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
+
+	ASSERT0(tx->tx_txg);
+
+	if (tx->tx_err) {
+		DMU_TX_STAT_BUMP(dmu_tx_error);
+		return (tx->tx_err);
+	}
+
+	if (spa_suspended(spa)) {
+		DMU_TX_STAT_BUMP(dmu_tx_suspended);
+
+		/*
+		 * If the user has indicated a blocking failure mode
+		 * then return ERESTART which will block in dmu_tx_wait().
+		 * Otherwise, return EIO so that an error can get
+		 * propagated back to the VOP calls.
+		 *
+		 * Note that we always honor the txg_how flag regardless
+		 * of the failuremode setting.
+		 */
+		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+		    txg_how != TXG_WAIT)
+			return (SET_ERROR(EIO));
+
+		return (SET_ERROR(ERESTART));
+	}
+
+	if (!tx->tx_waited &&
+	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
+		tx->tx_wait_dirty = B_TRUE;
+		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
+		return (ERESTART);
+	}
+
+	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+	tx->tx_needassign_txh = NULL;
+
+	/*
+	 * NB: No error returns are allowed after txg_hold_open, but
+	 * before processing the dnode holds, due to the
+	 * dmu_tx_unassign() logic.
+	 */
+
+	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
+	for (txh = list_head(&tx->tx_holds); txh;
+	    txh = list_next(&tx->tx_holds, txh)) {
+		dnode_t *dn = txh->txh_dnode;
+		if (dn != NULL) {
+			mutex_enter(&dn->dn_mtx);
+			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+				mutex_exit(&dn->dn_mtx);
+				tx->tx_needassign_txh = txh;
+				DMU_TX_STAT_BUMP(dmu_tx_group);
+				return (SET_ERROR(ERESTART));
+			}
+			if (dn->dn_assigned_txg == 0)
+				dn->dn_assigned_txg = tx->tx_txg;
+			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+			(void) refcount_add(&dn->dn_tx_holds, tx);
+			mutex_exit(&dn->dn_mtx);
+		}
+		towrite += txh->txh_space_towrite;
+		tofree += txh->txh_space_tofree;
+		tooverwrite += txh->txh_space_tooverwrite;
+		tounref += txh->txh_space_tounref;
+		tohold += txh->txh_memory_tohold;
+		fudge += txh->txh_fudge;
+	}
+
+	/*
+	 * If a snapshot has been taken since we made our estimates,
+	 * assume that we won't be able to free or overwrite anything.
+	 */
+	if (tx->tx_objset &&
+	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
+	    tx->tx_lastsnap_txg) {
+		towrite += tooverwrite;
+		tooverwrite = tofree = 0;
+	}
+
+	/* needed allocation: worst-case estimate of write space */
+	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
+	/* freed space estimate: worst-case overwrite + free estimate */
+	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
+	/* convert unrefd space to worst-case estimate */
+	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
+	/* calculate memory footprint estimate */
+	memory = towrite + tooverwrite + tohold;
+
+#ifdef DEBUG_DMU_TX
+	/*
+	 * Add in 'tohold' to account for our dirty holds on this memory
+	 * XXX - the "fudge" factor is to account for skipped blocks that
+	 * we missed because dnode_next_offset() misses in-core-only blocks.
+	 */
+	tx->tx_space_towrite = asize +
+	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
+	tx->tx_space_tofree = tofree;
+	tx->tx_space_tooverwrite = tooverwrite;
+	tx->tx_space_tounref = tounref;
+#endif
+
+	if (tx->tx_dir && asize != 0) {
+		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
+		if (err)
+			return (err);
+	}
+
+	DMU_TX_STAT_BUMP(dmu_tx_assigned);
+
+	return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	if (tx->tx_txg == 0)
+		return;
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	/*
+	 * Walk the transaction's hold list, removing the hold on the
+	 * associated dnode, and notifying waiters if the refcount drops to 0.
+	 */
+	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
+	    txh = list_next(&tx->tx_holds, txh)) {
+		dnode_t *dn = txh->txh_dnode;
+
+		if (dn == NULL)
+			continue;
+		mutex_enter(&dn->dn_mtx);
+		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+			dn->dn_assigned_txg = 0;
+			cv_broadcast(&dn->dn_notxholds);
+		}
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	txg_rele_to_sync(&tx->tx_txgh);
+
+	tx->tx_lasttried_txg = tx->tx_txg;
+	tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group.  txg_how can be one of:
+ *
+ * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
+ *	a new one.  This should be used when you're not holding locks.
+ *	It will only fail if we're truly out of space (or over quota).
+ *
+ * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
+ *	blocking, returns immediately with ERESTART.  This should be used
+ *	whenever you're holding locks.  On an ERESTART error, the caller
+ *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3)	TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+ *	has already been called on behalf of this operation (though
+ *	most likely on a different tx).
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+{
+	int err;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
+	    txg_how == TXG_WAITED);
+	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+	if (txg_how == TXG_WAITED)
+		tx->tx_waited = B_TRUE;
+
+	/* If we might wait, we must not hold the config lock. */
+	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+
+	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+		dmu_tx_unassign(tx);
+
+		if (err != ERESTART || txg_how != TXG_WAIT)
+			return (err);
+
+		dmu_tx_wait(tx);
+	}
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	return (0);
+}
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+	spa_t *spa = tx->tx_pool->dp_spa;
+	dsl_pool_t *dp = tx->tx_pool;
+	hrtime_t before;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(!dsl_pool_config_held(tx->tx_pool));
+
+	before = gethrtime();
+
+	if (tx->tx_wait_dirty) {
+		uint64_t dirty;
+
+		/*
+		 * dmu_tx_try_assign() has determined that we need to wait
+		 * because we've consumed much or all of the dirty buffer
+		 * space.
+		 */
+		mutex_enter(&dp->dp_lock);
+		if (dp->dp_dirty_total >= zfs_dirty_data_max)
+			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
+		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		dirty = dp->dp_dirty_total;
+		mutex_exit(&dp->dp_lock);
+
+		dmu_tx_delay(tx, dirty);
+
+		tx->tx_wait_dirty = B_FALSE;
+
+		/*
+		 * Note: setting tx_waited only has effect if the caller
+		 * used TX_WAIT.  Otherwise they are going to destroy
+		 * this tx and try again.  The common case, zfs_write(),
+		 * uses TX_WAIT.
+		 */
+		tx->tx_waited = B_TRUE;
+	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+		/*
+		 * If the pool is suspended we need to wait until it
+		 * is resumed.  Note that it's possible that the pool
+		 * has become active after this thread has tried to
+		 * obtain a tx.  If that's the case then tx_lasttried_txg
+		 * would not have been set.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+	} else if (tx->tx_needassign_txh) {
+		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+		mutex_enter(&dn->dn_mtx);
+		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+		mutex_exit(&dn->dn_mtx);
+		tx->tx_needassign_txh = NULL;
+	} else {
+		/*
+		 * A dnode is assigned to the quiescing txg.  Wait for its
+		 * transaction to complete.
+		 */
+		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+	}
+
+	spa_tx_assign_add_nsecs(spa, gethrtime() - before);
+}
+
+void
+dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
+{
+#ifdef DEBUG_DMU_TX
+	if (tx->tx_dir == NULL || delta == 0)
+		return;
+
+	if (delta > 0) {
+		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
+		    tx->tx_space_towrite);
+		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
+	} else {
+		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
+	}
+#endif
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg != 0);
+
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+	 */
+	while ((txh = list_head(&tx->tx_holds))) {
+		dnode_t *dn = txh->txh_dnode;
+
+		list_remove(&tx->tx_holds, txh);
+		kmem_free(txh, sizeof (dmu_tx_hold_t));
+		if (dn == NULL)
+			continue;
+		mutex_enter(&dn->dn_mtx);
+		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+			dn->dn_assigned_txg = 0;
+			cv_broadcast(&dn->dn_notxholds);
+		}
+		mutex_exit(&dn->dn_mtx);
+		dnode_rele(dn, tx);
+	}
+
+	if (tx->tx_tempreserve_cookie)
+		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+	if (!list_is_empty(&tx->tx_callbacks))
+		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
+	if (tx->tx_anyobj == FALSE)
+		txg_rele_to_sync(&tx->tx_txgh);
+
+	list_destroy(&tx->tx_callbacks);
+	list_destroy(&tx->tx_holds);
+#ifdef DEBUG_DMU_TX
+	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
+	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
+	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	ASSERT(tx->tx_txg == 0);
+
+	while ((txh = list_head(&tx->tx_holds))) {
+		dnode_t *dn = txh->txh_dnode;
+
+		list_remove(&tx->tx_holds, txh);
+		kmem_free(txh, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+
+	/*
+	 * Call any registered callbacks with an error code.
+	 */
+	if (!list_is_empty(&tx->tx_callbacks))
+		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+	list_destroy(&tx->tx_callbacks);
+	list_destroy(&tx->tx_holds);
+#ifdef DEBUG_DMU_TX
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+	return (tx->tx_txg);
+}
+
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_pool != NULL);
+	return (tx->tx_pool);
+}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+	dmu_tx_callback_t *dcb;
+
+	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+	dcb->dcb_func = func;
+	dcb->dcb_data = data;
+
+	list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+	dmu_tx_callback_t *dcb;
+
+	while ((dcb = list_head(cb_list))) {
+		list_remove(cb_list, dcb);
+		dcb->dcb_func(dcb->dcb_data, error);
+		kmem_free(dcb, sizeof (dmu_tx_callback_t));
+	}
+}
+
+/*
+ * Interface to hold a bunch of attributes.
+ * used for creating new files.
+ * attrsize is the total size of all attributes
+ * to be added during object creation
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * hold necessary attribute name for attribute registration.
+ * should be a very rare case where this is needed.  If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+	int i;
+
+	if (!sa->sa_need_attr_registration)
+		return;
+
+	for (i = 0; i != sa->sa_num_attrs; i++) {
+		if (!sa->sa_attr_table[i].sa_registered) {
+			if (sa->sa_reg_attr_obj)
+				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+			else
+				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+		}
+	}
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+	    THT_SPILL, 0, 0);
+	if (txh == NULL)
+		return;
+
+	dn = txh->txh_dnode;
+
+	if (dn == NULL)
+		return;
+
+	/* If blkptr doesn't exist then add space to towrite */
+	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+		txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
+	} else {
+		blkptr_t *bp;
+
+		bp = &dn->dn_phys->dn_spill;
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    bp, bp->blk_birth))
+			txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
+		else
+			txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
+		if (!BP_IS_HOLE(bp))
+			txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
+	}
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+	else {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+		return;
+
+	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+	    THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attribute
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
+ *
+ * variable_size is the total size of all variable sized attributes
+ * passed to this function.  It is not the total size of all
+ * variable size attributes that *may* exist on this object.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+	uint64_t object;
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	ASSERT(hdl != NULL);
+
+	object = sa_handle_object(hdl);
+
+	dmu_tx_hold_bonus(tx, object);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+		ASSERT(tx->tx_txg == 0);
+		dmu_tx_hold_spill(tx, object);
+	} else {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		if (dn->dn_have_spill) {
+			ASSERT(tx->tx_txg == 0);
+			dmu_tx_hold_spill(tx, object);
+		}
+		DB_DNODE_EXIT(db);
+	}
+}
+
+void
+dmu_tx_init(void)
+{
+	dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (dmu_tx_ksp != NULL) {
+		dmu_tx_ksp->ks_data = &dmu_tx_stats;
+		kstat_install(dmu_tx_ksp);
+	}
+}
+
+void
+dmu_tx_fini(void)
+{
+	if (dmu_tx_ksp != NULL) {
+		kstat_delete(dmu_tx_ksp);
+		dmu_tx_ksp = NULL;
+	}
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dmu_tx_create);
+EXPORT_SYMBOL(dmu_tx_hold_write);
+EXPORT_SYMBOL(dmu_tx_hold_free);
+EXPORT_SYMBOL(dmu_tx_hold_zap);
+EXPORT_SYMBOL(dmu_tx_hold_bonus);
+EXPORT_SYMBOL(dmu_tx_abort);
+EXPORT_SYMBOL(dmu_tx_assign);
+EXPORT_SYMBOL(dmu_tx_wait);
+EXPORT_SYMBOL(dmu_tx_commit);
+EXPORT_SYMBOL(dmu_tx_get_txg);
+EXPORT_SYMBOL(dmu_tx_callback_register);
+EXPORT_SYMBOL(dmu_tx_do_callbacks);
+EXPORT_SYMBOL(dmu_tx_hold_spill);
+EXPORT_SYMBOL(dmu_tx_hold_sa_create);
+EXPORT_SYMBOL(dmu_tx_hold_sa);
+#endif
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index 8ff2f0509787..9d41bdc0f132 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -293,7 +293,8 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
 
 	for (i = 0; i < fetchsz; i++) {
-		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
+		dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
+		    ARC_FLAG_PREFETCH);
 	}
 
 	return (fetchsz);
diff --git a/module/zfs/dmu_zfetch.c.orig b/module/zfs/dmu_zfetch.c.orig
new file mode 100644
index 000000000000..8ff2f0509787
--- /dev/null
+++ b/module/zfs/dmu_zfetch.c.orig
@@ -0,0 +1,748 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/kstat.h>
+
+/*
+ * I'm against tune-ables, but these should probably exist as tweakable globals
+ * until we can get this working the way we want it to.
+ */
+
+int zfs_prefetch_disable = 0;
+
+/* max # of streams per zfetch */
+unsigned int	zfetch_max_streams = 8;
+/* min time before stream reclaim */
+unsigned int	zfetch_min_sec_reap = 2;
+/* max number of blocks to fetch at a time */
+unsigned int	zfetch_block_cap = 256;
+/* number of bytes in a array_read at which we stop prefetching (1Mb) */
+unsigned long	zfetch_array_rd_sz = 1024 * 1024;
+
+/* forward decls for static routines */
+static boolean_t	dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
+static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
+static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
+static boolean_t	dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
+static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
+static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
+static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+
+typedef struct zfetch_stats {
+	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_misses;
+	kstat_named_t zfetchstat_colinear_hits;
+	kstat_named_t zfetchstat_colinear_misses;
+	kstat_named_t zfetchstat_stride_hits;
+	kstat_named_t zfetchstat_stride_misses;
+	kstat_named_t zfetchstat_reclaim_successes;
+	kstat_named_t zfetchstat_reclaim_failures;
+	kstat_named_t zfetchstat_stream_resets;
+	kstat_named_t zfetchstat_stream_noresets;
+	kstat_named_t zfetchstat_bogus_streams;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+	{ "hits",			KSTAT_DATA_UINT64 },
+	{ "misses",			KSTAT_DATA_UINT64 },
+	{ "colinear_hits",		KSTAT_DATA_UINT64 },
+	{ "colinear_misses",		KSTAT_DATA_UINT64 },
+	{ "stride_hits",		KSTAT_DATA_UINT64 },
+	{ "stride_misses",		KSTAT_DATA_UINT64 },
+	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
+	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
+	{ "streams_resets",		KSTAT_DATA_UINT64 },
+	{ "streams_noresets",		KSTAT_DATA_UINT64 },
+	{ "bogus_streams",		KSTAT_DATA_UINT64 },
+};
+
+#define	ZFETCHSTAT_INCR(stat, val) \
+	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
+
+#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);
+
+kstat_t		*zfetch_ksp;
+
+/*
+ * Given a zfetch structure and a zstream structure, determine whether the
+ * blocks to be read are part of a co-linear pair of existing prefetch
+ * streams.  If a set is found, coalesce the streams, removing one, and
+ * configure the prefetch so it looks for a strided access pattern.
+ *
+ * In other words: if we find two sequential access streams that are
+ * the same length and distance N appart, and this read is N from the
+ * last stream, then we are probably in a strided access pattern.  So
+ * combine the two sequential streams into a single strided stream.
+ *
+ * Returns whether co-linear streams were found.
+ */
+static boolean_t
+dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
+{
+	zstream_t	*z_walk;
+	zstream_t	*z_comp;
+
+	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+		return (0);
+
+	if (zh == NULL) {
+		rw_exit(&zf->zf_rwlock);
+		return (0);
+	}
+
+	for (z_walk = list_head(&zf->zf_stream); z_walk;
+	    z_walk = list_next(&zf->zf_stream, z_walk)) {
+		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
+		    z_comp = list_next(&zf->zf_stream, z_comp)) {
+			int64_t		diff;
+
+			if (z_walk->zst_len != z_walk->zst_stride ||
+			    z_comp->zst_len != z_comp->zst_stride) {
+				continue;
+			}
+
+			diff = z_comp->zst_offset - z_walk->zst_offset;
+			if (z_comp->zst_offset + diff == zh->zst_offset) {
+				z_walk->zst_offset = zh->zst_offset;
+				z_walk->zst_direction = diff < 0 ?
+				    ZFETCH_BACKWARD : ZFETCH_FORWARD;
+				z_walk->zst_stride =
+				    diff * z_walk->zst_direction;
+				z_walk->zst_ph_offset =
+				    zh->zst_offset + z_walk->zst_stride;
+				dmu_zfetch_stream_remove(zf, z_comp);
+				mutex_destroy(&z_comp->zst_lock);
+				kmem_free(z_comp, sizeof (zstream_t));
+
+				dmu_zfetch_dofetch(zf, z_walk);
+
+				rw_exit(&zf->zf_rwlock);
+				return (1);
+			}
+
+			diff = z_walk->zst_offset - z_comp->zst_offset;
+			if (z_walk->zst_offset + diff == zh->zst_offset) {
+				z_walk->zst_offset = zh->zst_offset;
+				z_walk->zst_direction = diff < 0 ?
+				    ZFETCH_BACKWARD : ZFETCH_FORWARD;
+				z_walk->zst_stride =
+				    diff * z_walk->zst_direction;
+				z_walk->zst_ph_offset =
+				    zh->zst_offset + z_walk->zst_stride;
+				dmu_zfetch_stream_remove(zf, z_comp);
+				mutex_destroy(&z_comp->zst_lock);
+				kmem_free(z_comp, sizeof (zstream_t));
+
+				dmu_zfetch_dofetch(zf, z_walk);
+
+				rw_exit(&zf->zf_rwlock);
+				return (1);
+			}
+		}
+	}
+
+	rw_exit(&zf->zf_rwlock);
+	return (0);
+}
+
+/*
+ * Given a zstream_t, determine the bounds of the prefetch.  Then call the
+ * routine that actually prefetches the individual blocks.
+ */
+static void
+dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
+{
+	uint64_t	prefetch_tail;
+	uint64_t	prefetch_limit;
+	uint64_t	prefetch_ofst;
+	uint64_t	prefetch_len;
+	uint64_t	blocks_fetched;
+
+	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
+	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
+
+	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
+	    (int64_t)(zs->zst_offset + zs->zst_stride));
+	/*
+	 * XXX: use a faster division method?
+	 */
+	prefetch_limit = zs->zst_offset + zs->zst_len +
+	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
+
+	while (prefetch_tail < prefetch_limit) {
+		prefetch_ofst = zs->zst_offset + zs->zst_direction *
+		    (prefetch_tail - zs->zst_offset);
+
+		prefetch_len = zs->zst_len;
+
+		/*
+		 * Don't prefetch beyond the end of the file, if working
+		 * backwards.
+		 */
+		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
+		    (prefetch_ofst > prefetch_tail)) {
+			prefetch_len += prefetch_ofst;
+			prefetch_ofst = 0;
+		}
+
+		/* don't prefetch more than we're supposed to */
+		if (prefetch_len > zs->zst_len)
+			break;
+
+		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
+		    prefetch_ofst, zs->zst_len);
+
+		prefetch_tail += zs->zst_stride;
+		/* stop if we've run out of stuff to prefetch */
+		if (blocks_fetched < zs->zst_len)
+			break;
+	}
+	zs->zst_ph_offset = prefetch_tail;
+	zs->zst_last = ddi_get_lbolt();
+}
+
+void
+zfetch_init(void)
+{
+
+	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zfetch_ksp != NULL) {
+		zfetch_ksp->ks_data = &zfetch_stats;
+		kstat_install(zfetch_ksp);
+	}
+}
+
+void
+zfetch_fini(void)
+{
+	if (zfetch_ksp != NULL) {
+		kstat_delete(zfetch_ksp);
+		zfetch_ksp = NULL;
+	}
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode.  It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+	if (zf == NULL) {
+		return;
+	}
+
+	zf->zf_dnode = dno;
+	zf->zf_stream_cnt = 0;
+	zf->zf_alloc_fail = 0;
+
+	list_create(&zf->zf_stream, sizeof (zstream_t),
+	    offsetof(zstream_t, zst_node));
+
+	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * This function computes the actual size, in blocks, that can be prefetched,
+ * and fetches it.
+ */
+static uint64_t
+dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t	fetchsz;
+	uint64_t	i;
+
+	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
+
+	for (i = 0; i < fetchsz; i++) {
+		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
+	}
+
+	return (fetchsz);
+}
+
+/*
+ * this function returns the number of blocks that would be prefetched, based
+ * upon the supplied dnode, blockid, and nblks.  This is used so that we can
+ * update streams in place, and then prefetch with their old value after the
+ * fact.  This way, we can delay the prefetch, but subsequent accesses to the
+ * stream won't result in the same data being prefetched multiple times.
+ */
+static uint64_t
+dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+{
+	uint64_t	fetchsz;
+
+	if (blkid > dn->dn_maxblkid) {
+		return (0);
+	}
+
+	/* compute fetch size */
+	if (blkid + nblks + 1 > dn->dn_maxblkid) {
+		fetchsz = (dn->dn_maxblkid - blkid) + 1;
+		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
+	} else {
+		fetchsz = nblks;
+	}
+
+
+	return (fetchsz);
+}
+
+/*
+ * given a zfetch and a zstream structure, see if there is an associated zstream
+ * for this block read.  If so, it starts a prefetch for the stream it
+ * located and returns true, otherwise it returns false
+ */
+static boolean_t
+dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
+{
+	zstream_t	*zs;
+	int64_t		diff;
+	int		reset = !prefetched;
+	int		rc = 0;
+
+	if (zh == NULL)
+		return (0);
+
+	/*
+	 * XXX: This locking strategy is a bit coarse; however, it's impact has
+	 * yet to be tested.  If this turns out to be an issue, it can be
+	 * modified in a number of different ways.
+	 */
+
+	rw_enter(&zf->zf_rwlock, RW_READER);
+top:
+
+	for (zs = list_head(&zf->zf_stream); zs;
+	    zs = list_next(&zf->zf_stream, zs)) {
+
+		/*
+		 * XXX - should this be an assert?
+		 */
+		if (zs->zst_len == 0) {
+			/* bogus stream */
+			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
+			continue;
+		}
+
+		/*
+		 * We hit this case when we are in a strided prefetch stream:
+		 * we will read "len" blocks before "striding".
+		 */
+		if (zh->zst_offset >= zs->zst_offset &&
+		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
+			if (prefetched) {
+				/* already fetched */
+				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
+				rc = 1;
+				goto out;
+			} else {
+				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
+			}
+		}
+
+		/*
+		 * This is the forward sequential read case: we increment
+		 * len by one each time we hit here, so we will enter this
+		 * case on every read.
+		 */
+		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
+
+			reset = !prefetched && zs->zst_len > 1;
+
+			mutex_enter(&zs->zst_lock);
+
+			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+			zs->zst_len += zh->zst_len;
+			diff = zs->zst_len - zfetch_block_cap;
+			if (diff > 0) {
+				zs->zst_offset += diff;
+				zs->zst_len = zs->zst_len > diff ?
+				    zs->zst_len - diff : 0;
+			}
+			zs->zst_direction = ZFETCH_FORWARD;
+
+			break;
+
+		/*
+		 * Same as above, but reading backwards through the file.
+		 */
+		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
+			/* backwards sequential access */
+
+			reset = !prefetched && zs->zst_len > 1;
+
+			mutex_enter(&zs->zst_lock);
+
+			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset = zs->zst_offset > zh->zst_len ?
+			    zs->zst_offset - zh->zst_len : 0;
+			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
+			    zs->zst_ph_offset - zh->zst_len : 0;
+			zs->zst_len += zh->zst_len;
+
+			diff = zs->zst_len - zfetch_block_cap;
+			if (diff > 0) {
+				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
+				    zs->zst_ph_offset - diff : 0;
+				zs->zst_len = zs->zst_len > diff ?
+				    zs->zst_len - diff : zs->zst_len;
+			}
+			zs->zst_direction = ZFETCH_BACKWARD;
+
+			break;
+
+		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
+		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+			/* strided forward access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
+			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset += zs->zst_stride;
+			zs->zst_direction = ZFETCH_FORWARD;
+
+			break;
+
+		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
+		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
+			/* strided reverse access */
+
+			mutex_enter(&zs->zst_lock);
+
+			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
+			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
+				mutex_exit(&zs->zst_lock);
+				goto top;
+			}
+
+			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
+			    zs->zst_offset - zs->zst_stride : 0;
+			zs->zst_ph_offset = (zs->zst_ph_offset >
+			    (2 * zs->zst_stride)) ?
+			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
+			zs->zst_direction = ZFETCH_BACKWARD;
+
+			break;
+		}
+	}
+
+	if (zs) {
+		if (reset) {
+			zstream_t *remove = zs;
+
+			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
+			rc = 0;
+			mutex_exit(&zs->zst_lock);
+			rw_exit(&zf->zf_rwlock);
+			rw_enter(&zf->zf_rwlock, RW_WRITER);
+			/*
+			 * Relocate the stream, in case someone removes
+			 * it while we were acquiring the WRITER lock.
+			 */
+			for (zs = list_head(&zf->zf_stream); zs;
+			    zs = list_next(&zf->zf_stream, zs)) {
+				if (zs == remove) {
+					dmu_zfetch_stream_remove(zf, zs);
+					mutex_destroy(&zs->zst_lock);
+					kmem_free(zs, sizeof (zstream_t));
+					break;
+				}
+			}
+		} else {
+			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
+			rc = 1;
+			dmu_zfetch_dofetch(zf, zs);
+			mutex_exit(&zs->zst_lock);
+		}
+	}
+out:
+	rw_exit(&zf->zf_rwlock);
+	return (rc);
+}
+
+/*
+ * Clean-up state associated with a zfetch structure.  This frees allocated
+ * structure members, empties the zf_stream tree, and generally makes things
+ * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
+ */
+void
+dmu_zfetch_rele(zfetch_t *zf)
+{
+	zstream_t	*zs;
+	zstream_t	*zs_next;
+
+	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
+
+	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
+		zs_next = list_next(&zf->zf_stream, zs);
+
+		list_remove(&zf->zf_stream, zs);
+		mutex_destroy(&zs->zst_lock);
+		kmem_free(zs, sizeof (zstream_t));
+	}
+	list_destroy(&zf->zf_stream);
+	rw_destroy(&zf->zf_rwlock);
+
+	zf->zf_dnode = NULL;
+}
+
+/*
+ * Given a zfetch and zstream structure, insert the zstream structure into the
+ * AVL tree contained within the zfetch structure.  Peform the appropriate
+ * book-keeping.  It is possible that another thread has inserted a stream which
+ * matches one that we are about to insert, so we must be sure to check for this
+ * case.  If one is found, return failure, and let the caller cleanup the
+ * duplicates.
+ */
+static int
+dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+{
+	zstream_t	*zs_walk;
+	zstream_t	*zs_next;
+
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
+		zs_next = list_next(&zf->zf_stream, zs_walk);
+
+		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
+			return (0);
+		}
+	}
+
+	list_insert_head(&zf->zf_stream, zs);
+	zf->zf_stream_cnt++;
+	return (1);
+}
+
+
+/*
+ * Walk the list of zstreams in the given zfetch, find an old one (by time), and
+ * reclaim it for use by the caller.
+ */
+static zstream_t *
+dmu_zfetch_stream_reclaim(zfetch_t *zf)
+{
+	zstream_t	*zs;
+
+	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
+		return (0);
+
+	for (zs = list_head(&zf->zf_stream); zs;
+	    zs = list_next(&zf->zf_stream, zs)) {
+
+		if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
+			break;
+	}
+
+	if (zs) {
+		dmu_zfetch_stream_remove(zf, zs);
+		mutex_destroy(&zs->zst_lock);
+		bzero(zs, sizeof (zstream_t));
+	} else {
+		zf->zf_alloc_fail++;
+	}
+	rw_exit(&zf->zf_rwlock);
+
+	return (zs);
+}
+
+/*
+ * Given a zfetch and zstream structure, remove the zstream structure from its
+ * container in the zfetch structure.  Perform the appropriate book-keeping.
+ */
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+
+	list_remove(&zf->zf_stream, zs);
+	zf->zf_stream_cnt--;
+}
+
+static int
+dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
+{
+	if (zs1->zst_offset != zs2->zst_offset)
+		return (0);
+
+	if (zs1->zst_len != zs2->zst_len)
+		return (0);
+
+	if (zs1->zst_stride != zs2->zst_stride)
+		return (0);
+
+	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
+		return (0);
+
+	if (zs1->zst_cap != zs2->zst_cap)
+		return (0);
+
+	if (zs1->zst_direction != zs2->zst_direction)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * This is the prefetch entry point.  It calls all of the other dmu_zfetch
+ * routines to create, delete, find, or operate upon prefetch streams.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+{
+	zstream_t	zst;
+	zstream_t	*newstream;
+	boolean_t	fetched;
+	int		inserted;
+	unsigned int	blkshft;
+	uint64_t	blksz;
+
+	if (zfs_prefetch_disable)
+		return;
+
+	/* files that aren't ln2 blocksz are only one block -- nothing to do */
+	if (!zf->zf_dnode->dn_datablkshift)
+		return;
+
+	/* convert offset and size, into blockid and nblocks */
+	blkshft = zf->zf_dnode->dn_datablkshift;
+	blksz = (1 << blkshft);
+
+	bzero(&zst, sizeof (zstream_t));
+	zst.zst_offset = offset >> blkshft;
+	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
+	    P2ALIGN(offset, blksz)) >> blkshft;
+
+	fetched = dmu_zfetch_find(zf, &zst, prefetched);
+	if (fetched) {
+		ZFETCHSTAT_BUMP(zfetchstat_hits);
+	} else {
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+		if ((fetched = dmu_zfetch_colinear(zf, &zst))) {
+			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
+		} else {
+			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
+		}
+	}
+
+	if (!fetched) {
+		newstream = dmu_zfetch_stream_reclaim(zf);
+
+		/*
+		 * we still couldn't find a stream, drop the lock, and allocate
+		 * one if possible.  Otherwise, give up and go home.
+		 */
+		if (newstream) {
+			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
+		} else {
+			uint64_t	maxblocks;
+			uint32_t	max_streams;
+			uint32_t	cur_streams;
+
+			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
+			cur_streams = zf->zf_stream_cnt;
+			maxblocks = zf->zf_dnode->dn_maxblkid;
+
+			max_streams = MIN(zfetch_max_streams,
+			    (maxblocks / zfetch_block_cap));
+			if (max_streams == 0) {
+				max_streams++;
+			}
+
+			if (cur_streams >= max_streams) {
+				return;
+			}
+			newstream =
+			    kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
+		}
+
+		newstream->zst_offset = zst.zst_offset;
+		newstream->zst_len = zst.zst_len;
+		newstream->zst_stride = zst.zst_len;
+		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
+		newstream->zst_cap = zst.zst_len;
+		newstream->zst_direction = ZFETCH_FORWARD;
+		newstream->zst_last = ddi_get_lbolt();
+
+		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+
+		rw_enter(&zf->zf_rwlock, RW_WRITER);
+		inserted = dmu_zfetch_stream_insert(zf, newstream);
+		rw_exit(&zf->zf_rwlock);
+
+		if (!inserted) {
+			mutex_destroy(&newstream->zst_lock);
+			kmem_free(newstream, sizeof (zstream_t));
+		}
+	}
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_prefetch_disable, int, 0644);
+MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching");
+
+module_param(zfetch_max_streams, uint, 0644);
+MODULE_PARM_DESC(zfetch_max_streams, "Max number of streams per zfetch");
+
+module_param(zfetch_min_sec_reap, uint, 0644);
+MODULE_PARM_DESC(zfetch_min_sec_reap, "Min time before stream reclaim");
+
+module_param(zfetch_block_cap, uint, 0644);
+MODULE_PARM_DESC(zfetch_block_cap, "Max number of blocks to fetch at a time");
+
+module_param(zfetch_array_rd_sz, ulong, 0644);
+MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read");
+#endif
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 2858bbfb492e..17ce8c36993e 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1112,7 +1112,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 		drop_struct_lock = TRUE;
 	}
 
-	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
@@ -1409,7 +1409,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 		goto fail;
 
 	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 	else if (err != ENOENT)
@@ -1582,8 +1582,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-		    FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
@@ -1620,8 +1620,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 	if (tail) {
 		if (len < tail)
 			tail = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-		    TRUE, FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1853,7 +1853,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-	int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
@@ -1875,8 +1875,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
-		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
diff --git a/module/zfs/dnode.c.orig b/module/zfs/dnode.c.orig
new file mode 100644
index 000000000000..2858bbfb492e
--- /dev/null
+++ b/module/zfs/dnode.c.orig
@@ -0,0 +1,2044 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
+#include <sys/trace_dnode.h>
+
+static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef	DEBUG
+#define	DNODE_STATS
+#endif	/* DEBUG */
+
+#ifdef	DNODE_STATS
+#define	DNODE_STAT_ADD(stat)			((stat)++)
+#else
+#define	DNODE_STAT_ADD(stat)			/* nothing */
+#endif	/* DNODE_STATS */
+
+ASSERTV(static dnode_phys_t dnode_phys_zero);
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+#ifdef	_KERNEL
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif /* _KERNEL */
+
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+	const dmu_buf_impl_t *d1 = x1;
+	const dmu_buf_impl_t *d2 = x2;
+
+	if (d1->db_level < d2->db_level) {
+		return (-1);
+	}
+	if (d1->db_level > d2->db_level) {
+		return (1);
+	}
+
+	if (d1->db_blkid < d2->db_blkid) {
+		return (-1);
+	}
+	if (d1->db_blkid > d2->db_blkid) {
+		return (1);
+	}
+
+	if (d1->db_state == DB_SEARCH) {
+		ASSERT3S(d2->db_state, !=, DB_SEARCH);
+		return (-1);
+	} else if (d2->db_state == DB_SEARCH) {
+		ASSERT3S(d1->db_state, !=, DB_SEARCH);
+		return (1);
+	}
+
+	if ((uintptr_t)d1 < (uintptr_t)d2) {
+		return (-1);
+	}
+	if ((uintptr_t)d1 > (uintptr_t)d2) {
+		return (1);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+	dnode_t *dn = arg;
+	int i;
+
+	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+
+	/*
+	 * Every dbuf has a reference, and dropping a tracked reference is
+	 * O(number of references), so don't track dn_holds.
+	 */
+	refcount_create_untracked(&dn->dn_holds);
+	refcount_create(&dn->dn_tx_holds);
+	list_link_init(&dn->dn_link);
+
+	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_link_init(&dn->dn_dirty_link[i]);
+		dn->dn_free_ranges[i] = NULL;
+		list_create(&dn->dn_dirty_records[i],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+	}
+
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+	dn->dn_dirtyctx = 0;
+	dn->dn_dirtyctx_firstset = NULL;
+	dn->dn_bonus = NULL;
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_zio = NULL;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+
+	dn->dn_dbufs_count = 0;
+	dn->dn_unlisted_l0_blkid = 0;
+	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+
+	dn->dn_moved = 0;
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+	int i;
+	dnode_t *dn = arg;
+
+	rw_destroy(&dn->dn_struct_rwlock);
+	mutex_destroy(&dn->dn_mtx);
+	mutex_destroy(&dn->dn_dbufs_mtx);
+	cv_destroy(&dn->dn_notxholds);
+	refcount_destroy(&dn->dn_holds);
+	refcount_destroy(&dn->dn_tx_holds);
+	ASSERT(!list_link_active(&dn->dn_link));
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+		list_destroy(&dn->dn_dirty_records[i]);
+		ASSERT0(dn->dn_next_nblkptr[i]);
+		ASSERT0(dn->dn_next_nlevels[i]);
+		ASSERT0(dn->dn_next_indblkshift[i]);
+		ASSERT0(dn->dn_next_bonustype[i]);
+		ASSERT0(dn->dn_rm_spillblk[i]);
+		ASSERT0(dn->dn_next_bonuslen[i]);
+		ASSERT0(dn->dn_next_blksz[i]);
+	}
+
+	ASSERT0(dn->dn_allocated_txg);
+	ASSERT0(dn->dn_free_txg);
+	ASSERT0(dn->dn_assigned_txg);
+	ASSERT0(dn->dn_dirtyctx);
+	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+	ASSERT3P(dn->dn_bonus, ==, NULL);
+	ASSERT(!dn->dn_have_spill);
+	ASSERT3P(dn->dn_zio, ==, NULL);
+	ASSERT0(dn->dn_oldused);
+	ASSERT0(dn->dn_oldflags);
+	ASSERT0(dn->dn_olduid);
+	ASSERT0(dn->dn_oldgid);
+	ASSERT0(dn->dn_newuid);
+	ASSERT0(dn->dn_newgid);
+	ASSERT0(dn->dn_id_flags);
+
+	ASSERT0(dn->dn_dbufs_count);
+	ASSERT0(dn->dn_unlisted_l0_blkid);
+	avl_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+	ASSERT(dnode_cache == NULL);
+	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
+	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+	kmem_cache_set_move(dnode_cache, dnode_move);
+}
+
+void
+dnode_fini(void)
+{
+	kmem_cache_destroy(dnode_cache);
+	dnode_cache = NULL;
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+	int drop_struct_lock = FALSE;
+
+	ASSERT(dn->dn_phys);
+	ASSERT(dn->dn_objset);
+	ASSERT(dn->dn_handle->dnh_dnode == dn);
+
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+		int i;
+		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+		if (dn->dn_datablkshift) {
+			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+		}
+		ASSERT3U(dn->dn_nlevels, <=, 30);
+		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
+		ASSERT3U(dn->dn_nblkptr, >=, 1);
+		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		ASSERT3U(dn->dn_datablksz, ==,
+		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+		    dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		for (i = 0; i < TXG_SIZE; i++) {
+			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+		}
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE)
+		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+	if (dn->dn_dbuf != NULL) {
+		ASSERT3P(dn->dn_phys, ==,
+		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+	}
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+	int i;
+
+	if (dnp->dn_type == DMU_OT_NONE) {
+		bzero(dnp, sizeof (dnode_phys_t));
+		return;
+	}
+
+	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+	dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+	/*
+	 * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order.  We can't read dn_bouslen.
+	 */
+	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+		buf64[i] = BSWAP_64(buf64[i]);
+
+	/*
+	 * OK to check dn_bonuslen for zero, because it won't matter if
+	 * we have the wrong byte order.  This is necessary because the
+	 * dnode dnode is smaller than a regular dnode.
+	 */
+	if (dnp->dn_bonuslen != 0) {
+		/*
+		 * Note that the bonus length calculated here may be
+		 * longer than the actual bonus buffer.  This is because
+		 * we always put the bonus buffer after the last block
+		 * pointer (instead of packing it against the end of the
+		 * dnode buffer).
+		 */
+		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+		size_t len = DN_MAX_BONUSLEN - off;
+		dmu_object_byteswap_t byteswap;
+		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+	}
+
+	/* Swap SPILL block if we have one */
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+	dnode_phys_t *buf = vbuf;
+	int i;
+
+	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+	size >>= DNODE_SHIFT;
+	for (i = 0; i < size; i++) {
+		dnode_byteswap(buf);
+		buf++;
+	}
+}
+
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+	dn->dn_bonuslen = newsize;
+	if (newsize == 0)
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+	else
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dn->dn_bonustype = newtype;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	dnode_setdirty(dn, tx);
+	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+	dn->dn_have_spill = B_FALSE;
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+	dn->dn_datablksz = size;
+	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object, dnode_handle_t *dnh)
+{
+	dnode_t *dn;
+
+	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+	dn->dn_moved = 0;
+
+	/*
+	 * Defer setting dn_objset until the dnode is ready to be a candidate
+	 * for the dnode_move() callback.
+	 */
+	dn->dn_object = object;
+	dn->dn_dbuf = db;
+	dn->dn_handle = dnh;
+	dn->dn_phys = dnp;
+
+	if (dnp->dn_datablkszsec) {
+		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	} else {
+		dn->dn_datablksz = 0;
+		dn->dn_datablkszsec = 0;
+		dn->dn_datablkshift = 0;
+	}
+	dn->dn_indblkshift = dnp->dn_indblkshift;
+	dn->dn_nlevels = dnp->dn_nlevels;
+	dn->dn_type = dnp->dn_type;
+	dn->dn_nblkptr = dnp->dn_nblkptr;
+	dn->dn_checksum = dnp->dn_checksum;
+	dn->dn_compress = dnp->dn_compress;
+	dn->dn_bonustype = dnp->dn_bonustype;
+	dn->dn_bonuslen = dnp->dn_bonuslen;
+	dn->dn_maxblkid = dnp->dn_maxblkid;
+	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+	dn->dn_id_flags = 0;
+
+	dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+	mutex_enter(&os->os_lock);
+	if (dnh->dnh_dnode != NULL) {
+		/* Lost the allocation race. */
+		mutex_exit(&os->os_lock);
+		kmem_cache_free(dnode_cache, dn);
+		return (dnh->dnh_dnode);
+	}
+
+	/*
+	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
+	 * signifies that the special dnodes have no references from
+	 * their children (the entries in os_dnodes).  This allows
+	 * dnode_destroy() to easily determine if the last child has
+	 * been removed and then complete eviction of the objset.
+	 */
+	if (!DMU_OBJECT_IS_SPECIAL(object))
+		list_insert_head(&os->os_dnodes, dn);
+	membar_producer();
+
+	/*
+	 * Everything else must be valid before assigning dn_objset
+	 * makes the dnode eligible for dnode_move().
+	 */
+	dn->dn_objset = os;
+
+	dnh->dnh_dnode = dn;
+	mutex_exit(&os->os_lock);
+
+	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+	return (dn);
+}
+
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+	objset_t *os = dn->dn_objset;
+	boolean_t complete_os_eviction = B_FALSE;
+
+	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
+
+	mutex_enter(&os->os_lock);
+	POINTER_INVALIDATE(&dn->dn_objset);
+	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		list_remove(&os->os_dnodes, dn);
+		complete_os_eviction =
+		    list_is_empty(&os->os_dnodes) &&
+		    list_link_active(&os->os_evicting_node);
+	}
+	mutex_exit(&os->os_lock);
+
+	/* the dnode can no longer move, so we can release the handle */
+	zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+
+	dn->dn_dirtyctx = 0;
+	if (dn->dn_dirtyctx_firstset != NULL) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+	if (dn->dn_bonus != NULL) {
+		mutex_enter(&dn->dn_bonus->db_mtx);
+		dbuf_evict(dn->dn_bonus);
+		dn->dn_bonus = NULL;
+	}
+	dn->dn_zio = NULL;
+
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+	dn->dn_unlisted_l0_blkid = 0;
+
+	dmu_zfetch_rele(&dn->dn_zfetch);
+	kmem_cache_free(dnode_cache, dn);
+	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+	if (complete_os_eviction)
+		dmu_objset_evict_done(os);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int i;
+
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+	if (blocksize == 0)
+		blocksize = 1 << zfs_default_bs;
+	else
+		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+	if (ibs == 0)
+		ibs = zfs_default_ibs;
+
+	ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+	dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
+	    dn->dn_object, tx->tx_txg, blocksize, ibs);
+
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+	ASSERT(ot != DMU_OT_NONE);
+	ASSERT(DMU_OT_IS_VALID(ot));
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	ASSERT(DMU_OT_IS_VALID(bonustype));
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+	ASSERT(dn->dn_type == DMU_OT_NONE);
+	ASSERT0(dn->dn_maxblkid);
+	ASSERT0(dn->dn_allocated_txg);
+	ASSERT0(dn->dn_assigned_txg);
+	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+	ASSERT(avl_is_empty(&dn->dn_dbufs));
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT0(dn->dn_next_nblkptr[i]);
+		ASSERT0(dn->dn_next_nlevels[i]);
+		ASSERT0(dn->dn_next_indblkshift[i]);
+		ASSERT0(dn->dn_next_bonuslen[i]);
+		ASSERT0(dn->dn_next_bonustype[i]);
+		ASSERT0(dn->dn_rm_spillblk[i]);
+		ASSERT0(dn->dn_next_blksz[i]);
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+	}
+
+	dn->dn_type = ot;
+	dnode_setdblksz(dn, blocksize);
+	dn->dn_indblkshift = ibs;
+	dn->dn_nlevels = 1;
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		dn->dn_nblkptr = 1;
+	else
+		dn->dn_nblkptr = 1 +
+		    ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	dn->dn_dirtyctx = 0;
+
+	dn->dn_free_txg = 0;
+	if (dn->dn_dirtyctx_firstset) {
+		kmem_free(dn->dn_dirtyctx_firstset, 1);
+		dn->dn_dirtyctx_firstset = NULL;
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	dn->dn_id_flags = 0;
+
+	dnode_setdirty(dn, tx);
+	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+	int nblkptr;
+
+	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
+	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+	ASSERT(tx->tx_txg != 0);
+	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0));
+	ASSERT(DMU_OT_IS_VALID(bonustype));
+	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+
+	/* clean up any unreferenced dbufs */
+	dnode_evict_dbufs(dn);
+
+	dn->dn_id_flags = 0;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_setdirty(dn, tx);
+	if (dn->dn_datablksz != blocksize) {
+		/* change blocksize */
+		ASSERT(dn->dn_maxblkid == 0 &&
+		    (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+		    dnode_block_freed(dn, 0)));
+		dnode_setdblksz(dn, blocksize);
+		dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+	}
+	if (dn->dn_bonuslen != bonuslen)
+		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		nblkptr = 1;
+	else
+		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	if (dn->dn_bonustype != bonustype)
+		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
+	if (dn->dn_nblkptr != nblkptr)
+		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		dbuf_rm_spill(dn, tx);
+		dnode_rm_spill(dn, tx);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	/* change type */
+	dn->dn_type = ot;
+
+	/* change bonus size and type */
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_bonustype = bonustype;
+	dn->dn_bonuslen = bonuslen;
+	dn->dn_nblkptr = nblkptr;
+	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+	dn->dn_compress = ZIO_COMPRESS_INHERIT;
+	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+	/* fix up the bonus db_size */
+	if (dn->dn_bonus) {
+		dn->dn_bonus->db.db_size =
+		    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+	}
+
+	dn->dn_allocated_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+}
+
+#ifdef	_KERNEL
+#ifdef	DNODE_STATS
+static struct {
+	uint64_t dms_dnode_invalid;
+	uint64_t dms_dnode_recheck1;
+	uint64_t dms_dnode_recheck2;
+	uint64_t dms_dnode_special;
+	uint64_t dms_dnode_handle;
+	uint64_t dms_dnode_rwlock;
+	uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif	/* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+	int i;
+
+	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+	/* Copy fields. */
+	ndn->dn_objset = odn->dn_objset;
+	ndn->dn_object = odn->dn_object;
+	ndn->dn_dbuf = odn->dn_dbuf;
+	ndn->dn_handle = odn->dn_handle;
+	ndn->dn_phys = odn->dn_phys;
+	ndn->dn_type = odn->dn_type;
+	ndn->dn_bonuslen = odn->dn_bonuslen;
+	ndn->dn_bonustype = odn->dn_bonustype;
+	ndn->dn_nblkptr = odn->dn_nblkptr;
+	ndn->dn_checksum = odn->dn_checksum;
+	ndn->dn_compress = odn->dn_compress;
+	ndn->dn_nlevels = odn->dn_nlevels;
+	ndn->dn_indblkshift = odn->dn_indblkshift;
+	ndn->dn_datablkshift = odn->dn_datablkshift;
+	ndn->dn_datablkszsec = odn->dn_datablkszsec;
+	ndn->dn_datablksz = odn->dn_datablksz;
+	ndn->dn_maxblkid = odn->dn_maxblkid;
+	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+	    sizeof (odn->dn_next_nblkptr));
+	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+	    sizeof (odn->dn_next_nlevels));
+	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+	    sizeof (odn->dn_next_indblkshift));
+	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+	    sizeof (odn->dn_next_bonustype));
+	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+	    sizeof (odn->dn_rm_spillblk));
+	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+	    sizeof (odn->dn_next_bonuslen));
+	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+	    sizeof (odn->dn_next_blksz));
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_move_tail(&ndn->dn_dirty_records[i],
+		    &odn->dn_dirty_records[i]);
+	}
+	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+	    sizeof (odn->dn_free_ranges));
+	ndn->dn_allocated_txg = odn->dn_allocated_txg;
+	ndn->dn_free_txg = odn->dn_free_txg;
+	ndn->dn_assigned_txg = odn->dn_assigned_txg;
+	ndn->dn_dirtyctx = odn->dn_dirtyctx;
+	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+	ASSERT(avl_is_empty(&ndn->dn_dbufs));
+	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
+	ndn->dn_dbufs_count = odn->dn_dbufs_count;
+	ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
+	ndn->dn_bonus = odn->dn_bonus;
+	ndn->dn_have_spill = odn->dn_have_spill;
+	ndn->dn_zio = odn->dn_zio;
+	ndn->dn_oldused = odn->dn_oldused;
+	ndn->dn_oldflags = odn->dn_oldflags;
+	ndn->dn_olduid = odn->dn_olduid;
+	ndn->dn_oldgid = odn->dn_oldgid;
+	ndn->dn_newuid = odn->dn_newuid;
+	ndn->dn_newgid = odn->dn_newgid;
+	ndn->dn_id_flags = odn->dn_id_flags;
+	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+	ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
+	ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
+
+	/*
+	 * Update back pointers. Updating the handle fixes the back pointer of
+	 * every descendant dbuf as well as the bonus dbuf.
+	 */
+	ASSERT(ndn->dn_handle->dnh_dnode == odn);
+	ndn->dn_handle->dnh_dnode = ndn;
+	if (ndn->dn_zfetch.zf_dnode == odn) {
+		ndn->dn_zfetch.zf_dnode = ndn;
+	}
+
+	/*
+	 * Invalidate the original dnode by clearing all of its back pointers.
+	 */
+	odn->dn_dbuf = NULL;
+	odn->dn_handle = NULL;
+	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+	odn->dn_dbufs_count = 0;
+	odn->dn_unlisted_l0_blkid = 0;
+	odn->dn_bonus = NULL;
+	odn->dn_zfetch.zf_dnode = NULL;
+
+	/*
+	 * Set the low bit of the objset pointer to ensure that dnode_move()
+	 * recognizes the dnode as invalid in any subsequent callback.
+	 */
+	POINTER_INVALIDATE(&odn->dn_objset);
+
+	/*
+	 * Satisfy the destructor.
+	 */
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_create(&odn->dn_dirty_records[i],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+		odn->dn_free_ranges[i] = NULL;
+		odn->dn_next_nlevels[i] = 0;
+		odn->dn_next_indblkshift[i] = 0;
+		odn->dn_next_bonustype[i] = 0;
+		odn->dn_rm_spillblk[i] = 0;
+		odn->dn_next_bonuslen[i] = 0;
+		odn->dn_next_blksz[i] = 0;
+	}
+	odn->dn_allocated_txg = 0;
+	odn->dn_free_txg = 0;
+	odn->dn_assigned_txg = 0;
+	odn->dn_dirtyctx = 0;
+	odn->dn_dirtyctx_firstset = NULL;
+	odn->dn_have_spill = B_FALSE;
+	odn->dn_zio = NULL;
+	odn->dn_oldused = 0;
+	odn->dn_oldflags = 0;
+	odn->dn_olduid = 0;
+	odn->dn_oldgid = 0;
+	odn->dn_newuid = 0;
+	odn->dn_newgid = 0;
+	odn->dn_id_flags = 0;
+
+	/*
+	 * Mark the dnode.
+	 */
+	ndn->dn_moved = 1;
+	odn->dn_moved = (uint8_t)-1;
+}
+
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+	dnode_t *odn = buf, *ndn = newbuf;
+	objset_t *os;
+	int64_t refcount;
+	uint32_t dbufs;
+
+	/*
+	 * The dnode is on the objset's list of known dnodes if the objset
+	 * pointer is valid. We set the low bit of the objset pointer when
+	 * freeing the dnode to invalidate it, and the memory patterns written
+	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+	 * A newly created dnode sets the objset pointer last of all to indicate
+	 * that the dnode is known and in a valid state to be moved by this
+	 * function.
+	 */
+	os = odn->dn_objset;
+	if (!POINTER_IS_VALID(os)) {
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * Ensure that the objset does not go away during the move.
+	 */
+	rw_enter(&os_lock, RW_WRITER);
+	if (os != odn->dn_objset) {
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * If the dnode is still valid, then so is the objset. We know that no
+	 * valid objset can be freed while we hold os_lock, so we can safely
+	 * ensure that the objset remains in use.
+	 */
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * Recheck the objset pointer in case the dnode was removed just before
+	 * acquiring the lock.
+	 */
+	if (os != odn->dn_objset) {
+		mutex_exit(&os->os_lock);
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * At this point we know that as long as we hold os->os_lock, the dnode
+	 * cannot be freed and fields within the dnode can be safely accessed.
+	 * The objset listing this dnode cannot go away as long as this dnode is
+	 * on its list.
+	 */
+	rw_exit(&os_lock);
+	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+		return (KMEM_CBRC_NO);
+	}
+	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+	/*
+	 * Lock the dnode handle to prevent the dnode from obtaining any new
+	 * holds. This also prevents the descendant dbufs and the bonus dbuf
+	 * from accessing the dnode, so that we can discount their holds. The
+	 * handle is safe to access because we know that while the dnode cannot
+	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
+	 * safely move any dnode referenced only by dbufs.
+	 */
+	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+	 * We need to guarantee that there is a hold for every dbuf in order to
+	 * determine whether the dnode is actively referenced. Falsely matching
+	 * a dbuf to an active hold would lead to an unsafe move. It's possible
+	 * that a thread already having an active dnode hold is about to add a
+	 * dbuf, and we can't compare hold and dbuf counts while the add is in
+	 * progress.
+	 */
+	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * A dbuf may be removed (evicted) without an active dnode hold. In that
+	 * case, the dbuf count is decremented under the handle lock before the
+	 * dbuf's hold is released. This order ensures that if we count the hold
+	 * after the dbuf is removed but before its hold is released, we will
+	 * treat the unmatched hold as active and exit safely. If we count the
+	 * hold before the dbuf is removed, the hold is discounted, and the
+	 * removal is blocked until the move completes.
+	 */
+	refcount = refcount_count(&odn->dn_holds);
+	ASSERT(refcount >= 0);
+	dbufs = odn->dn_dbufs_count;
+
+	/* We can't have more dbufs than dnode holds. */
+	ASSERT3U(dbufs, <=, refcount);
+	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+	    uint32_t, dbufs);
+
+	if (refcount > dbufs) {
+		rw_exit(&odn->dn_struct_rwlock);
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+		return (KMEM_CBRC_LATER);
+	}
+
+	rw_exit(&odn->dn_struct_rwlock);
+
+	/*
+	 * At this point we know that anyone with a hold on the dnode is not
+	 * actively referencing it. The dnode is known and in a valid state to
+	 * move. We're holding the locks needed to execute the critical section.
+	 */
+	dnode_move_impl(odn, ndn);
+
+	list_link_replace(&odn->dn_link, &ndn->dn_link);
+	/* If the dnode was safe to move, the refcount cannot have changed. */
+	ASSERT(refcount == refcount_count(&ndn->dn_holds));
+	ASSERT(dbufs == ndn->dn_dbufs_count);
+	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+	mutex_exit(&os->os_lock);
+
+	return (KMEM_CBRC_YES);
+}
+#endif	/* _KERNEL */
+
+void
+dnode_special_close(dnode_handle_t *dnh)
+{
+	dnode_t *dn = dnh->dnh_dnode;
+
+	/*
+	 * Wait for final references to the dnode to clear.  This can
+	 * only happen if the arc is asyncronously evicting state that
+	 * has a hold on this dnode while we are trying to evict this
+	 * dnode.
+	 */
+	while (refcount_count(&dn->dn_holds) > 0)
+		delay(1);
+	ASSERT(dn->dn_dbuf == NULL ||
+	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+	zrl_add(&dnh->dnh_zrlock);
+	dnode_destroy(dn); /* implicit zrl_remove() */
+	zrl_destroy(&dnh->dnh_zrlock);
+	dnh->dnh_dnode = NULL;
+}
+
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+    dnode_handle_t *dnh)
+{
+	dnode_t *dn;
+
+	dn = dnode_create(os, dnp, NULL, object, dnh);
+	zrl_init(&dnh->dnh_zrlock);
+	DNODE_VERIFY(dn);
+}
+
+static void
+dnode_buf_pageout(void *dbu)
+{
+	dnode_children_t *children_dnodes = dbu;
+	int i;
+
+	for (i = 0; i < children_dnodes->dnc_count; i++) {
+		dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+		dnode_t *dn;
+
+		/*
+		 * The dnode handle lock guards against the dnode moving to
+		 * another valid address, so there is no need here to guard
+		 * against changes to or from NULL.
+		 */
+		if (dnh->dnh_dnode == NULL) {
+			zrl_destroy(&dnh->dnh_zrlock);
+			continue;
+		}
+
+		zrl_add(&dnh->dnh_zrlock);
+		dn = dnh->dnh_dnode;
+		/*
+		 * If there are holds on this dnode, then there should
+		 * be holds on the dnode's containing dbuf as well; thus
+		 * it wouldn't be eligible for eviction and this function
+		 * would not have been called.
+		 */
+		ASSERT(refcount_is_zero(&dn->dn_holds));
+		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+
+		dnode_destroy(dn); /* implicit zrl_remove() */
+		zrl_destroy(&dnh->dnh_zrlock);
+		dnh->dnh_dnode = NULL;
+	}
+	kmem_free(children_dnodes, sizeof (dnode_children_t) +
+	    children_dnodes->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * succeeds even for free dnodes.
+ */
+int
+dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+    void *tag, dnode_t **dnp)
+{
+	int epb, idx, err;
+	int drop_struct_lock = FALSE;
+	int type;
+	uint64_t blk;
+	dnode_t *mdn, *dn;
+	dmu_buf_impl_t *db;
+	dnode_children_t *children_dnodes;
+	dnode_handle_t *dnh;
+
+	/*
+	 * If you are holding the spa config lock as writer, you shouldn't
+	 * be asking the DMU to do *anything* unless it's the root pool
+	 * which may require us to read from the root filesystem while
+	 * holding some (not all) of the locks as writer.
+	 */
+	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+	    (spa_is_root(os->os_spa) &&
+	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
+
+	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
+		dn = (object == DMU_USERUSED_OBJECT) ?
+		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
+		if (dn == NULL)
+			return (SET_ERROR(ENOENT));
+		type = dn->dn_type;
+		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+			return (SET_ERROR(ENOENT));
+		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+			return (SET_ERROR(EEXIST));
+		DNODE_VERIFY(dn);
+		(void) refcount_add(&dn->dn_holds, tag);
+		*dnp = dn;
+		return (0);
+	}
+
+	if (object == 0 || object >= DN_MAX_OBJECT)
+		return (SET_ERROR(EINVAL));
+
+	mdn = DMU_META_DNODE(os);
+	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
+
+	DNODE_VERIFY(mdn);
+
+	if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+
+	db = dbuf_hold(mdn, blk, FTAG);
+	if (drop_struct_lock)
+		rw_exit(&mdn->dn_struct_rwlock);
+	if (db == NULL)
+		return (SET_ERROR(EIO));
+	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+	if (err) {
+		dbuf_rele(db, FTAG);
+		return (err);
+	}
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb-1);
+
+	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
+	children_dnodes = dmu_buf_get_user(&db->db);
+	if (children_dnodes == NULL) {
+		int i;
+		dnode_children_t *winner;
+		children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+		    epb * sizeof (dnode_handle_t), KM_SLEEP);
+		children_dnodes->dnc_count = epb;
+		dnh = &children_dnodes->dnc_children[0];
+		for (i = 0; i < epb; i++) {
+			zrl_init(&dnh[i].dnh_zrlock);
+		}
+		dmu_buf_init_user(&children_dnodes->dnc_dbu,
+		    dnode_buf_pageout, NULL);
+		winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+		if (winner != NULL) {
+
+			for (i = 0; i < epb; i++) {
+				zrl_destroy(&dnh[i].dnh_zrlock);
+			}
+
+			kmem_free(children_dnodes, sizeof (dnode_children_t) +
+			    epb * sizeof (dnode_handle_t));
+			children_dnodes = winner;
+		}
+	}
+	ASSERT(children_dnodes->dnc_count == epb);
+
+	dnh = &children_dnodes->dnc_children[idx];
+	zrl_add(&dnh->dnh_zrlock);
+	dn = dnh->dnh_dnode;
+	if (dn == NULL) {
+		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
+
+		dn = dnode_create(os, phys, db, object, dnh);
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	type = dn->dn_type;
+	if (dn->dn_free_txg ||
+	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+	    ((flag & DNODE_MUST_BE_FREE) &&
+	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+		mutex_exit(&dn->dn_mtx);
+		zrl_remove(&dnh->dnh_zrlock);
+		dbuf_rele(db, FTAG);
+		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
+	}
+	if (refcount_add(&dn->dn_holds, tag) == 1)
+		dbuf_add_ref(db, dnh);
+	mutex_exit(&dn->dn_mtx);
+
+	/* Now we can rely on the hold to prevent the dnode from moving. */
+	zrl_remove(&dnh->dnh_zrlock);
+
+	DNODE_VERIFY(dn);
+	ASSERT3P(dn->dn_dbuf, ==, db);
+	ASSERT3U(dn->dn_object, ==, object);
+	dbuf_rele(db, FTAG);
+
+	*dnp = dn;
+	return (0);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ */
+int
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+}
+
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode.  Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+	mutex_enter(&dn->dn_mtx);
+	if (refcount_is_zero(&dn->dn_holds)) {
+		mutex_exit(&dn->dn_mtx);
+		return (FALSE);
+	}
+	VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+	mutex_exit(&dn->dn_mtx);
+	return (TRUE);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+	mutex_enter(&dn->dn_mtx);
+	dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
+{
+	uint64_t refs;
+	/* Get while the hold prevents the dnode from moving. */
+	dmu_buf_impl_t *db = dn->dn_dbuf;
+	dnode_handle_t *dnh = dn->dn_handle;
+
+	refs = refcount_remove(&dn->dn_holds, tag);
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
+	 * indirectly by dbuf_rele() while relying on the dnode handle to
+	 * prevent the dnode from moving, since releasing the last hold could
+	 * result in the dnode's parent dbuf evicting its dnode handles. For
+	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
+	 * other direct or indirect hold on the dnode must first drop the dnode
+	 * handle.
+	 */
+	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
+	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+	if (refs == 0 && db != NULL) {
+		/*
+		 * Another thread could add a hold to the dnode handle in
+		 * dnode_hold_impl() while holding the parent dbuf. Since the
+		 * hold on the parent dbuf prevents the handle from being
+		 * destroyed, the hold on the handle is OK. We can't yet assert
+		 * that the handle has zero references, but that will be
+		 * asserted anyway when the handle gets destroyed.
+		 */
+		dbuf_rele(db, dnh);
+	}
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+	objset_t *os = dn->dn_objset;
+	uint64_t txg = tx->tx_txg;
+
+	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		dsl_dataset_dirty(os->os_dsl_dataset, tx);
+		return;
+	}
+
+	DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+	mutex_enter(&dn->dn_mtx);
+	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
+	mutex_exit(&dn->dn_mtx);
+#endif
+
+	/*
+	 * Determine old uid/gid when necessary
+	 */
+	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * If we are already marked dirty, we're done.
+	 */
+	if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+		mutex_exit(&os->os_lock);
+		return;
+	}
+
+	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+	    !avl_is_empty(&dn->dn_dbufs));
+	ASSERT(dn->dn_datablksz != 0);
+	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
+	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
+	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
+
+	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+	    dn->dn_object, txg);
+
+	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
+		list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
+	} else {
+		list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
+	}
+
+	mutex_exit(&os->os_lock);
+
+	/*
+	 * The dnode maintains a hold on its containing dbuf as
+	 * long as there are holds on it.  Each instantiated child
+	 * dbuf maintains a hold on the dnode.  When the last child
+	 * drops its hold, the dnode will drop its hold on the
+	 * containing dbuf. We add a "dirty hold" here so that the
+	 * dnode will hang around after we finish processing its
+	 * children.
+	 */
+	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+
+	(void) dbuf_dirty(dn->dn_dbuf, tx);
+
+	dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
+
+	/* we should be the only holder... hopefully */
+	/* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+	dn->dn_free_txg = tx->tx_txg;
+	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * If the dnode is already dirty, it needs to be moved from
+	 * the dirty list to the free list.
+	 */
+	mutex_enter(&dn->dn_objset->os_lock);
+	if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+		list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
+		list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
+		mutex_exit(&dn->dn_objset->os_lock);
+	} else {
+		mutex_exit(&dn->dn_objset->os_lock);
+		dnode_setdirty(dn, tx);
+	}
+}
+
+/*
+ * Try to change the block size for the indicated dnode.  This can only
+ * succeed if there are no blocks allocated or dirty beyond first block
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int err;
+
+	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+	if (size == 0)
+		size = SPA_MINBLOCKSIZE;
+	else
+		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
+	if (ibs == dn->dn_indblkshift)
+		ibs = 0;
+
+	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+		return (0);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* Check for any allocated blocks beyond the first */
+	if (dn->dn_maxblkid != 0)
+		goto fail;
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = avl_first(&dn->dn_dbufs); db != NULL;
+	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
+		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+		    db->db_blkid != DMU_SPILL_BLKID) {
+			mutex_exit(&dn->dn_dbufs_mtx);
+			goto fail;
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	if (ibs && dn->dn_nlevels != 1)
+		goto fail;
+
+	/* resize the old block */
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	if (err == 0)
+		dbuf_new_size(db, size, tx);
+	else if (err != ENOENT)
+		goto fail;
+
+	dnode_setdblksz(dn, size);
+	dnode_setdirty(dn, tx);
+	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+	if (ibs) {
+		dn->dn_indblkshift = ibs;
+		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+	}
+	/* rele after we have fixed the blocksize in the dnode */
+	if (db)
+		dbuf_rele(db, FTAG);
+
+	rw_exit(&dn->dn_struct_rwlock);
+	return (0);
+
+fail:
+	rw_exit(&dn->dn_struct_rwlock);
+	return (SET_ERROR(ENOTSUP));
+}
+
+/* read-holding callers must not rely on the lock being continuously held */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+{
+	uint64_t txgoff = tx->tx_txg & TXG_MASK;
+	int epbs, new_nlevels;
+	uint64_t sz;
+
+	ASSERT(blkid != DMU_BONUS_BLKID);
+
+	ASSERT(have_read ?
+	    RW_READ_HELD(&dn->dn_struct_rwlock) :
+	    RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	/*
+	 * if we have a read-lock, check to see if we need to do any work
+	 * before upgrading to a write-lock.
+	 */
+	if (have_read) {
+		if (blkid <= dn->dn_maxblkid)
+			return;
+
+		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+			rw_exit(&dn->dn_struct_rwlock);
+			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		}
+	}
+
+	if (blkid <= dn->dn_maxblkid)
+		goto out;
+
+	dn->dn_maxblkid = blkid;
+
+	/*
+	 * Compute the number of levels necessary to support the new maxblkid.
+	 */
+	new_nlevels = 1;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	for (sz = dn->dn_nblkptr;
+	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+		new_nlevels++;
+
+	if (new_nlevels > dn->dn_nlevels) {
+		int old_nlevels = dn->dn_nlevels;
+		dmu_buf_impl_t *db;
+		list_t *list;
+		dbuf_dirty_record_t *new, *dr, *dr_next;
+
+		dn->dn_nlevels = new_nlevels;
+
+		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+		dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+		/* dirty the left indirects */
+		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		ASSERT(db != NULL);
+		new = dbuf_dirty(db, tx);
+		dbuf_rele(db, FTAG);
+
+		/* transfer the dirty records to the new indirect */
+		mutex_enter(&dn->dn_mtx);
+		mutex_enter(&new->dt.di.dr_mtx);
+		list = &dn->dn_dirty_records[txgoff];
+		for (dr = list_head(list); dr; dr = dr_next) {
+			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+				list_remove(&dn->dn_dirty_records[txgoff], dr);
+				list_insert_tail(&new->dt.di.dr_children, dr);
+				dr->dr_parent = new;
+			}
+		}
+		mutex_exit(&new->dt.di.dr_mtx);
+		mutex_exit(&dn->dn_mtx);
+	}
+
+out:
+	if (have_read)
+		rw_downgrade(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
+	}
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	uint64_t blkoff, blkid, nblks;
+	int blksz, blkshift, head, tail;
+	int trunc = FALSE;
+	int epbs;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	blksz = dn->dn_datablksz;
+	blkshift = dn->dn_datablkshift;
+	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	if (len == DMU_OBJECT_END) {
+		len = UINT64_MAX - off;
+		trunc = TRUE;
+	}
+
+	/*
+	 * First, block align the region to free:
+	 */
+	if (ISP2(blksz)) {
+		head = P2NPHASE(off, blksz);
+		blkoff = P2PHASE(off, blksz);
+		if ((off >> blkshift) > dn->dn_maxblkid)
+			goto out;
+	} else {
+		ASSERT(dn->dn_maxblkid == 0);
+		if (off == 0 && len >= blksz) {
+			/*
+			 * Freeing the whole block; fast-track this request.
+			 * Note that we won't dirty any indirect blocks,
+			 * which is fine because we will be freeing the entire
+			 * file and thus all indirect blocks will be freed
+			 * by free_children().
+			 */
+			blkid = 0;
+			nblks = 1;
+			goto done;
+		} else if (off >= blksz) {
+			/* Freeing past end-of-data */
+			goto out;
+		} else {
+			/* Freeing part of the block. */
+			head = blksz - off;
+			ASSERT3U(head, >, 0);
+		}
+		blkoff = off;
+	}
+	/* zero out any partial block data at the start of the range */
+	if (head) {
+		ASSERT3U(blkoff + head, ==, blksz);
+		if (len < head)
+			head = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
+		    FTAG, &db) == 0) {
+			caddr_t data;
+
+			/* don't dirty if it isn't on disk and isn't dirty */
+			if (db->db_last_dirty ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dmu_buf_will_dirty(&db->db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				data = db->db.db_data;
+				bzero(data + blkoff, head);
+			}
+			dbuf_rele(db, FTAG);
+		}
+		off += head;
+		len -= head;
+	}
+
+	/* If the range was less than one block, we're done */
+	if (len == 0)
+		goto out;
+
+	/* If the remaining range is past end of file, we're done */
+	if ((off >> blkshift) > dn->dn_maxblkid)
+		goto out;
+
+	ASSERT(ISP2(blksz));
+	if (trunc)
+		tail = 0;
+	else
+		tail = P2PHASE(len, blksz);
+
+	ASSERT0(P2PHASE(off, blksz));
+	/* zero out any partial block data at the end of the range */
+	if (tail) {
+		if (len < tail)
+			tail = len;
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
+		    TRUE, FTAG, &db) == 0) {
+			/* don't dirty if not on disk and not dirty */
+			if (db->db_last_dirty ||
+			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
+				rw_exit(&dn->dn_struct_rwlock);
+				dmu_buf_will_dirty(&db->db, tx);
+				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+				bzero(db->db.db_data, tail);
+			}
+			dbuf_rele(db, FTAG);
+		}
+		len -= tail;
+	}
+
+	/* If the range did not include a full block, we are done */
+	if (len == 0)
+		goto out;
+
+	ASSERT(IS_P2ALIGNED(off, blksz));
+	ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+	blkid = off >> blkshift;
+	nblks = len >> blkshift;
+	if (trunc)
+		nblks += 1;
+
+	/*
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
+	 */
+	if (dn->dn_nlevels > 1) {
+		uint64_t first, last, i, ibyte;
+		int shift, err;
+
+		first = blkid >> epbs;
+		dnode_dirty_l1(dn, first, tx);
+		if (trunc)
+			last = dn->dn_maxblkid >> epbs;
+		else
+			last = (blkid + nblks - 1) >> epbs;
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
+
+		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
+			ibyte = i << shift;
+			err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (i >= last)
+				break;
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
+		}
+	}
+
+done:
+	/*
+	 * Add this range to the dnode range list.
+	 * We will finish up this free operation in the syncing phase.
+	 */
+	mutex_enter(&dn->dn_mtx);
+	{
+	int txgoff = tx->tx_txg & TXG_MASK;
+	if (dn->dn_free_ranges[txgoff] == NULL) {
+		dn->dn_free_ranges[txgoff] =
+		    range_tree_create(NULL, NULL, &dn->dn_mtx);
+	}
+	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+	}
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
+	mutex_exit(&dn->dn_mtx);
+
+	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
+	dnode_setdirty(dn, tx);
+out:
+
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+	int i;
+
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	return (i < TXG_SIZE);
+}
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+	int i;
+
+	if (blkid == DMU_BONUS_BLKID)
+		return (FALSE);
+
+	/*
+	 * If we're in the process of opening the pool, dp will not be
+	 * set yet, but there shouldn't be anything dirty.
+	 */
+	if (dp == NULL)
+		return (FALSE);
+
+	if (dn->dn_free_txg)
+		return (TRUE);
+
+	if (blkid == DMU_SPILL_BLKID)
+		return (dnode_spill_freed(dn));
+
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (dn->dn_free_ranges[i] != NULL &&
+		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+	uint64_t space;
+	dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+	    dn, dn->dn_phys,
+	    (u_longlong_t)dn->dn_phys->dn_used,
+	    (longlong_t)delta);
+
+	mutex_enter(&dn->dn_mtx);
+	space = DN_USED_BYTES(dn->dn_phys);
+	if (delta > 0) {
+		ASSERT3U(space + delta, >=, space); /* no overflow */
+	} else {
+		ASSERT3U(space, >=, -delta); /* no underflow */
+	}
+	space += delta;
+	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
+		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
+		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+	} else {
+		dn->dn_phys->dn_used = space;
+		dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+	}
+	mutex_exit(&dn->dn_mtx);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
+ */
+void
+dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
+{
+	objset_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int64_t aspace = spa_get_asize(os->os_spa, space);
+
+	if (ds != NULL) {
+		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+	}
+
+	dmu_tx_willuse_space(tx, aspace);
+}
+
+/*
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers.  If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
+	int lvl, uint64_t blkfill, uint64_t txg)
+{
+	dmu_buf_impl_t *db = NULL;
+	void *data = NULL;
+	uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t epb = 1ULL << epbs;
+	uint64_t minfill, maxfill;
+	boolean_t hole;
+	int i, inc, error, span;
+
+	dprintf("probing object %llu offset %llx level %d of %u\n",
+	    dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+
+	hole = ((flags & DNODE_FIND_HOLE) != 0);
+	inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+	ASSERT(txg == 0 || !hole);
+
+	if (lvl == dn->dn_phys->dn_nlevels) {
+		error = 0;
+		epb = dn->dn_phys->dn_nblkptr;
+		data = dn->dn_phys->dn_blkptr;
+	} else {
+		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		if (error) {
+			if (error != ENOENT)
+				return (error);
+			if (hole)
+				return (0);
+			/*
+			 * This can only happen when we are searching up
+			 * the block tree for data.  We don't really need to
+			 * adjust the offset, as we will just end up looking
+			 * at the pointer to this block in its parent, and its
+			 * going to be unallocated, so we will skip over it.
+			 */
+			return (SET_ERROR(ESRCH));
+		}
+		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+		if (error) {
+			dbuf_rele(db, FTAG);
+			return (error);
+		}
+		data = db->db.db_data;
+	}
+
+
+	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+	    db->db_blkptr->blk_birth <= txg ||
+	    BP_IS_HOLE(db->db_blkptr))) {
+		/*
+		 * This can only happen when we are searching up the tree
+		 * and these conditions mean that we need to keep climbing.
+		 */
+		error = SET_ERROR(ESRCH);
+	} else if (lvl == 0) {
+		dnode_phys_t *dnp = data;
+		span = DNODE_SHIFT;
+		ASSERT(dn->dn_type == DMU_OT_DNODE);
+
+		for (i = (*offset >> span) & (blkfill - 1);
+		    i >= 0 && i < blkfill; i += inc) {
+			if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
+				break;
+			*offset += (1ULL << span) * inc;
+		}
+		if (i < 0 || i == blkfill)
+			error = SET_ERROR(ESRCH);
+	} else {
+		blkptr_t *bp = data;
+		uint64_t start = *offset;
+		span = (lvl - 1) * epbs + dn->dn_datablkshift;
+		minfill = 0;
+		maxfill = blkfill << ((lvl - 1) * epbs);
+
+		if (hole)
+			maxfill--;
+		else
+			minfill++;
+
+		*offset = *offset >> span;
+		for (i = BF64_GET(*offset, 0, epbs);
+		    i >= 0 && i < epb; i += inc) {
+			if (BP_GET_FILL(&bp[i]) >= minfill &&
+			    BP_GET_FILL(&bp[i]) <= maxfill &&
+			    (hole || bp[i].blk_birth > txg))
+				break;
+			if (inc > 0 || *offset > 0)
+				*offset += inc;
+		}
+		*offset = *offset << span;
+		if (inc < 0) {
+			/* traversing backwards; position offset at the end */
+			ASSERT3U(*offset, <=, start);
+			*offset = MIN(*offset + (1ULL << span) - 1, start);
+		} else if (*offset < start) {
+			*offset = start;
+		}
+		if (i < 0 || i >= epb)
+			error = SET_ERROR(ESRCH);
+	}
+
+	if (db)
+		dbuf_rele(db, FTAG);
+
+	return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ *	Finds the next/previous hole/data in a file.
+ *	Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
+ *	Finds the next free/allocated dnode an objset's meta-dnode.
+ *	Only finds objects that have new contents since txg (ie.
+ *	bonus buffer changes and content removal are ignored).
+ *	Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ *	Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ *	Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
+    int minlvl, uint64_t blkfill, uint64_t txg)
+{
+	uint64_t initial_offset = *offset;
+	int lvl, maxlvl;
+	int error = 0;
+
+	if (!(flags & DNODE_FIND_HAVELOCK))
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	if (dn->dn_phys->dn_nlevels == 0) {
+		error = SET_ERROR(ESRCH);
+		goto out;
+	}
+
+	if (dn->dn_datablkshift == 0) {
+		if (*offset < dn->dn_datablksz) {
+			if (flags & DNODE_FIND_HOLE)
+				*offset = dn->dn_datablksz;
+		} else {
+			error = SET_ERROR(ESRCH);
+		}
+		goto out;
+	}
+
+	maxlvl = dn->dn_phys->dn_nlevels;
+
+	for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+		error = dnode_next_offset_level(dn,
+		    flags, offset, lvl, blkfill, txg);
+		if (error != ESRCH)
+			break;
+	}
+
+	while (error == 0 && --lvl >= minlvl) {
+		error = dnode_next_offset_level(dn,
+		    flags, offset, lvl, blkfill, txg);
+	}
+
+	/*
+	 * There's always a "virtual hole" at the end of the object, even
+	 * if all BP's which physically exist are non-holes.
+	 */
+	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+		error = 0;
+	}
+
+	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+	    initial_offset < *offset : initial_offset > *offset))
+		error = SET_ERROR(ESRCH);
+out:
+	if (!(flags & DNODE_FIND_HAVELOCK))
+		rw_exit(&dn->dn_struct_rwlock);
+
+	return (error);
+}
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index df5c8e4ee6c4..b47395a1e26a 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -192,7 +192,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		err = dbuf_hold_impl(dn, db->db_level-1,
-		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
 		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
 			continue;
@@ -288,7 +288,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 				continue;
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
-			    i, B_TRUE, FTAG, &subdb));
+			    i, TRUE, FALSE, FTAG, &subdb));
 			rw_exit(&dn->dn_struct_rwlock);
 			ASSERT3P(bp, ==, subdb->db_blkptr);
 
@@ -362,7 +362,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
 				continue;
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
-			    TRUE, FTAG, &db));
+			    TRUE, FALSE, FTAG, &db));
 			rw_exit(&dn->dn_struct_rwlock);
 
 			free_children(db, blkid, nblks, tx);
diff --git a/module/zfs/dnode_sync.c.orig b/module/zfs/dnode_sync.c.orig
new file mode 100644
index 000000000000..df5c8e4ee6c4
--- /dev/null
+++ b/module/zfs/dnode_sync.c.orig
@@ -0,0 +1,737 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	int nblkptr = dn->dn_phys->dn_nblkptr;
+	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+	int new_level = dn->dn_next_nlevels[txgoff];
+	int i;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* this dnode can't be paged out because it's dirty */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+	ASSERT(db != NULL);
+
+	dn->dn_phys->dn_nlevels = new_level;
+	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+	    dn->dn_object, dn->dn_phys->dn_nlevels);
+
+	/* check for existing blkptrs in the dnode */
+	for (i = 0; i < nblkptr; i++)
+		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+			break;
+	if (i != nblkptr) {
+		/* transfer dnode's block pointers to new indirect block */
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+		ASSERT(db->db.db_data);
+		ASSERT(arc_released(db->db_buf));
+		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+		    sizeof (blkptr_t) * nblkptr);
+		arc_buf_freeze(db->db_buf);
+	}
+
+	/* set dbuf's parent pointers to new indirect buf */
+	for (i = 0; i < nblkptr; i++) {
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+
+		if (child == NULL)
+			continue;
+#ifdef	DEBUG
+		DB_DNODE_ENTER(child);
+		ASSERT3P(DB_DNODE(child), ==, dn);
+		DB_DNODE_EXIT(child);
+#endif	/* DEBUG */
+		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+			ASSERT(child->db_parent->db_level == db->db_level);
+			ASSERT(child->db_blkptr !=
+			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
+			mutex_exit(&child->db_mtx);
+			continue;
+		}
+		ASSERT(child->db_parent == NULL ||
+		    child->db_parent == dn->dn_dbuf);
+
+		child->db_parent = db;
+		dbuf_add_ref(db, child);
+		if (db->db.db_data)
+			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+		else
+			child->db_blkptr = NULL;
+		dprintf_dbuf_bp(child, child->db_blkptr,
+		    "changed db_blkptr to new indirect %s", "");
+
+		mutex_exit(&child->db_mtx);
+	}
+
+	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+	dbuf_rele(db, FTAG);
+
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	uint64_t bytesfreed = 0;
+	int i;
+
+	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
+
+	for (i = 0; i < num; i++, bp++) {
+		uint64_t lsize, lvl;
+		dmu_object_type_t type;
+
+		if (BP_IS_HOLE(bp))
+			continue;
+
+		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
+		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+		/*
+		 * Save some useful information on the holes being
+		 * punched, including logical size, type, and indirection
+		 * level. Retaining birth time enables detection of when
+		 * holes are punched for reducing the number of free
+		 * records transmitted during a zfs send.
+		 */
+
+		lsize = BP_GET_LSIZE(bp);
+		type = BP_GET_TYPE(bp);
+		lvl = BP_GET_LEVEL(bp);
+
+		bzero(bp, sizeof (blkptr_t));
+
+		if (spa_feature_is_active(dn->dn_objset->os_spa,
+		    SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, type);
+			BP_SET_LEVEL(bp, lvl);
+			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+		}
+	}
+	dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+	int off, num;
+	int i, err, epbs;
+	uint64_t txg = tx->tx_txg;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	off = start - (db->db_blkid * 1<<epbs);
+	num = end - start + 1;
+
+	ASSERT3U(off, >=, 0);
+	ASSERT3U(num, >=, 0);
+	ASSERT3U(db->db_level, >, 0);
+	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+	ASSERT(db->db_blkptr != NULL);
+
+	for (i = off; i < off+num; i++) {
+		uint64_t *buf;
+		dmu_buf_impl_t *child;
+		dbuf_dirty_record_t *dr;
+		int j;
+
+		ASSERT(db->db_level == 1);
+
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1,
+		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+		rw_exit(&dn->dn_struct_rwlock);
+		if (err == ENOENT)
+			continue;
+		ASSERT(err == 0);
+		ASSERT(child->db_level == 0);
+		dr = child->db_last_dirty;
+		while (dr && dr->dr_txg > txg)
+			dr = dr->dr_next;
+		ASSERT(dr == NULL || dr->dr_txg == txg);
+
+		/* data_old better be zeroed */
+		if (dr) {
+			buf = dr->dt.dl.dr_data->b_data;
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    (void *)child, i, off, num);
+				}
+			}
+		}
+
+		/*
+		 * db_data better be zeroed unless it's dirty in a
+		 * future txg.
+		 */
+		mutex_enter(&child->db_mtx);
+		buf = child->db.db_data;
+		if (buf != NULL && child->db_state != DB_FILL &&
+		    child->db_last_dirty == NULL) {
+			for (j = 0; j < child->db.db_size >> 3; j++) {
+				if (buf[j] != 0) {
+					panic("freed data not zero: "
+					    "child=%p i=%d off=%d num=%d\n",
+					    (void *)child, i, off, num);
+				}
+			}
+		}
+		mutex_exit(&child->db_mtx);
+
+		dbuf_rele(child, FTAG);
+	}
+	DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	blkptr_t *bp;
+	dmu_buf_impl_t *subdb;
+	uint64_t start, end, dbstart, dbend, i;
+	int epbs, shift;
+
+	/*
+	 * There is a small possibility that this block will not be cached:
+	 *   1 - if level > 1 and there are no children with level <= 1
+	 *   2 - if this block was evicted since we read it from
+	 *	 dmu_tx_hold_free().
+	 */
+	if (db->db_state != DB_CACHED)
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
+	dbuf_release_bp(db);
+	bp = db->db.db_data;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	shift = (db->db_level - 1) * epbs;
+	dbstart = db->db_blkid << epbs;
+	start = blkid >> shift;
+	if (dbstart < start) {
+		bp += start - dbstart;
+	} else {
+		start = dbstart;
+	}
+	dbend = ((db->db_blkid + 1) << epbs) - 1;
+	end = (blkid + nblks - 1) >> shift;
+	if (dbend <= end)
+		end = dbend;
+
+	ASSERT3U(start, <=, end);
+
+	if (db->db_level == 1) {
+		FREE_VERIFY(db, start, end, tx);
+		free_blocks(dn, bp, end-start+1, tx);
+	} else {
+		for (i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+			    i, B_TRUE, FTAG, &subdb));
+			rw_exit(&dn->dn_struct_rwlock);
+			ASSERT3P(bp, ==, subdb->db_blkptr);
+
+			free_children(subdb, blkid, nblks, tx);
+			dbuf_rele(subdb, FTAG);
+		}
+	}
+
+	/* If this whole block is free, free ourself too. */
+	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
+	}
+	if (i == 1 << epbs) {
+		/* didn't find any non-holes */
+		bzero(db->db.db_data, db->db.db_size);
+		free_blocks(dn, db->db_blkptr, 1, tx);
+	} else {
+		/*
+		 * Partial block free; must be marked dirty so that it
+		 * will be written out.
+		 */
+		ASSERT(db->db_dirtycnt > 0);
+	}
+
+	DB_DNODE_EXIT(db);
+	arc_buf_freeze(db->db_buf);
+}
+
+/*
+ * Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+    dmu_tx_t *tx)
+{
+	blkptr_t *bp = dn->dn_phys->dn_blkptr;
+	int dnlevel = dn->dn_phys->dn_nlevels;
+	boolean_t trunc = B_FALSE;
+
+	if (blkid > dn->dn_phys->dn_maxblkid)
+		return;
+
+	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
+		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+		trunc = B_TRUE;
+	}
+
+	/* There are no indirect blocks in the object */
+	if (dnlevel == 1) {
+		if (blkid >= dn->dn_phys->dn_nblkptr) {
+			/* this range was never made persistent */
+			return;
+		}
+		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+		free_blocks(dn, bp + blkid, nblks, tx);
+	} else {
+		int shift = (dnlevel - 1) *
+		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+		int start = blkid >> shift;
+		int end = (blkid + nblks - 1) >> shift;
+		dmu_buf_impl_t *db;
+		int i;
+
+		ASSERT(start < dn->dn_phys->dn_nblkptr);
+		bp += start;
+		for (i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+			    TRUE, FTAG, &db));
+			rw_exit(&dn->dn_struct_rwlock);
+
+			free_children(db, blkid, nblks, tx);
+			dbuf_rele(db, FTAG);
+		}
+	}
+
+	if (trunc) {
+		ASSERTV(uint64_t off);
+		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
+		ASSERTV(off = (dn->dn_phys->dn_maxblkid + 1) *
+		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT));
+		ASSERT(off < dn->dn_phys->dn_maxblkid ||
+		    dn->dn_phys->dn_maxblkid == 0 ||
+		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
+	}
+}
+
+typedef struct dnode_sync_free_range_arg {
+	dnode_t *dsfra_dnode;
+	dmu_tx_t *dsfra_tx;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+	dnode_sync_free_range_arg_t *dsfra = arg;
+	dnode_t *dn = dsfra->dsfra_dnode;
+
+	mutex_exit(&dn->dn_mtx);
+	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+	mutex_enter(&dn->dn_mtx);
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+	dmu_buf_impl_t *db_marker;
+	dmu_buf_impl_t *db, *db_next;
+
+	db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
+#ifdef	DEBUG
+		DB_DNODE_ENTER(db);
+		ASSERT3P(DB_DNODE(db), ==, dn);
+		DB_DNODE_EXIT(db);
+#endif	/* DEBUG */
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state != DB_EVICTING &&
+		    refcount_is_zero(&db->db_holds)) {
+			db_marker->db_level = db->db_level;
+			db_marker->db_blkid = db->db_blkid;
+			db_marker->db_state = DB_SEARCH;
+			avl_insert_here(&dn->dn_dbufs, db_marker, db,
+			    AVL_BEFORE);
+
+			dbuf_clear(db);
+
+			db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
+			avl_remove(&dn->dn_dbufs, db_marker);
+		} else {
+			db->db_pending_evict = TRUE;
+			mutex_exit(&db->db_mtx);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		}
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
+
+	kmem_free(db_marker, sizeof (dmu_buf_impl_t));
+
+	dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (dn->dn_bonus != NULL) {
+		if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+			mutex_enter(&dn->dn_bonus->db_mtx);
+			dbuf_evict(dn->dn_bonus);
+			dn->dn_bonus = NULL;
+		} else {
+			dn->dn_bonus->db_pending_evict = TRUE;
+		}
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+	dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list))) {
+		dmu_buf_impl_t *db = dr->dr_dbuf;
+		uint64_t txg = dr->dr_txg;
+
+		if (db->db_level != 0)
+			dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(list, dr);
+		ASSERT(db->db_last_dirty == dr);
+		db->db_last_dirty = NULL;
+		db->db_dirtycnt -= 1;
+		if (db->db_level == 0) {
+			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+			    dr->dt.dl.dr_data == db->db_buf);
+			dbuf_unoverride(dr);
+		} else {
+			mutex_destroy(&dr->dt.di.dr_mtx);
+			list_destroy(&dr->dt.di.dr_children);
+		}
+		kmem_free(dr, sizeof (dbuf_dirty_record_t));
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+	}
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+	int txgoff = tx->tx_txg & TXG_MASK;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/*
+	 * Our contents should have been freed in dnode_sync() by the
+	 * free range record inserted by the caller of dnode_free().
+	 */
+	ASSERT0(DN_USED_BYTES(dn->dn_phys));
+	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
+	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+	dnode_evict_dbufs(dn);
+
+	/*
+	 * XXX - It would be nice to assert this, but we may still
+	 * have residual holds from async evictions from the arc...
+	 *
+	 * zfs_obj_to_path() also depends on this being
+	 * commented out.
+	 *
+	 * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+	 */
+
+	/* Undirty next bits */
+	dn->dn_next_nlevels[txgoff] = 0;
+	dn->dn_next_indblkshift[txgoff] = 0;
+	dn->dn_next_blksz[txgoff] = 0;
+
+	/* ASSERT(blkptrs are zero); */
+	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+	ASSERT(dn->dn_type != DMU_OT_NONE);
+
+	ASSERT(dn->dn_free_txg > 0);
+	if (dn->dn_allocated_txg != dn->dn_free_txg)
+		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+	bzero(dn->dn_phys, sizeof (dnode_phys_t));
+
+	mutex_enter(&dn->dn_mtx);
+	dn->dn_type = DMU_OT_NONE;
+	dn->dn_maxblkid = 0;
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_have_spill = B_FALSE;
+	mutex_exit(&dn->dn_mtx);
+
+	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+	dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	/*
+	 * Now that we've released our hold, the dnode may
+	 * be evicted, so we musn't access it.
+	 */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+	dnode_phys_t *dnp = dn->dn_phys;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	list_t *list = &dn->dn_dirty_records[txgoff];
+	boolean_t kill_spill = B_FALSE;
+	boolean_t freeing_dnode;
+	ASSERTV(static const dnode_phys_t zerodn = { 0 });
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+	ASSERT(dnp->dn_type != DMU_OT_NONE ||
+	    bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
+	DNODE_VERIFY(dn);
+
+	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+	if (dmu_objset_userused_enabled(dn->dn_objset) &&
+	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+		dn->dn_oldflags = dn->dn_phys->dn_flags;
+		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+		mutex_exit(&dn->dn_mtx);
+		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
+	} else {
+		/* Once we account for it, we should always account for it. */
+		ASSERT(!(dn->dn_phys->dn_flags &
+		    DNODE_FLAG_USERUSED_ACCOUNTED));
+	}
+
+	mutex_enter(&dn->dn_mtx);
+	if (dn->dn_allocated_txg == tx->tx_txg) {
+		/* The dnode is newly allocated or reallocated */
+		if (dnp->dn_type == DMU_OT_NONE) {
+			/* this is a first alloc, not a realloc */
+			dnp->dn_nlevels = 1;
+			dnp->dn_nblkptr = dn->dn_nblkptr;
+		}
+
+		dnp->dn_type = dn->dn_type;
+		dnp->dn_bonustype = dn->dn_bonustype;
+		dnp->dn_bonuslen = dn->dn_bonuslen;
+	}
+	ASSERT(dnp->dn_nlevels > 1 ||
+	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
+	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	ASSERT(dnp->dn_nlevels < 2 ||
+	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
+
+	if (dn->dn_next_type[txgoff] != 0) {
+		dnp->dn_type = dn->dn_type;
+		dn->dn_next_type[txgoff] = 0;
+	}
+
+	if (dn->dn_next_blksz[txgoff] != 0) {
+		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+		    SPA_MINBLOCKSIZE) == 0);
+		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+		    dnp->dn_datablkszsec ||
+		    range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
+		dnp->dn_datablkszsec =
+		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+		dn->dn_next_blksz[txgoff] = 0;
+	}
+
+	if (dn->dn_next_bonuslen[txgoff] != 0) {
+		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+			dnp->dn_bonuslen = 0;
+		else
+			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+		ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+		dn->dn_next_bonuslen[txgoff] = 0;
+	}
+
+	if (dn->dn_next_bonustype[txgoff] != 0) {
+		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
+		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+		dn->dn_next_bonustype[txgoff] = 0;
+	}
+
+	freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg;
+
+	/*
+	 * Remove the spill block if we have been explicitly asked to
+	 * remove it, or if the object is being removed.
+	 */
+	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+			kill_spill = B_TRUE;
+		dn->dn_rm_spillblk[txgoff] = 0;
+	}
+
+	if (dn->dn_next_indblkshift[txgoff] != 0) {
+		ASSERT(dnp->dn_nlevels == 1);
+		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+		dn->dn_next_indblkshift[txgoff] = 0;
+	}
+
+	/*
+	 * Just take the live (open-context) values for checksum and compress.
+	 * Strictly speaking it's a future leak, but nothing bad happens if we
+	 * start using the new checksum or compress algorithm a little early.
+	 */
+	dnp->dn_checksum = dn->dn_checksum;
+	dnp->dn_compress = dn->dn_compress;
+
+	mutex_exit(&dn->dn_mtx);
+
+	if (kill_spill) {
+		free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+		mutex_enter(&dn->dn_mtx);
+		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	/* process all the "freed" ranges in the file */
+	if (dn->dn_free_ranges[txgoff] != NULL) {
+		dnode_sync_free_range_arg_t dsfra;
+		dsfra.dsfra_dnode = dn;
+		dsfra.dsfra_tx = tx;
+		mutex_enter(&dn->dn_mtx);
+		range_tree_vacate(dn->dn_free_ranges[txgoff],
+		    dnode_sync_free_range, &dsfra);
+		range_tree_destroy(dn->dn_free_ranges[txgoff]);
+		dn->dn_free_ranges[txgoff] = NULL;
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	if (freeing_dnode) {
+		dnode_sync_free(dn, tx);
+		return;
+	}
+
+	if (dn->dn_next_nlevels[txgoff]) {
+		dnode_increase_indirection(dn, tx);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
+	if (dn->dn_next_nblkptr[txgoff]) {
+		/* this should only happen on a realloc */
+		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
+		if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
+			/* zero the new blkptrs we are gaining */
+			bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+			    sizeof (blkptr_t) *
+			    (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
+#ifdef ZFS_DEBUG
+		} else {
+			int i;
+			ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
+			/* the blkptrs we are losing better be unallocated */
+			for (i = 0; i < dnp->dn_nblkptr; i++) {
+				if (i >= dn->dn_next_nblkptr[txgoff])
+					ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
+			}
+#endif
+		}
+		mutex_enter(&dn->dn_mtx);
+		dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
+		dn->dn_next_nblkptr[txgoff] = 0;
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
+
+	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		ASSERT3P(list_head(list), ==, NULL);
+		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	}
+
+	/*
+	 * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until its written, and we haven't yet
+	 * initiated the IO for the dnode's dbuf.
+	 */
+}
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 829452b1d22a..edc5ea17a407 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -547,6 +547,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
+	dsl_dataset_t *ds;
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
 	if (err != 0)
@@ -555,36 +556,37 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name,
 	ASSERT(dsl_pool_config_held(dp));
 	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (obj != 0)
-		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 	else
 		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
-		dsl_dataset_t *ds;
+		dsl_dataset_t *snap_ds;
 
 		if (*snapname++ != '@') {
-			dsl_dataset_rele(*dsp, tag);
+			dsl_dataset_rele(ds, tag);
 			dsl_dir_rele(dd, FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
-		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
 		if (err == 0)
-			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
-		dsl_dataset_rele(*dsp, tag);
+			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+		dsl_dataset_rele(ds, tag);
 
 		if (err == 0) {
-			mutex_enter(&ds->ds_lock);
-			if (ds->ds_snapname[0] == 0)
-				(void) strlcpy(ds->ds_snapname, snapname,
-				    sizeof (ds->ds_snapname));
-			mutex_exit(&ds->ds_lock);
-			*dsp = ds;
+			mutex_enter(&snap_ds->ds_lock);
+			if (snap_ds->ds_snapname[0] == 0)
+				(void) strlcpy(snap_ds->ds_snapname, snapname,
+				    sizeof (snap_ds->ds_snapname));
+			mutex_exit(&snap_ds->ds_lock);
+			ds = snap_ds;
 		}
 	}
-
+	if (err == 0)
+		*dsp = ds;
 	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
diff --git a/module/zfs/dsl_dataset.c.orig b/module/zfs/dsl_dataset.c.orig
new file mode 100644
index 000000000000..829452b1d22a
--- /dev/null
+++ b/module/zfs/dsl_dataset.c.orig
@@ -0,0 +1,3451 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
+
+#define	DS_REF_MAX	(1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+/*
+ * Figure out how much of this delta should be propogated to the dsl_dir
+ * layer.  If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+	dsl_dataset_phys_t *ds_phys;
+	uint64_t old_bytes, new_bytes;
+
+	if (ds->ds_reserved == 0)
+		return (delta);
+
+	ds_phys = dsl_dataset_phys(ds);
+	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+	return (new_bytes - old_bytes);
+}
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	int used, compressed, uncompressed;
+	int64_t delta;
+
+	used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+	compressed = BP_GET_PSIZE(bp);
+	uncompressed = BP_GET_UCSIZE(bp);
+
+	dprintf_bp(bp, "ds=%p", ds);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* It could have been compressed away to nothing */
+	if (BP_IS_HOLE(bp))
+		return;
+	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
+	if (ds == NULL) {
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    used, compressed, uncompressed);
+		return;
+	}
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_lock);
+	delta = parent_delta(ds, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+	dsl_dataset_phys(ds)->ds_unique_bytes += used;
+	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+		    B_TRUE;
+	}
+	mutex_exit(&ds->ds_lock);
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+	    compressed, uncompressed, tx);
+	dsl_dir_transfer_space(ds->ds_dir, used - delta,
+	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
+}
+
+int
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+    boolean_t async)
+{
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(bp->blk_birth <= tx->tx_txg);
+
+	if (ds == NULL) {
+		dsl_free(tx->tx_pool, tx->tx_txg, bp);
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    -used, -compressed, -uncompressed);
+		return (used);
+	}
+	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+	ASSERT(!ds->ds_is_snapshot);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+		int64_t delta;
+
+		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+		dsl_free(tx->tx_pool, tx->tx_txg, bp);
+
+		mutex_enter(&ds->ds_lock);
+		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
+		    !DS_UNIQUE_IS_ACCURATE(ds));
+		delta = parent_delta(ds, -used);
+		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
+		mutex_exit(&ds->ds_lock);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+		    delta, -compressed, -uncompressed, tx);
+		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
+	} else {
+		dprintf_bp(bp, "putting on dead list: %s", "");
+		if (async) {
+			/*
+			 * We are here as part of zio's write done callback,
+			 * which means we're a zio interrupt thread.  We can't
+			 * call dsl_deadlist_insert() now because it may block
+			 * waiting for I/O.  Instead, put bp on the deferred
+			 * queue and let dsl_pool_sync() finish the job.
+			 */
+			bplist_append(&ds->ds_pending_deadlist, bp);
+		} else {
+			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
+		}
+		ASSERT3U(ds->ds_prev->ds_object, ==,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
+		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object && bp->blk_birth >
+		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
+			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+			mutex_enter(&ds->ds_prev->ds_lock);
+			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
+			mutex_exit(&ds->ds_prev->ds_lock);
+		}
+		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+			dsl_dir_transfer_space(ds->ds_dir, used,
+			    DD_USED_HEAD, DD_USED_SNAP, tx);
+		}
+	}
+	mutex_enter(&ds->ds_lock);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
+	mutex_exit(&ds->ds_lock);
+
+	return (used);
+}
+
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
+{
+	uint64_t trysnap = 0;
+
+	if (ds == NULL)
+		return (0);
+	/*
+	 * The snapshot creation could fail, but that would cause an
+	 * incorrect FALSE return, which would only result in an
+	 * overestimation of the amount of space that an operation would
+	 * consume, which is OK.
+	 *
+	 * There's also a small window where we could miss a pending
+	 * snapshot, because we could set the sync task in the quiescing
+	 * phase.  So this should only be used as a guess.
+	 */
+	if (ds->ds_trysnap_txg >
+	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
+		trysnap = ds->ds_trysnap_txg;
+	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
+}
+
+boolean_t
+dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth)
+{
+	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
+	    (bp != NULL && BP_IS_HOLE(bp)))
+		return (B_FALSE);
+
+	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+
+	return (B_TRUE);
+}
+
+static void
+dsl_dataset_evict(void *dbu)
+{
+	dsl_dataset_t *ds = dbu;
+
+	ASSERT(ds->ds_owner == NULL);
+
+	ds->ds_dbuf = NULL;
+
+	unique_remove(ds->ds_fsid_guid);
+
+	if (ds->ds_objset != NULL)
+		dmu_objset_evict(ds->ds_objset);
+
+	if (ds->ds_prev) {
+		dsl_dataset_rele(ds->ds_prev, ds);
+		ds->ds_prev = NULL;
+	}
+
+	bplist_destroy(&ds->ds_pending_deadlist);
+	if (ds->ds_deadlist.dl_os != NULL)
+		dsl_deadlist_close(&ds->ds_deadlist);
+	if (ds->ds_dir)
+		dsl_dir_async_rele(ds->ds_dir, ds);
+
+	ASSERT(!list_link_active(&ds->ds_synced_link));
+
+	mutex_destroy(&ds->ds_lock);
+	mutex_destroy(&ds->ds_opening_lock);
+	mutex_destroy(&ds->ds_sendstream_lock);
+	refcount_destroy(&ds->ds_longholds);
+
+	kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *headphys;
+	int err;
+	dmu_buf_t *headdbuf;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (ds->ds_snapname[0])
+		return (0);
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
+		return (0);
+
+	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+	    FTAG, &headdbuf);
+	if (err != 0)
+		return (err);
+	headphys = headdbuf->db_data;
+	err = zap_value_search(dp->dp_meta_objset,
+	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+	if (err != 0 && zfs_recover == B_TRUE) {
+		err = 0;
+		(void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),
+		    "SNAPOBJ=%llu-ERR=%d",
+		    (unsigned long long)ds->ds_object, err);
+	}
+	dmu_buf_rele(headdbuf, FTAG);
+	return (err);
+}
+
+int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+	matchtype_t mt;
+	int err;
+
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		mt = MT_FIRST;
+	else
+		mt = MT_EXACT;
+
+	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+	    value, mt, NULL, 0, NULL);
+	if (err == ENOTSUP && mt == MT_FIRST)
+		err = zap_lookup(mos, snapobj, name, 8, 1, value);
+	return (err);
+}
+
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+	matchtype_t mt;
+	int err;
+
+	dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		mt = MT_FIRST;
+	else
+		mt = MT_EXACT;
+
+	err = zap_remove_norm(mos, snapobj, name, mt, tx);
+	if (err == ENOTSUP && mt == MT_FIRST)
+		err = zap_remove(mos, snapobj, name, tx);
+
+	if (err == 0 && adj_cnt)
+		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
+	return (err);
+}
+
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+	dmu_buf_t *dbuf = ds->ds_dbuf;
+	boolean_t result = B_FALSE;
+
+	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+		if (ds == dmu_buf_get_user(dbuf))
+			result = B_TRUE;
+		else
+			dmu_buf_rele(dbuf, tag);
+	}
+
+	return (result);
+}
+
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+    dsl_dataset_t **dsp)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	dmu_buf_t *dbuf;
+	dsl_dataset_t *ds;
+	int err;
+	dmu_object_info_t doi;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+	if (err != 0)
+		return (err);
+
+	/* Make sure dsobj has the correct object type. */
+	dmu_object_info_from_db(dbuf, &doi);
+	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
+		dmu_buf_rele(dbuf, tag);
+		return (SET_ERROR(EINVAL));
+	}
+
+	ds = dmu_buf_get_user(dbuf);
+	if (ds == NULL) {
+		dsl_dataset_t *winner = NULL;
+
+		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+		ds->ds_dbuf = dbuf;
+		ds->ds_object = dsobj;
+		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
+		list_link_init(&ds->ds_synced_link);
+
+		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+		refcount_create(&ds->ds_longholds);
+
+		bplist_create(&ds->ds_pending_deadlist);
+		dsl_deadlist_open(&ds->ds_deadlist,
+		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+
+		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
+		    offsetof(dmu_sendarg_t, dsa_link));
+
+		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+			spa_feature_t f;
+
+			for (f = 0; f < SPA_FEATURES; f++) {
+				if (!(spa_feature_table[f].fi_flags &
+				    ZFEATURE_FLAG_PER_DATASET))
+					continue;
+				err = zap_contains(mos, dsobj,
+				    spa_feature_table[f].fi_guid);
+				if (err == 0) {
+					ds->ds_feature_inuse[f] = B_TRUE;
+				} else {
+					ASSERT3U(err, ==, ENOENT);
+					err = 0;
+				}
+			}
+		}
+
+		err = dsl_dir_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
+		if (err != 0) {
+			mutex_destroy(&ds->ds_lock);
+			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_sendstream_lock);
+			refcount_destroy(&ds->ds_longholds);
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			dmu_buf_rele(dbuf, tag);
+			return (err);
+		}
+
+		if (!ds->ds_is_snapshot) {
+			ds->ds_snapname[0] = '\0';
+			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+				err = dsl_dataset_hold_obj(dp,
+				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+				    ds, &ds->ds_prev);
+			}
+			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+				int zaperr = zap_lookup(mos, ds->ds_object,
+				    DS_FIELD_BOOKMARK_NAMES,
+				    sizeof (ds->ds_bookmarks), 1,
+				    &ds->ds_bookmarks);
+				if (zaperr != ENOENT)
+					VERIFY0(zaperr);
+			}
+		} else {
+			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+				err = dsl_dataset_get_snapname(ds);
+			if (err == 0 &&
+			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+				err = zap_count(
+				    ds->ds_dir->dd_pool->dp_meta_objset,
+				    dsl_dataset_phys(ds)->ds_userrefs_obj,
+				    &ds->ds_userrefs);
+			}
+		}
+
+		if (err == 0 && !ds->ds_is_snapshot) {
+			err = dsl_prop_get_int_ds(ds,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+			    &ds->ds_reserved);
+			if (err == 0) {
+				err = dsl_prop_get_int_ds(ds,
+				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+				    &ds->ds_quota);
+			}
+		} else {
+			ds->ds_reserved = ds->ds_quota = 0;
+		}
+
+		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
+		if (err == 0)
+			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+		if (err != 0 || winner != NULL) {
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
+			if (ds->ds_prev)
+				dsl_dataset_rele(ds->ds_prev, ds);
+			dsl_dir_rele(ds->ds_dir, ds);
+			mutex_destroy(&ds->ds_lock);
+			mutex_destroy(&ds->ds_opening_lock);
+			mutex_destroy(&ds->ds_sendstream_lock);
+			refcount_destroy(&ds->ds_longholds);
+			kmem_free(ds, sizeof (dsl_dataset_t));
+			if (err != 0) {
+				dmu_buf_rele(dbuf, tag);
+				return (err);
+			}
+			ds = winner;
+		} else {
+			ds->ds_fsid_guid =
+			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+		}
+	}
+	ASSERT3P(ds->ds_dbuf, ==, dbuf);
+	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
+	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
+	*dsp = ds;
+	return (0);
+}
+
+int
+dsl_dataset_hold(dsl_pool_t *dp, const char *name,
+    void *tag, dsl_dataset_t **dsp)
+{
+	dsl_dir_t *dd;
+	const char *snapname;
+	uint64_t obj;
+	int err = 0;
+
+	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+	if (err != 0)
+		return (err);
+
+	ASSERT(dsl_pool_config_held(dp));
+	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+	if (obj != 0)
+		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+	else
+		err = SET_ERROR(ENOENT);
+
+	/* we may be looking for a snapshot */
+	if (err == 0 && snapname != NULL) {
+		dsl_dataset_t *ds;
+
+		if (*snapname++ != '@') {
+			dsl_dataset_rele(*dsp, tag);
+			dsl_dir_rele(dd, FTAG);
+			return (SET_ERROR(ENOENT));
+		}
+
+		dprintf("looking for snapshot '%s'\n", snapname);
+		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		if (err == 0)
+			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+		dsl_dataset_rele(*dsp, tag);
+
+		if (err == 0) {
+			mutex_enter(&ds->ds_lock);
+			if (ds->ds_snapname[0] == 0)
+				(void) strlcpy(ds->ds_snapname, snapname,
+				    sizeof (ds->ds_snapname));
+			mutex_exit(&ds->ds_lock);
+			*dsp = ds;
+		}
+	}
+
+	dsl_dir_rele(dd, FTAG);
+	return (err);
+}
+
+int
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
+    void *tag, dsl_dataset_t **dsp)
+{
+	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+	if (err != 0)
+		return (err);
+	if (!dsl_dataset_tryown(*dsp, tag)) {
+		dsl_dataset_rele(*dsp, tag);
+		*dsp = NULL;
+		return (SET_ERROR(EBUSY));
+	}
+	return (0);
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name,
+    void *tag, dsl_dataset_t **dsp)
+{
+	int err = dsl_dataset_hold(dp, name, tag, dsp);
+	if (err != 0)
+		return (err);
+	if (!dsl_dataset_tryown(*dsp, tag)) {
+		dsl_dataset_rele(*dsp, tag);
+		return (SET_ERROR(EBUSY));
+	}
+	return (0);
+}
+
+/*
+ * See the comment above dsl_pool_hold() for details.  In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called.  After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+	(void) refcount_add(&ds->ds_longholds, tag);
+}
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+	(void) refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+	return (!refcount_is_zero(&ds->ds_longholds));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+	if (ds == NULL) {
+		(void) strcpy(name, "mos");
+	} else {
+		dsl_dir_name(ds->ds_dir, name);
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		if (ds->ds_snapname[0]) {
+			(void) strcat(name, "@");
+			/*
+			 * We use a "recursive" mutex so that we
+			 * can call dprintf_ds() with ds_lock held.
+			 */
+			if (!MUTEX_HELD(&ds->ds_lock)) {
+				mutex_enter(&ds->ds_lock);
+				(void) strcat(name, ds->ds_snapname);
+				mutex_exit(&ds->ds_lock);
+			} else {
+				(void) strcat(name, ds->ds_snapname);
+			}
+		}
+	}
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+	dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
+{
+	ASSERT3P(ds->ds_owner, ==, tag);
+	ASSERT(ds->ds_dbuf != NULL);
+
+	mutex_enter(&ds->ds_lock);
+	ds->ds_owner = NULL;
+	mutex_exit(&ds->ds_lock);
+	dsl_dataset_long_rele(ds, tag);
+	dsl_dataset_rele(ds, tag);
+}
+
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
+{
+	boolean_t gotit = FALSE;
+
+	mutex_enter(&ds->ds_lock);
+	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
+		ds->ds_owner = tag;
+		dsl_dataset_long_hold(ds, tag);
+		gotit = TRUE;
+	}
+	mutex_exit(&ds->ds_lock);
+	return (gotit);
+}
+
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+	uint64_t zero = 0;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	spa_feature_incr(spa, f, tx);
+	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+	    sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+	spa_feature_decr(spa, f, tx);
+}
+
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+    uint64_t flags, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dd->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (origin == NULL)
+		origin = dp->dp_origin_snap;
+
+	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	bzero(dsphys, sizeof (dsl_dataset_phys_t));
+	dsphys->ds_dir_obj = dd->dd_object;
+	dsphys->ds_flags = flags;
+	dsphys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_snapnames_zapobj =
+	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+	    DMU_OT_NONE, 0, tx);
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
+
+	if (origin == NULL) {
+		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+	} else {
+		spa_feature_t f;
+		dsl_dataset_t *ohds; /* head of the origin snapshot */
+
+		dsphys->ds_prev_snap_obj = origin->ds_object;
+		dsphys->ds_prev_snap_txg =
+		    dsl_dataset_phys(origin)->ds_creation_txg;
+		dsphys->ds_referenced_bytes =
+		    dsl_dataset_phys(origin)->ds_referenced_bytes;
+		dsphys->ds_compressed_bytes =
+		    dsl_dataset_phys(origin)->ds_compressed_bytes;
+		dsphys->ds_uncompressed_bytes =
+		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+
+		/*
+		 * Inherit flags that describe the dataset's contents
+		 * (INCONSISTENT) or properties (Case Insensitive).
+		 */
+		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+		for (f = 0; f < SPA_FEATURES; f++) {
+			if (origin->ds_feature_inuse[f])
+				dsl_dataset_activate_feature(dsobj, f, tx);
+		}
+
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		dsl_dataset_phys(origin)->ds_num_children++;
+
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+		    FTAG, &ohds));
+		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+		dsl_dataset_rele(ohds, FTAG);
+
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+				dsl_dataset_phys(origin)->ds_next_clones_obj =
+				    zap_create(mos,
+				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+			}
+			VERIFY0(zap_add_int(mos,
+			    dsl_dataset_phys(origin)->ds_next_clones_obj,
+			    dsobj, tx));
+		}
+
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+				dsl_dir_phys(origin->ds_dir)->dd_clones =
+				    zap_create(mos,
+				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+			}
+			VERIFY0(zap_add_int(mos,
+			    dsl_dir_phys(origin->ds_dir)->dd_clones,
+			    dsobj, tx));
+		}
+	}
+
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+	dmu_buf_rele(dbuf, FTAG);
+
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
+
+	return (dsobj);
+}
+
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *os;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+	dsl_dataset_dirty(ds, tx);
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = pdd->dd_pool;
+	uint64_t dsobj, ddobj;
+	dsl_dir_t *dd;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(lastname[0] != '@');
+
+	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+	dsobj = dsl_dataset_create_sync_dd(dd, origin,
+	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
+
+	dsl_deleg_set_create_perms(dd, tx, cr);
+
+	/*
+	 * Since we're creating a new node we know it's a leaf, so we can
+	 * initialize the counts if the limit feature is active.
+	 */
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		uint64_t cnt = 0;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		dsl_dir_zapify(dd, tx);
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+	}
+
+	dsl_dir_rele(dd, FTAG);
+
+	/*
+	 * If we are creating a clone, make sure we zero out any stale
+	 * data from the origin snapshots zil header.
+	 */
+	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		dsl_dataset_zero_zil(ds, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+
+	return (dsobj);
+}
+
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+	uint64_t mrs_used;
+	uint64_t dlused, dlcomp, dluncomp;
+
+	ASSERT(!ds->ds_is_snapshot);
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
+	else
+		mrs_used = 0;
+
+	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+	ASSERT3U(dlused, <=, mrs_used);
+	dsl_dataset_phys(ds)->ds_unique_bytes =
+	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	    SPA_VERSION_UNIQUE_ACCURATE)
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	int err;
+	ASSERTV(uint64_t count);
+
+	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    obj, tx);
+	/*
+	 * The err should not be ENOENT, but a bug in a previous version
+	 * of the code could cause upgrade_clones_cb() to not set
+	 * ds_next_snap_obj when it should, leading to a missing entry.
+	 * If we knew that the pool was created after
+	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+	 * ENOENT.  However, at least we can check that we don't have
+	 * too many entries in the next_clones_obj even after failing to
+	 * remove this one.
+	 */
+	if (err != ENOENT)
+		VERIFY0(err);
+	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    &count));
+	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+	return (&dsl_dataset_phys(ds)->ds_bp);
+}
+
+void
+dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	/* If it's the meta-objset, set dp_meta_rootbp */
+	if (ds == NULL) {
+		tx->tx_pool->dp_meta_rootbp = *bp;
+	} else {
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		dsl_dataset_phys(ds)->ds_bp = *bp;
+	}
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+	return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp;
+
+	if (ds == NULL) /* this is the meta-objset */
+		return;
+
+	ASSERT(ds->ds_objset != NULL);
+
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
+		panic("dirtying snapshot!");
+
+	dp = ds->ds_dir->dd_pool;
+
+	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+		/* up the hold count until we can be written out */
+		dmu_buf_add_ref(ds->ds_dbuf, ds);
+	}
+}
+
+boolean_t
+dsl_dataset_is_dirty(dsl_dataset_t *ds)
+{
+	int t;
+
+	for (t = 0; t < TXG_SIZE; t++) {
+		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+		    ds, t))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	uint64_t asize;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	/*
+	 * If there's an fs-only reservation, any blocks that might become
+	 * owned by the snapshot dataset must be accommodated by space
+	 * outside of the reservation.
+	 */
+	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
+	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+		return (SET_ERROR(ENOSPC));
+
+	/*
+	 * Propagate any reserved space for this snapshot to other
+	 * snapshot checks in this sync group.
+	 */
+	if (asize > 0)
+		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+	return (0);
+}
+
+typedef struct dsl_dataset_snapshot_arg {
+	nvlist_t *ddsa_snaps;
+	nvlist_t *ddsa_props;
+	nvlist_t *ddsa_errors;
+	cred_t *ddsa_cr;
+} dsl_dataset_snapshot_arg_t;
+
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
+{
+	int error;
+	uint64_t value;
+
+	ds->ds_trysnap_txg = tx->tx_txg;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	/*
+	 * We don't allow multiple snapshots of the same txg.  If there
+	 * is already one, try again.
+	 */
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+		return (SET_ERROR(EAGAIN));
+
+	/*
+	 * Check for conflicting snapshot name.
+	 */
+	error = dsl_dataset_snap_lookup(ds, snapname, &value);
+	if (error == 0)
+		return (SET_ERROR(EEXIST));
+	if (error != ENOENT)
+		return (error);
+
+	/*
+	 * We don't allow taking snapshots of inconsistent datasets, such as
+	 * those into which we are currently receiving.  However, if we are
+	 * creating this snapshot as part of a receive, this check will be
+	 * executed atomically with respect to the completion of the receive
+	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+	 * case we ignore this, knowing it will be fixed up for us shortly in
+	 * dmu_recv_end_sync().
+	 */
+	if (!recv && DS_IS_INCONSISTENT(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Skip the check for temporary snapshots or if we have already checked
+	 * the counts in dsl_dataset_snapshot_check. This means we really only
+	 * check the count here when we're receiving a stream.
+	 */
+	if (cnt != 0 && cr != NULL) {
+		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (error != 0)
+		return (error);
+
+	return (0);
+}
+
+static int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int rv = 0;
+
+	/*
+	 * Pre-compute how many total new snapshots will be created for each
+	 * level in the tree and below. This is needed for validating the
+	 * snapshot limit when either taking a recursive snapshot or when
+	 * taking multiple snapshots.
+	 *
+	 * The problem is that the counts are not actually adjusted when
+	 * we are checking, only when we finally sync. For a single snapshot,
+	 * this is easy, the count will increase by 1 at each node up the tree,
+	 * but its more complicated for the recursive/multiple snapshot case.
+	 *
+	 * The dsl_fs_ss_limit_check function does recursively check the count
+	 * at each level up the tree but since it is validating each snapshot
+	 * independently we need to be sure that we are validating the complete
+	 * count for the entire set of snapshots. We do this by rolling up the
+	 * counts for each component of the name into an nvlist and then
+	 * checking each of those cases with the aggregated count.
+	 *
+	 * This approach properly handles not only the recursive snapshot
+	 * case (where we get all of those on the ddsa_snaps list) but also
+	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
+	 * validate the limit on 'a' using a count of 2).
+	 *
+	 * We validate the snapshot names in the third loop and only report
+	 * name errors once.
+	 */
+	if (dmu_tx_is_syncing(tx)) {
+		char *nm;
+		nvlist_t *cnt_track = NULL;
+		cnt_track = fnvlist_alloc();
+
+		nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+		/* Rollup aggregated counts into the cnt_track list */
+		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+		    pair != NULL;
+		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+			char *pdelim;
+			uint64_t val;
+
+			(void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
+			pdelim = strchr(nm, '@');
+			if (pdelim == NULL)
+				continue;
+			*pdelim = '\0';
+
+			do {
+				if (nvlist_lookup_uint64(cnt_track, nm,
+				    &val) == 0) {
+					/* update existing entry */
+					fnvlist_add_uint64(cnt_track, nm,
+					    val + 1);
+				} else {
+					/* add to list */
+					fnvlist_add_uint64(cnt_track, nm, 1);
+				}
+
+				pdelim = strrchr(nm, '/');
+				if (pdelim != NULL)
+					*pdelim = '\0';
+			} while (pdelim != NULL);
+		}
+
+		kmem_free(nm, MAXPATHLEN);
+
+		/* Check aggregated counts at each level */
+		for (pair = nvlist_next_nvpair(cnt_track, NULL);
+		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+			int error = 0;
+			char *name;
+			uint64_t cnt = 0;
+			dsl_dataset_t *ds;
+
+			name = nvpair_name(pair);
+			cnt = fnvpair_value_uint64(pair);
+			ASSERT(cnt > 0);
+
+			error = dsl_dataset_hold(dp, name, FTAG, &ds);
+			if (error == 0) {
+				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+				    ddsa->ddsa_cr);
+				dsl_dataset_rele(ds, FTAG);
+			}
+
+			if (error != 0) {
+				if (ddsa->ddsa_errors != NULL)
+					fnvlist_add_int32(ddsa->ddsa_errors,
+					    name, error);
+				rv = error;
+				/* only report one error for this check */
+				break;
+			}
+		}
+		nvlist_free(cnt_track);
+	}
+
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		int error = 0;
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[MAXNAMELEN];
+
+		name = nvpair_name(pair);
+		if (strlen(name) >= MAXNAMELEN)
+			error = SET_ERROR(ENAMETOOLONG);
+		if (error == 0) {
+			atp = strchr(name, '@');
+			if (atp == NULL)
+				error = SET_ERROR(EINVAL);
+			if (error == 0)
+				(void) strlcpy(dsname, name, atp - name + 1);
+		}
+		if (error == 0)
+			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+		if (error == 0) {
+			/* passing 0/NULL skips dsl_fs_ss_limit_check */
+			error = dsl_dataset_snapshot_check_impl(ds,
+			    atp + 1, tx, B_FALSE, 0, NULL);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error != 0) {
+			if (ddsa->ddsa_errors != NULL) {
+				fnvlist_add_int32(ddsa->ddsa_errors,
+				    name, error);
+			}
+			rv = error;
+		}
+	}
+
+	return (rv);
+}
+
+void
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dmu_buf_t *dbuf;
+	dsl_dataset_phys_t *dsphys;
+	uint64_t dsobj, crtxg;
+	objset_t *mos = dp->dp_meta_objset;
+	spa_feature_t f;
+	ASSERTV(static zil_header_t zero_zil);
+	ASSERTV(objset_t *os);
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/*
+	 * If we are on an old pool, the zil must not be active, in which
+	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
+	 */
+	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+	    dmu_objset_from_ds(ds, &os) != 0 ||
+	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
+	    sizeof (zero_zil)) == 0);
+
+	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
+	/*
+	 * The origin's ds_creation_txg has to be < TXG_INITIAL
+	 */
+	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+		crtxg = 1;
+	else
+		crtxg = tx->tx_txg;
+
+	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	dmu_buf_will_dirty(dbuf, tx);
+	dsphys = dbuf->db_data;
+	bzero(dsphys, sizeof (dsl_dataset_phys_t));
+	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+	dsphys->ds_fsid_guid = unique_create();
+	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+	    sizeof (dsphys->ds_guid));
+	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+	dsphys->ds_next_snap_obj = ds->ds_object;
+	dsphys->ds_num_children = 1;
+	dsphys->ds_creation_time = gethrestime_sec();
+	dsphys->ds_creation_txg = crtxg;
+	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+	dsphys->ds_uncompressed_bytes =
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+	dmu_buf_rele(dbuf, FTAG);
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f])
+			dsl_dataset_activate_feature(dsobj, f, tx);
+	}
+
+	ASSERT3U(ds->ds_prev != 0, ==,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+	if (ds->ds_prev) {
+		uint64_t next_clones_obj =
+		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object ||
+		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object) {
+			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
+		} else if (next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
+			    dsphys->ds_next_snap_obj, tx);
+			VERIFY0(zap_add_int(mos,
+			    next_clones_obj, dsobj, tx));
+		}
+	}
+
+	/*
+	 * If we have a reference-reservation on this dataset, we will
+	 * need to increase the amount of refreservation being charged
+	 * since our unique space is going to zero.
+	 */
+	if (ds->ds_reserved) {
+		int64_t delta;
+		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+		    ds->ds_reserved);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+		    delta, 0, 0, tx);
+	}
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
+	dsl_deadlist_add_key(&ds->ds_deadlist,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    snapname, 8, 1, &dsobj, tx));
+
+	if (ds->ds_prev)
+		dsl_dataset_rele(ds->ds_prev, ds);
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
+
+	dsl_scan_ds_snapshotted(ds, tx);
+
+	dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
+}
+
+static void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[MAXNAMELEN];
+
+		name = nvpair_name(pair);
+		atp = strchr(name, '@');
+		(void) strlcpy(dsname, name, atp - name + 1);
+		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+		if (ddsa->ddsa_props != NULL) {
+			dsl_props_set_sync_impl(ds->ds_prev,
+			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+		}
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+{
+	dsl_dataset_snapshot_arg_t ddsa;
+	nvpair_t *pair;
+	boolean_t needsuspend;
+	int error;
+	spa_t *spa;
+	char *firstname;
+	nvlist_t *suspended = NULL;
+
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+	firstname = nvpair_name(pair);
+
+	error = spa_open(firstname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
+
+	if (needsuspend) {
+		suspended = fnvlist_alloc();
+		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(snaps, pair)) {
+			char fsname[MAXNAMELEN];
+			char *snapname = nvpair_name(pair);
+			char *atp;
+			void *cookie;
+
+			atp = strchr(snapname, '@');
+			if (atp == NULL) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
+			(void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+			error = zil_suspend(fsname, &cookie);
+			if (error != 0)
+				break;
+			fnvlist_add_uint64(suspended, fsname,
+			    (uintptr_t)cookie);
+		}
+	}
+
+	ddsa.ddsa_snaps = snaps;
+	ddsa.ddsa_props = props;
+	ddsa.ddsa_errors = errors;
+	ddsa.ddsa_cr = CRED();
+
+	if (error == 0) {
+		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+		    dsl_dataset_snapshot_sync, &ddsa,
+		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
+	}
+
+	if (suspended != NULL) {
+		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(suspended, pair)) {
+			zil_resume((void *)(uintptr_t)
+			    fnvpair_value_uint64(pair));
+		}
+		fnvlist_free(suspended);
+	}
+
+#ifdef _KERNEL
+	if (error == 0) {
+		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(snaps, pair)) {
+			char *snapname = nvpair_name(pair);
+			zvol_create_minors(snapname);
+		}
+	}
+#endif
+
+	return (error);
+}
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+	const char *ddsta_fsname;
+	const char *ddsta_snapname;
+	minor_t ddsta_cleanup_minor;
+	const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	/* NULL cred means no limit check for tmp snapshot */
+	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+	    tx, B_FALSE, 0, NULL);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+	    B_TRUE, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag)
+{
+	dsl_dataset_snapshot_tmp_arg_t ddsta;
+	int error;
+	spa_t *spa;
+	boolean_t needsuspend;
+	void *cookie;
+
+	ddsta.ddsta_fsname = fsname;
+	ddsta.ddsta_snapname = snapname;
+	ddsta.ddsta_cleanup_minor = cleanup_minor;
+	ddsta.ddsta_htag = htag;
+
+	error = spa_open(fsname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
+
+	if (needsuspend) {
+		error = zil_suspend(fsname, &cookie);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
+
+	if (needsuspend)
+		zil_resume(cookie);
+	return (error);
+}
+
+
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+	spa_feature_t f;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(ds->ds_objset != NULL);
+	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
+
+	/*
+	 * in case we had to change ds_fsid_guid when we opened it,
+	 * sync it out now.
+	 */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+
+	dmu_objset_sync(ds->ds_objset, zio, tx);
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_activation_needed[f]) {
+			if (ds->ds_feature_inuse[f])
+				continue;
+			dsl_dataset_activate_feature(ds->ds_object, f, tx);
+			ds->ds_feature_inuse[f] = B_TRUE;
+		}
+	}
+}
+
+static void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	uint64_t count = 0;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	nvlist_t *propval = fnvlist_alloc();
+	nvlist_t *val = fnvlist_alloc();
+
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+	/*
+	 * There may be missing entries in ds_next_clones_obj
+	 * due to a bug in a previous version of the code.
+	 * Only trust it if it has the right number of entries.
+	 */
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+		    &count));
+	}
+	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
+		goto fail;
+	for (zap_cursor_init(&zc, mos,
+	    dsl_dataset_phys(ds)->ds_next_clones_obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+		char buf[ZFS_MAXNAMELEN];
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		dsl_dir_name(clone->ds_dir, buf);
+		fnvlist_add_boolean(val, buf);
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
+fail:
+	nvlist_free(val);
+	nvlist_free(propval);
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	uint64_t refd, avail, uobjs, aobjs, ratio;
+	ASSERTV(dsl_pool_t *dp = ds->ds_dir->dd_pool);
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+	    dsl_dataset_phys(ds)->ds_compressed_bytes);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
+
+	if (ds->ds_is_snapshot) {
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+		    dsl_dataset_phys(ds)->ds_unique_bytes);
+		get_clones_stat(ds, nv);
+	} else {
+		dsl_dir_stats(ds->ds_dir, nv);
+	}
+
+	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+	    dsl_dataset_phys(ds)->ds_creation_time);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+	    dsl_dataset_phys(ds)->ds_creation_txg);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+	    ds->ds_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+	    ds->ds_reserved);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+	    dsl_dataset_phys(ds)->ds_guid);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+	    ds->ds_object);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+	    ds->ds_userrefs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		uint64_t written, comp, uncomp;
+		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+		dsl_dataset_t *prev;
+		int err;
+
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+		if (err == 0) {
+			err = dsl_dataset_space_written(prev, ds, &written,
+			    &comp, &uncomp);
+			dsl_dataset_rele(prev, FTAG);
+			if (err == 0) {
+				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+				    written);
+			}
+		}
+	}
+
+}
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	ASSERT(dsl_pool_config_held(dp));
+
+	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+	stat->dds_inconsistent =
+	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
+	stat->dds_origin[0] = '\0';
+	if (ds->ds_is_snapshot) {
+		stat->dds_is_snapshot = B_TRUE;
+		stat->dds_num_clones =
+		    dsl_dataset_phys(ds)->ds_num_children - 1;
+	} else {
+		stat->dds_is_snapshot = B_FALSE;
+		stat->dds_num_clones = 0;
+
+		if (dsl_dir_is_clone(ds->ds_dir)) {
+			dsl_dataset_t *ods;
+
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+			    FTAG, &ods));
+			dsl_dataset_name(ods, stat->dds_origin);
+			dsl_dataset_rele(ods, FTAG);
+		}
+	}
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+	return (ds->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+    uint64_t *refdbytesp, uint64_t *availbytesp,
+    uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
+	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+		*availbytesp +=
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+	if (ds->ds_quota != 0) {
+		/*
+		 * Adjust available bytes according to refquota
+		 */
+		if (*refdbytesp < ds->ds_quota)
+			*availbytesp = MIN(*availbytesp,
+			    ds->ds_quota - *refdbytesp);
+		else
+			*availbytesp = 0;
+	}
+	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
+boolean_t
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
+{
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+	if (snap == NULL)
+		return (B_FALSE);
+	if (dsl_dataset_phys(ds)->ds_bp.blk_birth >
+	    dsl_dataset_phys(snap)->ds_creation_txg) {
+		objset_t *os, *os_snap;
+		/*
+		 * It may be that only the ZIL differs, because it was
+		 * reset in the head.  Don't count that as being
+		 * modified.
+		 */
+		if (dmu_objset_from_ds(ds, &os) != 0)
+			return (B_TRUE);
+		if (dmu_objset_from_ds(snap, &os_snap) != 0)
+			return (B_TRUE);
+		return (bcmp(&os->os_phys->os_meta_dnode,
+		    &os_snap->os_phys->os_meta_dnode,
+		    sizeof (os->os_phys->os_meta_dnode)) != 0);
+	}
+	return (B_FALSE);
+}
+
+typedef struct dsl_dataset_rename_snapshot_arg {
+	const char *ddrsa_fsname;
+	const char *ddrsa_oldsnapname;
+	const char *ddrsa_newsnapname;
+	boolean_t ddrsa_recursive;
+	dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
+{
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	int error;
+	uint64_t val;
+
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	if (error != 0) {
+		/* ignore nonexistent snapshots */
+		return (error == ENOENT ? 0 : error);
+	}
+
+	/* new name should not exist */
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+	if (error == 0)
+		error = SET_ERROR(EEXIST);
+	else if (error == ENOENT)
+		error = 0;
+
+	/* dataset name + 1 for the "@" + the new snapshot name must fit */
+	if (dsl_dir_namelen(hds->ds_dir) + 1 +
+	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
+		error = SET_ERROR(ENAMETOOLONG);
+
+	return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+	if (error != 0)
+		return (error);
+
+	if (ddrsa->ddrsa_recursive) {
+		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
+		    DS_FIND_CHILDREN);
+	} else {
+		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+	}
+	dsl_dataset_rele(hds, FTAG);
+	return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
+{
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_dataset_t *ds;
+	uint64_t val;
+	dmu_tx_t *tx = ddrsa->ddrsa_tx;
+	int error;
+
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	ASSERT(error == 0 || error == ENOENT);
+	if (error == ENOENT) {
+		/* ignore nonexistent snapshots */
+		return (0);
+	}
+
+	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+	/* log before we change the name */
+	spa_history_log_internal_ds(ds, "rename", tx,
+	    "-> @%s", ddrsa->ddrsa_newsnapname);
+
+	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+	    B_FALSE));
+	mutex_enter(&ds->ds_lock);
+	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
+	mutex_exit(&ds->ds_lock);
+	VERIFY0(zap_add(dp->dp_meta_objset,
+	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+	ddrsa->ddrsa_tx = tx;
+	if (ddrsa->ddrsa_recursive) {
+		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+		    DS_FIND_CHILDREN));
+	} else {
+		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
+	}
+	dsl_dataset_rele(hds, FTAG);
+}
+
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+#ifdef _KERNEL
+	char *oldname, *newname;
+#endif
+	int error;
+
+	dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+	ddrsa.ddrsa_fsname = fsname;
+	ddrsa.ddrsa_oldsnapname = oldsnapname;
+	ddrsa.ddrsa_newsnapname = newsnapname;
+	ddrsa.ddrsa_recursive = recursive;
+
+	error = dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+	    dsl_dataset_rename_snapshot_sync, &ddrsa,
+	    1, ZFS_SPACE_CHECK_RESERVED);
+
+	if (error)
+	    return (SET_ERROR(error));
+
+#ifdef _KERNEL
+	oldname = kmem_asprintf("%s@%s", fsname, oldsnapname);
+	newname = kmem_asprintf("%s@%s", fsname, newsnapname);
+	zvol_rename_minors(oldname, newname);
+	strfree(newname);
+	strfree(oldname);
+#endif
+
+	return (0);
+}
+
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset.  We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here.  We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+	boolean_t held;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	if (owner != NULL) {
+		VERIFY3P(ds->ds_owner, ==, owner);
+		dsl_dataset_long_rele(ds, owner);
+	}
+
+	held = dsl_dataset_long_held(ds);
+
+	if (owner != NULL)
+		dsl_dataset_long_hold(ds, owner);
+
+	if (held)
+		return (SET_ERROR(EBUSY));
+
+	return (0);
+}
+
+typedef struct dsl_dataset_rollback_arg {
+	const char *ddra_fsname;
+	void *ddra_owner;
+	nvlist_t *ddra_result;
+} dsl_dataset_rollback_arg_t;
+
+static int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rollback_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int64_t unused_refres_delta;
+	int error;
+	nvpair_t *pair;
+	nvlist_t *proprequest, *bookmarks;
+
+	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	/* must not be a snapshot */
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* must have a most recent snapshot */
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* must not have any bookmarks after the most recent snapshot */
+	proprequest = fnvlist_alloc();
+	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+	bookmarks = fnvlist_alloc();
+	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
+	fnvlist_free(proprequest);
+	if (error != 0)
+		return (error);
+	for (pair = nvlist_next_nvpair(bookmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+		nvlist_t *valuenv =
+		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
+		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
+		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
+		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+			fnvlist_free(bookmarks);
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EEXIST));
+		}
+	}
+	fnvlist_free(bookmarks);
+
+	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	/*
+	 * Check if the snap we are rolling back to uses more than
+	 * the refquota.
+	 */
+	if (ds->ds_quota != 0 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	/*
+	 * When we do the clone swap, we will temporarily use more space
+	 * due to the refreservation (the head will no longer have any
+	 * unique space, so the entire amount of the refreservation will need
+	 * to be free).  We will immediately destroy the clone, freeing
+	 * this space, but the freeing happens over many txg's.
+	 */
+	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
+
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOSPC));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rollback_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds, *clone;
+	uint64_t cloneobj;
+	char namebuf[ZFS_MAXNAMELEN];
+
+	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+
+	dsl_dataset_name(ds->ds_prev, namebuf);
+	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
+	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+
+	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+	dsl_dataset_zero_zil(ds, tx);
+
+	dsl_destroy_head_sync_impl(clone, tx);
+
+	dsl_dataset_rele(clone, FTAG);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ *   succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted.  See
+ * notes above zfs_suspend_fs() for further details.
+ */
+int
+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
+{
+	dsl_dataset_rollback_arg_t ddra;
+
+	ddra.ddra_fsname = fsname;
+	ddra.ddra_owner = owner;
+	ddra.ddra_result = result;
+
+	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+	    dsl_dataset_rollback_sync, &ddra,
+	    1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+struct promotenode {
+	list_node_t link;
+	dsl_dataset_t *ds;
+};
+
+typedef struct dsl_dataset_promote_arg {
+	const char *ddpa_clonename;
+	dsl_dataset_t *ddpa_clone;
+	list_t shared_snaps, origin_snaps, clone_snaps;
+	dsl_dataset_t *origin_origin; /* origin of the origin */
+	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+	char *err_ds;
+	cred_t *cr;
+} dsl_dataset_promote_arg_t;
+
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+    void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+
+static int
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
+	int err;
+	uint64_t unused;
+	uint64_t ss_mv_cnt;
+	size_t max_snap_len;
+
+	err = promote_hold(ddpa, dp, FTAG);
+	if (err != 0)
+		return (err);
+
+	hds = ddpa->ddpa_clone;
+	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+		promote_rele(ddpa, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/*
+	 * Compute and check the amount of space to transfer.  Since this is
+	 * so expensive, don't do the preliminary check.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		promote_rele(ddpa, FTAG);
+		return (0);
+	}
+
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
+
+	/* compute origin's new unique space */
+	snap = list_tail(&ddpa->clone_snaps);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+	    &ddpa->unique, &unused, &unused);
+
+	/*
+	 * Walk the snapshots that we are moving
+	 *
+	 * Compute space to transfer.  Consider the incremental changes
+	 * to used by each snapshot:
+	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
+	 * So each snapshot gave birth to:
+	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
+	 * So a sequence would look like:
+	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+	 * Which simplifies to:
+	 * uN + kN + kN-1 + ... + k1 + k0
+	 * Note however, if we stop before we reach the ORIGIN we get:
+	 * uN + kN + kN-1 + ... + kM - uM-1
+	 */
+	ss_mv_cnt = 0;
+	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
+		uint64_t val, dlused, dlcomp, dluncomp;
+		dsl_dataset_t *ds = snap->ds;
+
+		ss_mv_cnt++;
+
+		/*
+		 * If there are long holds, we won't be able to evict
+		 * the objset.
+		 */
+		if (dsl_dataset_long_held(ds)) {
+			err = SET_ERROR(EBUSY);
+			goto out;
+		}
+
+		/* Check that the snapshot name does not conflict */
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		if (strlen(ds->ds_snapname) >= max_snap_len) {
+			err = SET_ERROR(ENAMETOOLONG);
+			goto out;
+		}
+		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
+		if (err == 0) {
+			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
+			err = SET_ERROR(EEXIST);
+			goto out;
+		}
+		if (err != ENOENT)
+			goto out;
+
+		/* The very first snapshot does not have a deadlist */
+		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
+			continue;
+
+		dsl_deadlist_space(&ds->ds_deadlist,
+		    &dlused, &dlcomp, &dluncomp);
+		ddpa->used += dlused;
+		ddpa->comp += dlcomp;
+		ddpa->uncomp += dluncomp;
+	}
+
+	/*
+	 * If we are a clone of a clone then we never reached ORIGIN,
+	 * so we need to subtract out the clone origin's used space.
+	 */
+	if (ddpa->origin_origin) {
+		ddpa->used -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+		ddpa->comp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
+		ddpa->uncomp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->
+		    ds_uncompressed_bytes;
+	}
+
+	/* Check that there is enough space and limit headroom here */
+	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
+	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
+	if (err != 0)
+		goto out;
+
+	/*
+	 * Compute the amounts of space that will be used by snapshots
+	 * after the promotion (for both origin and clone).  For each,
+	 * it is the amount of space that will be on all of their
+	 * deadlists (that was not born before their new origin).
+	 */
+	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+		uint64_t space;
+
+		/*
+		 * Note, typically this will not be a clone of a clone,
+		 * so dd_origin_txg will be < TXG_INITIAL, so
+		 * these snaplist_space() -> dsl_deadlist_space_range()
+		 * calls will be fast because they do not have to
+		 * iterate over all bps.
+		 */
+		snap = list_head(&ddpa->origin_snaps);
+		err = snaplist_space(&ddpa->shared_snaps,
+		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+		if (err != 0)
+			goto out;
+
+		err = snaplist_space(&ddpa->clone_snaps,
+		    snap->ds->ds_dir->dd_origin_txg, &space);
+		if (err != 0)
+			goto out;
+		ddpa->cloneusedsnap += space;
+	}
+	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+	    DD_FLAG_USED_BREAKDOWN) {
+		err = snaplist_space(&ddpa->origin_snaps,
+		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
+		    &ddpa->originusedsnap);
+		if (err != 0)
+			goto out;
+	}
+
+out:
+	promote_rele(ddpa, FTAG);
+	return (err);
+}
+
+static void
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
+	dsl_dataset_t *origin_head;
+	dsl_dir_t *dd;
+	dsl_dir_t *odd = NULL;
+	uint64_t oldnext_obj;
+	int64_t delta;
+
+	VERIFY0(promote_hold(ddpa, dp, FTAG));
+	hds = ddpa->ddpa_clone;
+
+	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
+
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
+	dd = hds->ds_dir;
+
+	snap = list_head(&ddpa->origin_snaps);
+	origin_head = snap->ds;
+
+	/*
+	 * We need to explicitly open odd, since origin_ds's dd will be
+	 * changing.
+	 */
+	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
+	    NULL, FTAG, &odd));
+
+	/* change origin's next snap */
+	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
+	snap = list_tail(&ddpa->clone_snaps);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
+
+	/* change the origin's next clone */
+	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
+		dsl_dataset_remove_from_next_clones(origin_ds,
+		    snap->ds->ds_object, tx);
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
+		    oldnext_obj, tx));
+	}
+
+	/* change origin */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
+	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
+	dmu_buf_will_dirty(odd->dd_dbuf, tx);
+	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
+	origin_head->ds_dir->dd_origin_txg =
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
+
+	/* change dd_clone entries */
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
+		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+		    hds->ds_object, tx));
+
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+		    origin_head->ds_object, tx));
+		if (dsl_dir_phys(dd)->dd_clones == 0) {
+			dsl_dir_phys(dd)->dd_clones =
+			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+			    DMU_OT_NONE, 0, tx);
+		}
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
+	}
+
+	/* move snapshots to this dir */
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
+		dsl_dataset_t *ds = snap->ds;
+
+		/*
+		 * Property callbacks are registered to a particular
+		 * dsl_dir.  Since ours is changing, evict the objset
+		 * so that they will be unregistered from the old dsl_dir.
+		 */
+		if (ds->ds_objset) {
+			dmu_objset_evict(ds->ds_objset);
+			ds->ds_objset = NULL;
+		}
+
+		/* move snap name entry */
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_snap_remove(origin_head,
+		    ds->ds_snapname, tx, B_TRUE));
+		VERIFY0(zap_add(dp->dp_meta_objset,
+		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
+		    8, 1, &ds->ds_object, tx));
+		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
+		/* change containing dsl_dir */
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
+		ASSERT3P(ds->ds_dir, ==, odd);
+		dsl_dir_rele(ds->ds_dir, ds);
+		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
+		    NULL, ds, &ds->ds_dir));
+
+		/* move any clone references */
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
+		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			zap_cursor_t zc;
+			zap_attribute_t za;
+
+			for (zap_cursor_init(&zc, dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj);
+			    zap_cursor_retrieve(&zc, &za) == 0;
+			    zap_cursor_advance(&zc)) {
+				dsl_dataset_t *cnds;
+				uint64_t o;
+
+				if (za.za_first_integer == oldnext_obj) {
+					/*
+					 * We've already moved the
+					 * origin's reference.
+					 */
+					continue;
+				}
+
+				VERIFY0(dsl_dataset_hold_obj(dp,
+				    za.za_first_integer, FTAG, &cnds));
+				o = dsl_dir_phys(cnds->ds_dir)->
+				    dd_head_dataset_obj;
+
+				VERIFY0(zap_remove_int(dp->dp_meta_objset,
+				    dsl_dir_phys(odd)->dd_clones, o, tx));
+				VERIFY0(zap_add_int(dp->dp_meta_objset,
+				    dsl_dir_phys(dd)->dd_clones, o, tx));
+				dsl_dataset_rele(cnds, FTAG);
+			}
+			zap_cursor_fini(&zc);
+		}
+
+		ASSERT(!dsl_prop_hascb(ds));
+	}
+
+	/*
+	 * Change space accounting.
+	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
+	 * both be valid, or both be 0 (resulting in delta == 0).  This
+	 * is true for each of {clone,origin} independently.
+	 */
+
+	delta = ddpa->cloneusedsnap -
+	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
+	ASSERT3S(delta, >=, 0);
+	ASSERT3U(ddpa->used, >=, delta);
+	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+	dsl_dir_diduse_space(dd, DD_USED_HEAD,
+	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
+
+	delta = ddpa->originusedsnap -
+	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
+	ASSERT3S(delta, <=, 0);
+	ASSERT3U(ddpa->used, >=, -delta);
+	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+	dsl_dir_diduse_space(odd, DD_USED_HEAD,
+	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
+
+	/* log history record */
+	spa_history_log_internal_ds(hds, "promote", tx, "");
+
+	dsl_dir_rele(odd, FTAG);
+	promote_rele(ddpa, FTAG);
+}
+
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive).  The list will be in reverse
+ * order (last_obj will be the list_head()).  If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
+static int
+snaplist_make(dsl_pool_t *dp,
+    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
+{
+	uint64_t obj = last_obj;
+
+	list_create(l, sizeof (struct promotenode),
+	    offsetof(struct promotenode, link));
+
+	while (obj != first_obj) {
+		dsl_dataset_t *ds;
+		struct promotenode *snap;
+		int err;
+
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+		ASSERT(err != ENOENT);
+		if (err != 0)
+			return (err);
+
+		if (first_obj == 0)
+			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
+
+		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
+		snap->ds = ds;
+		list_insert_tail(l, snap);
+		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	}
+
+	return (0);
+}
+
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
+{
+	struct promotenode *snap;
+
+	*spacep = 0;
+	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
+		*spacep += used;
+	}
+	return (0);
+}
+
+static void
+snaplist_destroy(list_t *l, void *tag)
+{
+	struct promotenode *snap;
+
+	if (l == NULL || !list_link_active(&l->list_head))
+		return;
+
+	while ((snap = list_tail(l)) != NULL) {
+		list_remove(l, snap);
+		dsl_dataset_rele(snap->ds, tag);
+		kmem_free(snap, sizeof (*snap));
+	}
+	list_destroy(l);
+}
+
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+{
+	int error;
+	dsl_dir_t *dd;
+	struct promotenode *snap;
+
+	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+	    &ddpa->ddpa_clone);
+	if (error != 0)
+		return (error);
+	dd = ddpa->ddpa_clone->ds_dir;
+
+	if (ddpa->ddpa_clone->ds_is_snapshot ||
+	    !dsl_dir_is_clone(dd)) {
+		dsl_dataset_rele(ddpa->ddpa_clone, tag);
+		return (SET_ERROR(EINVAL));
+	}
+
+	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
+	    &ddpa->shared_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+	    &ddpa->clone_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	snap = list_head(&ddpa->shared_snaps);
+	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
+	    &ddpa->origin_snaps, tag);
+	if (error != 0)
+		goto out;
+
+	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
+		error = dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
+		    tag, &ddpa->origin_origin);
+		if (error != 0)
+			goto out;
+	}
+out:
+	if (error != 0)
+		promote_rele(ddpa, tag);
+	return (error);
+}
+
+static void
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+{
+	snaplist_destroy(&ddpa->shared_snaps, tag);
+	snaplist_destroy(&ddpa->clone_snaps, tag);
+	snaplist_destroy(&ddpa->origin_snaps, tag);
+	if (ddpa->origin_origin != NULL)
+		dsl_dataset_rele(ddpa->origin_origin, tag);
+	dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
+
+/*
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name.  (It must be at least MAXNAMELEN bytes long.)
+ */
+int
+dsl_dataset_promote(const char *name, char *conflsnap)
+{
+	dsl_dataset_promote_arg_t ddpa = { 0 };
+	uint64_t numsnaps;
+	int error;
+	objset_t *os;
+
+	/*
+	 * We will modify space proportional to the number of
+	 * snapshots.  Compute numsnaps.
+	 */
+	error = dmu_objset_hold(name, FTAG, &os);
+	if (error != 0)
+		return (error);
+	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+	    &numsnaps);
+	dmu_objset_rele(os, FTAG);
+	if (error != 0)
+		return (error);
+
+	ddpa.ddpa_clonename = name;
+	ddpa.err_ds = conflsnap;
+	ddpa.cr = CRED();
+
+	return (dsl_sync_task(name, dsl_dataset_promote_check,
+	    dsl_dataset_promote_sync, &ddpa,
+	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
+}
+
+int
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
+{
+	int64_t unused_refres_delta;
+
+	/* they should both be heads */
+	if (clone->ds_is_snapshot ||
+	    origin_head->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	/* if we are not forcing, the branch point should be just before them */
+	if (!force && clone->ds_prev != origin_head->ds_prev)
+		return (SET_ERROR(EINVAL));
+
+	/* clone should be the clone (unless they are unrelated) */
+	if (clone->ds_prev != NULL &&
+	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+	    origin_head->ds_dir != clone->ds_prev->ds_dir)
+		return (SET_ERROR(EINVAL));
+
+	/* the clone should be a child of the origin */
+	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+		return (SET_ERROR(EINVAL));
+
+	/* origin_head shouldn't be modified unless 'force' */
+	if (!force &&
+	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
+		return (SET_ERROR(ETXTBSY));
+
+	/* origin_head should have no long holds (e.g. is not mounted) */
+	if (dsl_dataset_handoff_check(origin_head, owner, tx))
+		return (SET_ERROR(EBUSY));
+
+	/* check amount of any unconsumed refreservation */
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
+
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+		return (SET_ERROR(ENOSPC));
+
+	/* clone can't be over the head's refquota */
+	if (origin_head->ds_quota != 0 &&
+	    dsl_dataset_phys(clone)->ds_referenced_bytes >
+	    origin_head->ds_quota)
+		return (SET_ERROR(EDQUOT));
+
+	return (0);
+}
+
+void
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx)
+{
+	spa_feature_t f;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int64_t unused_refres_delta;
+
+	ASSERT(clone->ds_reserved == 0);
+	ASSERT(origin_head->ds_quota == 0 ||
+	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota);
+	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+
+	/*
+	 * Swap per-dataset feature flags.
+	 */
+	for (f = 0; f < SPA_FEATURES; f++) {
+		boolean_t clone_inuse;
+		boolean_t origin_head_inuse;
+
+		if (!(spa_feature_table[f].fi_flags &
+		    ZFEATURE_FLAG_PER_DATASET)) {
+			ASSERT(!clone->ds_feature_inuse[f]);
+			ASSERT(!origin_head->ds_feature_inuse[f]);
+			continue;
+		}
+
+		clone_inuse = clone->ds_feature_inuse[f];
+		origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+		if (clone_inuse) {
+			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_deactivate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (clone_inuse) {
+			dsl_dataset_activate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_TRUE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_activate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_TRUE;
+		}
+	}
+
+	dmu_buf_will_dirty(clone->ds_dbuf, tx);
+	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+
+	if (clone->ds_objset != NULL) {
+		dmu_objset_evict(clone->ds_objset);
+		clone->ds_objset = NULL;
+	}
+
+	if (origin_head->ds_objset != NULL) {
+		dmu_objset_evict(origin_head->ds_objset);
+		origin_head->ds_objset = NULL;
+	}
+
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
+
+	/*
+	 * Reset origin's unique bytes, if it exists.
+	 */
+	if (clone->ds_prev) {
+		dsl_dataset_t *origin = clone->ds_prev;
+		uint64_t comp, uncomp;
+
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		dsl_deadlist_space_range(&clone->ds_deadlist,
+		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
+	}
+
+	/* swap blkptrs */
+	{
+		blkptr_t tmp;
+		tmp = dsl_dataset_phys(origin_head)->ds_bp;
+		dsl_dataset_phys(origin_head)->ds_bp =
+		    dsl_dataset_phys(clone)->ds_bp;
+		dsl_dataset_phys(clone)->ds_bp = tmp;
+	}
+
+	/* set dd_*_bytes */
+	{
+		int64_t dused, dcomp, duncomp;
+		uint64_t cdl_used, cdl_comp, cdl_uncomp;
+		uint64_t odl_used, odl_comp, odl_uncomp;
+
+		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
+		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+		dsl_deadlist_space(&clone->ds_deadlist,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space(&origin_head->ds_deadlist,
+		    &odl_used, &odl_comp, &odl_uncomp);
+
+		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+		    cdl_used -
+		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+		    odl_used);
+		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+		    cdl_comp -
+		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+		    odl_comp);
+		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
+		    cdl_uncomp -
+		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+		    odl_uncomp);
+
+		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+		    dused, dcomp, duncomp, tx);
+		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+		    -dused, -dcomp, -duncomp, tx);
+
+		/*
+		 * The difference in the space used by snapshots is the
+		 * difference in snapshot space due to the head's
+		 * deadlist (since that's the only thing that's
+		 * changing that affects the snapused).
+		 */
+		dsl_deadlist_space_range(&clone->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space_range(&origin_head->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &odl_used, &odl_comp, &odl_uncomp);
+		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+		    DD_USED_HEAD, DD_USED_SNAP, tx);
+	}
+
+	/* swap ds_*_bytes */
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+	    dsl_dataset_phys(clone)->ds_referenced_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+	    dsl_dataset_phys(clone)->ds_compressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
+
+	/* apply any parent delta for change in unconsumed refreservation */
+	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+	    unused_refres_delta, 0, 0, tx);
+
+	/*
+	 * Swap deadlists.
+	 */
+	dsl_deadlist_close(&clone->ds_deadlist);
+	dsl_deadlist_close(&origin_head->ds_deadlist);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
+	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
+	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+
+	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+	spa_history_log_internal_ds(clone, "clone swap", tx,
+	    "parent=%s", origin_head->ds_dir->dd_myname);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_pool_hold(pname, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+	if (error == 0) {
+		dsl_dataset_name(ds, buf);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_pool_rele(dp, FTAG);
+
+	return (error);
+}
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+{
+	int error = 0;
+
+	ASSERT3S(asize, >, 0);
+
+	/*
+	 * *ref_rsrv is the portion of asize that will come from any
+	 * unconsumed refreservation space.
+	 */
+	*ref_rsrv = 0;
+
+	mutex_enter(&ds->ds_lock);
+	/*
+	 * Make a space adjustment for reserved bytes.
+	 */
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+		ASSERT3U(*used, >=,
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*used -=
+		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*ref_rsrv =
+		    asize - MIN(asize, parent_delta(ds, asize + inflight));
+	}
+
+	if (!check_quota || ds->ds_quota == 0) {
+		mutex_exit(&ds->ds_lock);
+		return (0);
+	}
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota, they get to try again unless the actual
+	 * on-disk is over quota and there are no pending changes (which
+	 * may free up space for us).
+	 */
+	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+	    ds->ds_quota) {
+		if (inflight > 0 ||
+		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
+			error = SET_ERROR(ERESTART);
+		else
+			error = SET_ERROR(EDQUOT);
+	}
+	mutex_exit(&ds->ds_lock);
+
+	return (error);
+}
+
+typedef struct dsl_dataset_set_qr_arg {
+	const char *ddsqra_name;
+	zprop_source_t ddsqra_source;
+	uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
+
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+	uint64_t newval;
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+		return (SET_ERROR(ENOTSUP));
+
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	if (newval == 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+
+	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
+	    newval < ds->ds_reserved) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOSPC));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	uint64_t newval;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+	dsl_prop_set_sync_impl(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+	    &ddsqra->ddsqra_value, tx);
+
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
+
+	if (ds->ds_quota != newval) {
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_quota = newval;
+	}
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+    uint64_t refquota)
+{
+	dsl_dataset_set_qr_arg_t ddsqra;
+
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refquota;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+static int
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+	uint64_t newval, unique;
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+		return (SET_ERROR(ENOTSUP));
+
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
+
+	/*
+	 * If we are doing the preliminary check in open context, the
+	 * space estimates may be inaccurate.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+
+	mutex_enter(&ds->ds_lock);
+	if (!DS_UNIQUE_IS_ACCURATE(ds))
+		dsl_dataset_recalc_head_uniq(ds);
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+	mutex_exit(&ds->ds_lock);
+
+	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+		uint64_t delta = MAX(unique, newval) -
+		    MAX(unique, ds->ds_reserved);
+
+		if (delta >
+		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(ENOSPC));
+		}
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
+{
+	uint64_t newval;
+	uint64_t unique;
+	int64_t delta;
+
+	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    source, sizeof (value), 1, &value, tx);
+
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_dir->dd_lock);
+	mutex_enter(&ds->ds_lock);
+	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+	delta = MAX(0, (int64_t)(newval - unique)) -
+	    MAX(0, (int64_t)(ds->ds_reserved - unique));
+	ds->ds_reserved = newval;
+	mutex_exit(&ds->ds_lock);
+
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+	mutex_exit(&ds->ds_dir->dd_lock);
+}
+
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+	dsl_dataset_set_refreservation_sync_impl(ds,
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+    uint64_t refreservation)
+{
+	dsl_dataset_set_qr_arg_t ddsqra;
+
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refreservation;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+	    dsl_dataset_set_refreservation_sync, &ddsqra,
+	    0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap.  New may be a snapshot or the head.  Old must be
+ * a snapshot before new, in new's filesystem (or its origin).  If not then
+ * fail and return EINVAL.
+ *
+ * The written space is calculated by considering two components:  First, we
+ * ignore any freed space, and calculate the written as new's used space
+ * minus old's used space.  Next, we add in the amount of space that was freed
+ * between the two snapshots, thus reducing new's used space relative to old's.
+ * Specifically, this is the space that was born before old->ds_creation_txg,
+ * and freed before new (ie. on new's deadlist or a previous deadlist).
+ *
+ * space freed                         [---------------------]
+ * snapshots                       ---O-------O--------O-------O------
+ *                                         oldsnap            new
+ */
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	int err = 0;
+	uint64_t snapobj;
+	dsl_pool_t *dp = new->ds_dir->dd_pool;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	*usedp = 0;
+	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
+
+	*compp = 0;
+	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
+
+	*uncompp = 0;
+	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
+
+	snapobj = new->ds_object;
+	while (snapobj != oldsnap->ds_object) {
+		dsl_dataset_t *snap;
+		uint64_t used, comp, uncomp;
+
+		if (snapobj == new->ds_object) {
+			snap = new;
+		} else {
+			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+			if (err != 0)
+				break;
+		}
+
+		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
+			/*
+			 * The blocks in the deadlist can not be born after
+			 * ds_prev_snap_txg, so get the whole deadlist space,
+			 * which is more efficient (especially for old-format
+			 * deadlists).  Unfortunately the deadlist code
+			 * doesn't have enough information to make this
+			 * optimization itself.
+			 */
+			dsl_deadlist_space(&snap->ds_deadlist,
+			    &used, &comp, &uncomp);
+		} else {
+			dsl_deadlist_space_range(&snap->ds_deadlist,
+			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
+			    &used, &comp, &uncomp);
+		}
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+
+		/*
+		 * If we get to the beginning of the chain of snapshots
+		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
+		 * was not a snapshot of/before new.
+		 */
+		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+		if (snap != new)
+			dsl_dataset_rele(snap, FTAG);
+		if (snapobj == 0) {
+			err = SET_ERROR(EINVAL);
+			break;
+		}
+
+	}
+	return (err);
+}
+
+/*
+ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
+ * lastsnap, and all snapshots in between are deleted.
+ *
+ * blocks that would be freed            [---------------------------]
+ * snapshots                       ---O-------O--------O-------O--------O
+ *                                        firstsnap        lastsnap
+ *
+ * This is the set of blocks that were born after the snap before firstsnap,
+ * (birth > firstsnap->prev_snap_txg) and died before the snap after the
+ * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
+ * We calculate this by iterating over the relevant deadlists (from the snap
+ * after lastsnap, backward to the snap after firstsnap), summing up the
+ * space on the deadlist that was born after the snap before firstsnap.
+ */
+int
+dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
+    dsl_dataset_t *lastsnap,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	int err = 0;
+	uint64_t snapobj;
+	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
+
+	ASSERT(firstsnap->ds_is_snapshot);
+	ASSERT(lastsnap->ds_is_snapshot);
+
+	/*
+	 * Check that the snapshots are in the same dsl_dir, and firstsnap
+	 * is before lastsnap.
+	 */
+	if (firstsnap->ds_dir != lastsnap->ds_dir ||
+	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
+	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
+		return (SET_ERROR(EINVAL));
+
+	*usedp = *compp = *uncompp = 0;
+
+	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
+	while (snapobj != firstsnap->ds_object) {
+		dsl_dataset_t *ds;
+		uint64_t used, comp, uncomp;
+
+		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
+		if (err != 0)
+			break;
+
+		dsl_deadlist_space_range(&ds->ds_deadlist,
+		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
+		    &used, &comp, &uncomp);
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+
+		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		ASSERT3U(snapobj, !=, 0);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	return (err);
+}
+
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
+ * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
+ * filesystem.  Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
+ */
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+	uint64_t earlier_txg)
+{
+	dsl_pool_t *dp = later->ds_dir->dd_pool;
+	int error;
+	boolean_t ret;
+	dsl_dataset_t *origin;
+
+	ASSERT(dsl_pool_config_held(dp));
+	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
+
+	if (earlier_txg == 0)
+		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+	if (later->ds_is_snapshot &&
+	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
+		return (B_FALSE);
+
+	if (later->ds_dir == earlier->ds_dir)
+		return (B_TRUE);
+	if (!dsl_dir_is_clone(later->ds_dir))
+		return (B_FALSE);
+
+	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
+		return (B_TRUE);
+	error = dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
+	if (error != 0)
+		return (B_FALSE);
+	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
+	dsl_dataset_rele(origin, FTAG);
+	return (ret);
+}
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_LP64)
+module_param(zfs_max_recordsize, int, 0644);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#else
+/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
+module_param(zfs_max_recordsize, int, 0444);
+MODULE_PARM_DESC(zfs_max_recordsize, "Max allowed record size");
+#endif
+
+EXPORT_SYMBOL(dsl_dataset_hold);
+EXPORT_SYMBOL(dsl_dataset_hold_obj);
+EXPORT_SYMBOL(dsl_dataset_own);
+EXPORT_SYMBOL(dsl_dataset_own_obj);
+EXPORT_SYMBOL(dsl_dataset_name);
+EXPORT_SYMBOL(dsl_dataset_rele);
+EXPORT_SYMBOL(dsl_dataset_disown);
+EXPORT_SYMBOL(dsl_dataset_tryown);
+EXPORT_SYMBOL(dsl_dataset_create_sync);
+EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
+EXPORT_SYMBOL(dsl_dataset_snapshot_check);
+EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
+EXPORT_SYMBOL(dsl_dataset_promote);
+EXPORT_SYMBOL(dsl_dataset_user_hold);
+EXPORT_SYMBOL(dsl_dataset_user_release);
+EXPORT_SYMBOL(dsl_dataset_get_holds);
+EXPORT_SYMBOL(dsl_dataset_get_blkptr);
+EXPORT_SYMBOL(dsl_dataset_set_blkptr);
+EXPORT_SYMBOL(dsl_dataset_get_spa);
+EXPORT_SYMBOL(dsl_dataset_modified_since_snap);
+EXPORT_SYMBOL(dsl_dataset_space_written);
+EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
+EXPORT_SYMBOL(dsl_dataset_sync);
+EXPORT_SYMBOL(dsl_dataset_block_born);
+EXPORT_SYMBOL(dsl_dataset_block_kill);
+EXPORT_SYMBOL(dsl_dataset_block_freeable);
+EXPORT_SYMBOL(dsl_dataset_prev_snap_txg);
+EXPORT_SYMBOL(dsl_dataset_dirty);
+EXPORT_SYMBOL(dsl_dataset_stats);
+EXPORT_SYMBOL(dsl_dataset_fast_stat);
+EXPORT_SYMBOL(dsl_dataset_space);
+EXPORT_SYMBOL(dsl_dataset_fsid_guid);
+EXPORT_SYMBOL(dsl_dsobj_to_dsname);
+EXPORT_SYMBOL(dsl_dataset_check_quota);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);
+#endif
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c
index e45f46d8da42..d0015d1bd97b 100644
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -560,7 +560,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	struct killarg *ka = arg;
 	dmu_tx_t *tx = ka->tx;
 
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (zb->zb_level == ZB_ZIL_LEVEL) {
diff --git a/module/zfs/dsl_destroy.c.orig b/module/zfs/dsl_destroy.c.orig
new file mode 100644
index 000000000000..e45f46d8da42
--- /dev/null
+++ b/module/zfs/dsl_destroy.c.orig
@@ -0,0 +1,996 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+
+typedef struct dmu_snapshots_destroy_arg {
+	nvlist_t *dsda_snaps;
+	nvlist_t *dsda_successful_snaps;
+	boolean_t dsda_defer;
+	nvlist_t *dsda_errlist;
+} dmu_snapshots_destroy_arg_t;
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (dsl_dataset_long_held(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Only allow deferred destroy on pools that support it.
+	 * NOTE: deferred destroy is only supported on snapshots.
+	 */
+	if (defer) {
+		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+		    SPA_VERSION_USERREFS)
+			return (SET_ERROR(ENOTSUP));
+		return (0);
+	}
+
+	/*
+	 * If this snapshot has an elevated user reference count,
+	 * we can't destroy it yet.
+	 */
+	if (ds->ds_userrefs > 0)
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Can't delete a branch point.
+	 */
+	if (dsl_dataset_phys(ds)->ds_num_children > 1)
+		return (SET_ERROR(EEXIST));
+
+	return (0);
+}
+
+static int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int error = 0;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		error = dsl_dataset_hold(dp, nvpair_name(pair),
+		    FTAG, &ds);
+
+		/*
+		 * If the snapshot does not exist, silently ignore it
+		 * (it's "already destroyed").
+		 */
+		if (error == ENOENT)
+			continue;
+
+		if (error == 0) {
+			error = dsl_destroy_snapshot_check_impl(ds,
+			    dsda->dsda_defer);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error == 0) {
+			fnvlist_add_boolean(dsda->dsda_successful_snaps,
+			    nvpair_name(pair));
+		} else {
+			fnvlist_add_int32(dsda->dsda_errlist,
+			    nvpair_name(pair), error);
+		}
+	}
+
+	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
+	if (pair != NULL)
+		return (fnvpair_value_int32(pair));
+
+	return (0);
+}
+
+struct process_old_arg {
+	dsl_dataset_t *ds;
+	dsl_dataset_t *ds_prev;
+	boolean_t after_branch_point;
+	zio_t *pio;
+	uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct process_old_arg *poa = arg;
+	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+		if (poa->ds_prev && !poa->after_branch_point &&
+		    bp->blk_birth >
+		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+			    bp_get_dsize_sync(dp->dp_spa, bp);
+		}
+	} else {
+		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+		poa->comp += BP_GET_PSIZE(bp);
+		poa->uncomp += BP_GET_UCSIZE(bp);
+		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+	}
+	return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+	struct process_old_arg poa = { 0 };
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t deadlist_obj;
+
+	ASSERT(ds->ds_deadlist.dl_oldfmt);
+	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+	poa.ds = ds;
+	poa.ds_prev = ds_prev;
+	poa.after_branch_point = after_branch_point;
+	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+	    process_old_cb, &poa, tx));
+	VERIFY0(zio_wait(poa.pio));
+	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+	/* change snapused */
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+	    -poa.used, -poa.comp, -poa.uncomp, tx);
+
+	/* swap next's deadlist to our deadlist */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_close(&ds_next->ds_deadlist);
+	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
+	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
+}
+
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t *zc;
+	zap_attribute_t *za;
+
+	/*
+	 * If it is the old version, dd_clones doesn't exist so we can't
+	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+	 * doesn't matter.
+	 */
+	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
+		return;
+
+	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	for (zap_cursor_init(zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
+	    zap_cursor_retrieve(zc, za) == 0;
+	    zap_cursor_advance(zc)) {
+		dsl_dataset_t *clone;
+
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za->za_first_integer, FTAG, &clone));
+		if (clone->ds_dir->dd_origin_txg > mintxg) {
+			dsl_deadlist_remove_key(&clone->ds_deadlist,
+			    mintxg, tx);
+			dsl_dataset_remove_clones_key(clone, mintxg, tx);
+		}
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(zc);
+
+	kmem_free(za, sizeof (zap_attribute_t));
+	kmem_free(zc, sizeof (zap_cursor_t));
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+#ifdef ZFS_DEBUG
+	int err;
+#endif
+	spa_feature_t f;
+	int after_branch_point = FALSE;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_t *ds_prev = NULL;
+	uint64_t obj, old_unique, used = 0, comp = 0, uncomp = 0;
+	dsl_dataset_t *ds_next, *ds_head, *hds;
+
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT(refcount_is_zero(&ds->ds_longholds));
+
+	if (defer &&
+	    (ds->ds_userrefs > 0 ||
+	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
+		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
+		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
+		return;
+	}
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	obj = ds->ds_object;
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		ASSERT3P(ds->ds_prev, ==, NULL);
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
+		after_branch_point =
+		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
+
+		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+		if (after_branch_point &&
+		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+				VERIFY0(zap_add_int(mos,
+				    dsl_dataset_phys(ds_prev)->
+				    ds_next_clones_obj,
+				    dsl_dataset_phys(ds)->ds_next_snap_obj,
+				    tx));
+			}
+		}
+		if (!after_branch_point) {
+			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
+		}
+	}
+
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
+
+	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
+
+	if (ds_next->ds_deadlist.dl_oldfmt) {
+		process_old_deadlist(ds, ds_prev, ds_next,
+		    after_branch_point, tx);
+	} else {
+		/* Adjust prev's unique space. */
+		if (ds_prev && !after_branch_point) {
+			dsl_deadlist_space_range(&ds_next->ds_deadlist,
+			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+			    &used, &comp, &uncomp);
+			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
+		}
+
+		/* Adjust snapused. */
+		dsl_deadlist_space_range(&ds_next->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
+		    &used, &comp, &uncomp);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+		    -used, -comp, -uncomp, tx);
+
+		/* Move blocks to be freed to pool's free list. */
+		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    tx);
+		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+		    DD_USED_HEAD, used, comp, uncomp, tx);
+
+		/* Merge our deadlist into next's and free it. */
+		dsl_deadlist_merge(&ds_next->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	}
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+	/* Collapse range in clone heads */
+	dsl_dataset_remove_clones_key(ds,
+	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
+
+	if (ds_next->ds_is_snapshot) {
+		dsl_dataset_t *ds_nextnext;
+
+		/*
+		 * Update next's unique to include blocks which
+		 * were previously shared by only this snapshot
+		 * and it.  Those blocks will be born after the
+		 * prev snap and before this snap, and will have
+		 * died after the next snap and before the one
+		 * after that (ie. be on the snap after next's
+		 * deadlist).
+		 */
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+		    FTAG, &ds_nextnext));
+		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    dsl_dataset_phys(ds)->ds_creation_txg,
+		    &used, &comp, &uncomp);
+		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
+		dsl_dataset_rele(ds_nextnext, FTAG);
+		ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+		/* Collapse range in this head. */
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
+		dsl_deadlist_remove_key(&hds->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
+		dsl_dataset_rele(hds, FTAG);
+
+	} else {
+		ASSERT3P(ds_next->ds_prev, ==, ds);
+		dsl_dataset_rele(ds_next->ds_prev, ds_next);
+		ds_next->ds_prev = NULL;
+		if (ds_prev) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+			    ds_next, &ds_next->ds_prev));
+		}
+
+		dsl_dataset_recalc_head_uniq(ds_next);
+
+		/*
+		 * Reduce the amount of our unconsumed refreservation
+		 * being charged to our parent by the amount of
+		 * new unique data we have gained.
+		 */
+		if (old_unique < ds_next->ds_reserved) {
+			int64_t mrsdelta;
+			uint64_t new_unique =
+			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+			ASSERT(old_unique <= new_unique);
+			mrsdelta = MIN(new_unique - old_unique,
+			    ds_next->ds_reserved - old_unique);
+			dsl_dir_diduse_space(ds->ds_dir,
+			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+		}
+	}
+	dsl_dataset_rele(ds_next, FTAG);
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* remove from snapshot namespace */
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
+	VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+	{
+		uint64_t val;
+
+		err = dsl_dataset_snap_lookup(ds_head,
+		    ds->ds_snapname, &val);
+		ASSERT0(err);
+		ASSERT3U(val, ==, obj);
+	}
+#endif
+	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
+	dsl_dataset_rele(ds_head, FTAG);
+
+	if (ds_prev != NULL)
+		dsl_dataset_rele(ds_prev, FTAG);
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		ASSERTV(uint64_t count);
+		ASSERT0(zap_count(mos,
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+		    count == 0);
+		VERIFY0(dmu_object_free(mos,
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
+	}
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+		    tx));
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    tx));
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	dmu_object_free_zapified(mos, obj, tx);
+}
+
+static void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+
+		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps().  To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+    nvlist_t *errlist)
+{
+	dmu_snapshots_destroy_arg_t dsda;
+	int error;
+	nvpair_t *pair;
+
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dsda.dsda_snaps = snaps;
+	VERIFY0(nvlist_alloc(&dsda.dsda_successful_snaps,
+	    NV_UNIQUE_NAME, KM_SLEEP));
+	dsda.dsda_defer = defer;
+	dsda.dsda_errlist = errlist;
+
+	error = dsl_sync_task(nvpair_name(pair),
+	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
+	    &dsda, 0, ZFS_SPACE_CHECK_NONE);
+	fnvlist_free(dsda.dsda_successful_snaps);
+
+	return (error);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+	int error;
+	nvlist_t *nvl = fnvlist_alloc();
+	nvlist_t *errlist = fnvlist_alloc();
+
+	fnvlist_add_boolean(nvl, name);
+	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+	fnvlist_free(errlist);
+	fnvlist_free(nvl);
+	return (error);
+}
+
+struct killarg {
+	dsl_dataset_t *ds;
+	dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	struct killarg *ka = arg;
+	dmu_tx_t *tx = ka->tx;
+
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+
+	if (zb->zb_level == ZB_ZIL_LEVEL) {
+		ASSERT(zilog != NULL);
+		/*
+		 * It's a block in the intent log.  It has no
+		 * accounting, so just free it.
+		 */
+		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+	} else {
+		ASSERT(zilog == NULL);
+		ASSERT3U(bp->blk_birth, >,
+		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
+		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+	}
+
+	return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	struct killarg ka;
+
+	/*
+	 * Free everything that we point to (that's born after
+	 * the previous snapshot, if we are a clone)
+	 *
+	 * NB: this should be very quick, because we already
+	 * freed all the objects in open context.
+	 */
+	ka.ds = ds;
+	ka.tx = tx;
+	VERIFY0(traverse_dataset(ds,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
+	    kill_blkptr, &ka));
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
+}
+
+typedef struct dsl_destroy_head_arg {
+	const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+	int error;
+	uint64_t count;
+	objset_t *mos;
+
+	ASSERT(!ds->ds_is_snapshot);
+	if (ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (refcount_count(&ds->ds_longholds) != expected_holds)
+		return (SET_ERROR(EBUSY));
+
+	mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+	/*
+	 * Can't delete a head dataset if there are snapshots of it.
+	 * (Except if the only snapshots are from the branch we cloned
+	 * from.)
+	 */
+	if (ds->ds_prev != NULL &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Can't delete if there are children of this fs.
+	 */
+	error = zap_count(mos,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
+	if (error != 0)
+		return (error);
+	if (count != 0)
+		return (SET_ERROR(EEXIST));
+
+	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0) {
+		/* We need to remove the origin snapshot as well. */
+		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
+			return (SET_ERROR(EBUSY));
+	}
+	return (0);
+}
+
+static int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	error = dsl_destroy_head_check_impl(ds, 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	dd_used_t t;
+
+	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+	/*
+	 * Decrement the filesystem count for all parent filesystems.
+	 *
+	 * When we receive an incremental stream into a filesystem that already
+	 * exists, a temporary clone is created.  We never count this temporary
+	 * clone, whose name begins with a '%'.
+	 */
+	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+
+	/*
+	 * Remove our reservation. The impl() routine avoids setting the
+	 * actual property, which would require the (already destroyed) ds.
+	 */
+	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
+	for (t = 0; t < DD_USED_NUM; t++)
+		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
+	VERIFY0(zap_remove(mos,
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+	    dd->dd_myname, tx));
+
+	dsl_dir_rele(dd, FTAG);
+	dmu_object_free_zapified(mos, ddobj, tx);
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	spa_feature_t f;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t obj, ddobj, prevobj = 0;
+	boolean_t rmorigin;
+	objset_t *os;
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+	ASSERT(ds->ds_prev == NULL ||
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0);
+
+	/* Remove our reservation. */
+	if (ds->ds_reserved != 0) {
+		dsl_dataset_set_refreservation_sync_impl(ds,
+		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+		    0, tx);
+		ASSERT0(ds->ds_reserved);
+	}
+
+	obj = ds->ds_object;
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		/* This is a clone */
+		ASSERT(ds->ds_prev != NULL);
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+		    obj);
+		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
+
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
+			    obj, tx);
+		}
+
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
+	}
+
+	/*
+	 * Destroy the deadlist.  Unless it's a clone, the
+	 * deadlist should be empty.  (If it's a clone, it's
+	 * safe to ignore the deadlist contents.)
+	 */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		old_synchronous_dataset_destroy(ds, tx);
+	} else {
+		/*
+		 * Move the bptree into the pool's list of trees to
+		 * clean up and update space accounting information.
+		 */
+		uint64_t used, comp, uncomp;
+
+		zil_destroy_sync(dmu_objset_zil(os), tx);
+
+		if (!spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_ASYNC_DESTROY)) {
+			dsl_scan_t *scn = dp->dp_scan;
+			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+			    tx);
+			dp->dp_bptree_obj = bptree_alloc(mos, tx);
+			VERIFY0(zap_add(mos,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+			    &dp->dp_bptree_obj, tx));
+			ASSERT(!scn->scn_async_destroying);
+			scn->scn_async_destroying = B_TRUE;
+		}
+
+		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+		bptree_add(mos, dp->dp_bptree_obj,
+		    &dsl_dataset_phys(ds)->ds_bp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    used, comp, uncomp, tx);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+		    -used, -comp, -uncomp, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    used, comp, uncomp, tx);
+	}
+
+	if (ds->ds_prev != NULL) {
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			VERIFY0(zap_remove_int(mos,
+			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
+			    ds->ds_object, tx));
+		}
+		prevobj = ds->ds_prev->ds_object;
+		dsl_dataset_rele(ds->ds_prev, ds);
+		ds->ds_prev = NULL;
+	}
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* Erase the link in the dir */
+	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
+	ddobj = ds->ds_dir->dd_object;
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+	VERIFY0(zap_destroy(mos,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+
+	if (ds->ds_bookmarks != 0) {
+		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+	}
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	dmu_object_free_zapified(mos, obj, tx);
+
+	dsl_dir_destroy_sync(ddobj, tx);
+
+	if (rmorigin) {
+		dsl_dataset_t *prev;
+		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+		dsl_dataset_rele(prev, FTAG);
+	}
+}
+
+static void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+	dsl_destroy_head_sync_impl(ds, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+	/* Mark it as inconsistent on-disk, in case we crash */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+	dsl_destroy_head_arg_t ddha;
+	int error;
+	spa_t *spa;
+	boolean_t isenabled;
+
+#ifdef _KERNEL
+	zfs_destroy_unmount_origin(name);
+#endif
+
+	error = spa_open(name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+	spa_close(spa, FTAG);
+
+	ddha.ddha_name = name;
+
+	if (!isenabled) {
+		objset_t *os;
+
+		error = dsl_sync_task(name, dsl_destroy_head_check,
+		    dsl_destroy_head_begin_sync, &ddha,
+		    0, ZFS_SPACE_CHECK_NONE);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Head deletion is processed in one txg on old pools;
+		 * remove the objects from open context so that the txg sync
+		 * is not too long.
+		 */
+		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+		if (error == 0) {
+			uint64_t obj;
+			uint64_t prev_snap_txg =
+			    dsl_dataset_phys(dmu_objset_ds(os))->
+			    ds_prev_snap_txg;
+			for (obj = 0; error == 0;
+			    error = dmu_object_next(os, &obj, FALSE,
+			    prev_snap_txg))
+				(void) dmu_free_long_object(os, obj);
+			/* sync out all frees */
+			txg_wait_synced(dmu_objset_pool(os), 0);
+			dmu_objset_disown(os, FTAG);
+		}
+	}
+
+	return (dsl_sync_task(name, dsl_destroy_head_check,
+	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find().  We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+	objset_t *os;
+
+	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+		boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+		dmu_objset_rele(os, FTAG);
+		if (inconsistent)
+			(void) dsl_destroy_head(dsname);
+	}
+	return (0);
+}
+
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(dsl_destroy_head);
+EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);
+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
+EXPORT_SYMBOL(dsl_destroy_inconsistent);
+EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
+EXPORT_SYMBOL(dsl_destroy_head_check_impl);
+#endif
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index b989e763386b..295b8df8bf44 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -619,7 +619,8 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
-		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+		if (zbookmark_subtree_completed(dnp, zb,
+		    &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
diff --git a/module/zfs/dsl_scan.c.orig b/module/zfs/dsl_scan.c.orig
new file mode 100644
index 000000000000..b989e763386b
--- /dev/null
+++ b/module/zfs/dsl_scan.c.orig
@@ -0,0 +1,1907 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+    const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+
+int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
+int zfs_resilver_delay = 2;		/* number of ticks to delay resilver */
+int zfs_scrub_delay = 4;		/* number of ticks to delay scrub */
+int zfs_scan_idle = 50;			/* idle window in clock ticks */
+
+int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
+/* max number of blocks to free in a single TXG */
+ulong zfs_free_max_blocks = 100000;
+
+#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+	NULL,
+	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
+	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+	int err;
+	dsl_scan_t *scn;
+	spa_t *spa = dp->dp_spa;
+	uint64_t f;
+
+	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+	scn->scn_dp = dp;
+
+	/*
+	 * It's possible that we're resuming a scan after a reboot so
+	 * make sure that the scan_async_destroying flag is initialized
+	 * appropriately.
+	 */
+	ASSERT(!scn->scn_async_destroying);
+	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+	    SPA_FEATURE_ASYNC_DESTROY);
+
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    "scrub_func", sizeof (uint64_t), 1, &f);
+	if (err == 0) {
+		/*
+		 * There was an old-style scrub in progress.  Restart a
+		 * new-style scrub from the beginning.
+		 */
+		scn->scn_restart_txg = txg;
+		zfs_dbgmsg("old-style scrub was in progress; "
+		    "restarting new-style scrub in txg %llu",
+		    scn->scn_restart_txg);
+
+		/*
+		 * Load the queue obj from the old location so that it
+		 * can be freed by dsl_scan_done().
+		 */
+		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    "scrub_queue", sizeof (uint64_t), 1,
+		    &scn->scn_phys.scn_queue_obj);
+	} else {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+		    &scn->scn_phys);
+		/*
+		 * Detect if the pool contains the signature of #2094.  If it
+		 * does properly update the scn->scn_phys structure and notify
+		 * the administrator by setting an errata for the pool.
+		 */
+		if (err == EOVERFLOW) {
+			uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
+			VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
+			VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
+			    (23 * sizeof (uint64_t)));
+
+			err = zap_lookup(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
+			    sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
+			if (err == 0) {
+				uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
+
+				if (overflow & ~DSL_SCAN_FLAGS_MASK ||
+				    scn->scn_async_destroying) {
+					spa->spa_errata =
+					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
+					return (EOVERFLOW);
+				}
+
+				bcopy(zaptmp, &scn->scn_phys,
+				    SCAN_PHYS_NUMINTS * sizeof (uint64_t));
+				scn->scn_phys.scn_flags = overflow;
+
+				/* Required scrub already in progress. */
+				if (scn->scn_phys.scn_state == DSS_FINISHED ||
+				    scn->scn_phys.scn_state == DSS_CANCELED)
+					spa->spa_errata =
+					    ZPOOL_ERRATA_ZOL_2094_SCRUB;
+			}
+		}
+
+		if (err == ENOENT)
+			return (0);
+		else if (err)
+			return (err);
+
+		if (scn->scn_phys.scn_state == DSS_SCANNING &&
+		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+			/*
+			 * A new-type scrub was in progress on an old
+			 * pool, and the pool was accessed by old
+			 * software.  Restart from the beginning, since
+			 * the old software may have changed the pool in
+			 * the meantime.
+			 */
+			scn->scn_restart_txg = txg;
+			zfs_dbgmsg("new-style scrub was modified "
+			    "by old software; restarting in txg %llu",
+			    scn->scn_restart_txg);
+		}
+	}
+
+	spa_scan_stat_init(spa);
+	return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+	if (dp->dp_scan) {
+		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+		dp->dp_scan = NULL;
+	}
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	if (scn->scn_phys.scn_state == DSS_SCANNING)
+		return (SET_ERROR(EBUSY));
+
+	return (0);
+}
+
+static void
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+	pool_scan_func_t *funcp = arg;
+	dmu_object_type_t ot = 0;
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+
+	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+	scn->scn_phys.scn_func = *funcp;
+	scn->scn_phys.scn_state = DSS_SCANNING;
+	scn->scn_phys.scn_min_txg = 0;
+	scn->scn_phys.scn_max_txg = tx->tx_txg;
+	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+	scn->scn_phys.scn_start_time = gethrestime_sec();
+	scn->scn_phys.scn_errors = 0;
+	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+	scn->scn_restart_txg = 0;
+	scn->scn_done_txg = 0;
+	spa_scan_stat_init(spa);
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+		/* rewrite all disk labels */
+		vdev_config_dirty(spa->spa_root_vdev);
+
+		if (vdev_resilver_needed(spa->spa_root_vdev,
+		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+			spa_event_notify(spa, NULL,
+			    FM_EREPORT_ZFS_RESILVER_START);
+		} else {
+			spa_event_notify(spa, NULL,
+			    FM_EREPORT_ZFS_SCRUB_START);
+		}
+
+		spa->spa_scrub_started = B_TRUE;
+		/*
+		 * If this is an incremental scrub, limit the DDT scrub phase
+		 * to just the auto-ditto class (for correctness); the rest
+		 * of the scrub should go faster using top-down pruning.
+		 */
+		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+	}
+
+	/* back to the generic stuff */
+
+	if (dp->dp_blkstats == NULL) {
+		dp->dp_blkstats =
+		    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+	}
+	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+		ot = DMU_OT_ZAP_OTHER;
+
+	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+	dsl_scan_sync_state(scn, tx);
+
+	spa_history_log_internal(spa, "scan setup", tx,
+	    "func=%u mintxg=%llu maxtxg=%llu",
+	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+	static const char *old_names[] = {
+		"scrub_bookmark",
+		"scrub_ddt_bookmark",
+		"scrub_ddt_class_max",
+		"scrub_queue",
+		"scrub_min_txg",
+		"scrub_max_txg",
+		"scrub_func",
+		"scrub_errors",
+		NULL
+	};
+
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+	int i;
+
+	/* Remove any remnants of an old-style scrub. */
+	for (i = 0; old_names[i]; i++) {
+		(void) zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+	}
+
+	if (scn->scn_phys.scn_queue_obj != 0) {
+		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, tx));
+		scn->scn_phys.scn_queue_obj = 0;
+	}
+
+	/*
+	 * If we were "restarted" from a stopped state, don't bother
+	 * with anything else.
+	 */
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (complete)
+		scn->scn_phys.scn_state = DSS_FINISHED;
+	else
+		scn->scn_phys.scn_state = DSS_CANCELED;
+
+	spa_history_log_internal(spa, "scan done", tx,
+	    "complete=%u", complete);
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+		spa->spa_scrub_started = B_FALSE;
+		spa->spa_scrub_active = B_FALSE;
+
+		/*
+		 * If the scrub/resilver completed, update all DTLs to
+		 * reflect this.  Whether it succeeded or not, vacate
+		 * all temporary scrub DTLs.
+		 */
+		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+		if (complete) {
+			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+			    FM_EREPORT_ZFS_RESILVER_FINISH :
+			    FM_EREPORT_ZFS_SCRUB_FINISH);
+		}
+		spa_errlog_rotate(spa);
+
+		/*
+		 * We may have finished replacing a device.
+		 * Let the async thread assess this and handle the detach.
+		 */
+		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+	}
+
+	scn->scn_phys.scn_end_time = gethrestime_sec();
+
+	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+		spa->spa_errata = 0;
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return (SET_ERROR(ENOENT));
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	dsl_scan_done(scn, B_FALSE, tx);
+	dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx);
+inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
+    dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+	zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+	ASSERT(dsl_pool_sync_context(dp));
+	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+	if (ds->ds_is_snapshot)
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+	return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+	    &scn->scn_phys, tx));
+}
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+	uint64_t elapsed_nanosecs;
+	int mintime;
+	int dirty_pct;
+
+	/* we never skip user/group accounting objects */
+	if (zb && (int64_t)zb->zb_object < 0)
+		return (B_FALSE);
+
+	if (scn->scn_pausing)
+		return (B_TRUE); /* we're already pausing */
+
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
+		return (B_FALSE); /* we're resuming */
+
+	/* We only know how to resume from level-0 blocks. */
+	if (zb && zb->zb_level != 0)
+		return (B_FALSE);
+
+	/*
+	 * We pause if:
+	 *  - we have scanned for the maximum time: an entire txg
+	 *    timeout (default 5 sec)
+	 *  or
+	 *  - we have scanned for at least the minimum time (default 1 sec
+	 *    for scrub, 3 sec for resilver), and either we have sufficient
+	 *    dirty data that we are starting to write more quickly
+	 *    (default 30%), or someone is explicitly waiting for this txg
+	 *    to complete.
+	 *  or
+	 *  - the spa is shutting down because this pool is being exported
+	 *    or the machine is rebooting.
+	 */
+	mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
+	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
+	    (txg_sync_waiting(scn->scn_dp) ||
+	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+		if (zb) {
+			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			scn->scn_phys.scn_bookmark = *zb;
+		}
+		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		scn->scn_pausing = B_TRUE;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+	dsl_pool_t	*zsa_dp;
+	zil_header_t	*zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zil_scan_arg_t *zsa = arg;
+	dsl_pool_t *dp = zsa->zsa_dp;
+	dsl_scan_t *scn = dp->dp_scan;
+	zil_header_t *zh = zsa->zsa_zh;
+	zbookmark_phys_t zb;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return (0);
+
+	/*
+	 * One block ("stubby") can be allocated a long time ago; we
+	 * want to visit that one because it has been allocated
+	 * (on-disk) even if it hasn't been claimed (even though for
+	 * scrub there's nothing to do to it).
+	 */
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+		return (0);
+
+	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	if (lrc->lrc_txtype == TX_WRITE) {
+		zil_scan_arg_t *zsa = arg;
+		dsl_pool_t *dp = zsa->zsa_dp;
+		dsl_scan_t *scn = dp->dp_scan;
+		zil_header_t *zh = zsa->zsa_zh;
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_phys_t zb;
+
+		if (BP_IS_HOLE(bp) ||
+		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+			return (0);
+
+		/*
+		 * birth can be < claim_txg if this record's txg is
+		 * already txg sync'ed (but this log block contains
+		 * other records that are not synced)
+		 */
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return (0);
+
+		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	}
+	return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zil_scan_arg_t zsa = { dp, zh };
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+		return;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+	    claim_txg);
+
+	zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+    uint64_t objset, uint64_t object, uint64_t blkid)
+{
+	zbookmark_phys_t czb;
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	if (zfs_no_scrub_prefetch)
+		return;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+		return;
+
+	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+    const zbookmark_phys_t *zb)
+{
+	/*
+	 * We never skip over user/group accounting objects (obj<0)
+	 */
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
+	    (int64_t)zb->zb_object >= 0) {
+		/*
+		 * If we already visited this bp & everything below (in
+		 * a prior txg sync), don't bother doing it again.
+		 */
+		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+			return (B_TRUE);
+
+		/*
+		 * If we found the block we're trying to resume from, or
+		 * we went past it to a different object, zero it out to
+		 * indicate that it's OK to start checking for pausing
+		 * again.
+		 */
+		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+			dprintf("resuming at %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+		}
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Return nonzero on i/o error.
+ * Return new buf to write out in *bufp.
+ */
+inline __attribute__((always_inline)) static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+	int err;
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
+			    zb->zb_object, zb->zb_blkid * epb + i);
+		}
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			zbookmark_phys_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			dsl_scan_visitbp(cbp, &czb, dnp,
+			    ds, scn, ostype, tx);
+		}
+		(void) arc_buf_remove_ref(buf, &buf);
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		dnode_phys_t *cdnp;
+		int i, j;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			for (j = 0; j < cdnp->dn_nblkptr; j++) {
+				blkptr_t *cbp = &cdnp->dn_blkptr[j];
+				dsl_scan_prefetch(scn, buf, cbp,
+				    zb->zb_objset, zb->zb_blkid * epb + i, j);
+			}
+		}
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			dsl_scan_visitdnode(scn, ds, ostype,
+			    cdnp, zb->zb_blkid * epb + i, tx);
+		}
+
+		(void) arc_buf_remove_ref(buf, &buf);
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		objset_phys_t *osp;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+
+		osp = buf->b_data;
+
+		dsl_scan_visitdnode(scn, ds, osp->os_type,
+		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			/*
+			 * We also always visit user/group accounting
+			 * objects, and never skip them, even if we are
+			 * pausing.  This is necessary so that the space
+			 * deltas from this txg get integrated.
+			 */
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_groupused_dnode,
+			    DMU_GROUPUSED_OBJECT, tx);
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_userused_dnode,
+			    DMU_USERUSED_OBJECT, tx);
+		}
+		(void) arc_buf_remove_ref(buf, &buf);
+	}
+
+	return (0);
+}
+
+inline __attribute__((always_inline)) static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp,
+    uint64_t object, dmu_tx_t *tx)
+{
+	int j;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		zbookmark_phys_t czb;
+
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    dnp->dn_nlevels - 1, j);
+		dsl_scan_visitbp(&dnp->dn_blkptr[j],
+		    &czb, dnp, ds, scn, ostype, tx);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		zbookmark_phys_t czb;
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    0, DMU_SPILL_BLKID);
+		dsl_scan_visitbp(&dnp->dn_spill,
+		    &czb, dnp, ds, scn, ostype, tx);
+	}
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	blkptr_t *bp_toread;
+
+	bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+	*bp_toread = *bp;
+
+	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+	if (dsl_scan_check_pause(scn, zb))
+		goto out;
+
+	if (dsl_scan_check_resume(scn, dnp, zb))
+		goto out;
+
+	if (BP_IS_HOLE(bp))
+		goto out;
+
+	scn->scn_visited_this_txg++;
+
+	/*
+	 * This debugging is commented out to conserve stack space.  This
+	 * function is called recursively and the debugging addes several
+	 * bytes to the stack for each call.  It can be commented back in
+	 * if required to debug an issue in dsl_scan_visitbp().
+	 *
+	 * dprintf_bp(bp,
+	 *    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+	 *    ds, ds ? ds->ds_object : 0,
+	 *    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+	 *    bp);
+	 */
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		goto out;
+
+	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+		goto out;
+
+	/*
+	 * If dsl_scan_ddt() has aready visited this block, it will have
+	 * already done any translations or scrubbing, so don't call the
+	 * callback again.
+	 */
+	if (ddt_class_contains(dp->dp_spa,
+	    scn->scn_phys.scn_ddt_class_max, bp)) {
+		goto out;
+	}
+
+	/*
+	 * If this block is from the future (after cur_max_txg), then we
+	 * are doing this on behalf of a deleted snapshot, and we will
+	 * revisit the future block on the next pass of this dataset.
+	 * Don't scan it now unless we need to because something
+	 * under it was modified.
+	 */
+	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
+		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+	}
+out:
+	kmem_free(bp_toread, sizeof (blkptr_t));
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_tx_t *tx)
+{
+	zbookmark_phys_t zb;
+
+	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	dsl_scan_visitbp(bp, &zb, NULL,
+	    ds, scn, DMU_OST_NONE, tx);
+
+	dprintf_ds(ds, "finished scan%s", "");
+}
+
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		if (ds->ds_is_snapshot) {
+			/* Note, scn_cur_{min,max}_txg stays the same. */
+			scn->scn_phys.scn_bookmark.zb_objset =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset zb_objset to %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
+			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+		} else {
+			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+			    ZB_DESTROYED_OBJSET, 0, 0, 0);
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset bookmark to -1,0,0,0",
+			    (u_longlong_t)ds->ds_object);
+		}
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		if (ds->ds_is_snapshot) {
+			/*
+			 * We keep the same mintxg; it could be >
+			 * ds_creation_txg if the previous snapshot was
+			 * deleted too.
+			 */
+			VERIFY(zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    dsl_dataset_phys(ds)->ds_next_snap_obj,
+			    mintxg, tx) == 0);
+			zfs_dbgmsg("destroying ds %llu; in queue; "
+			    "replacing with %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
+		} else {
+			zfs_dbgmsg("destroying ds %llu; in queue; removing",
+			    (u_longlong_t)ds->ds_object);
+		}
+	} else {
+		zfs_dbgmsg("destroying ds %llu; ignoring",
+		    (u_longlong_t)ds->ds_object);
+	}
+
+	/*
+	 * dsl_scan_sync() should be called after this, and should sync
+	 * out our changed state, but just to be safe, do it here.
+	 */
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset =
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
+		zfs_dbgmsg("snapshotting ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+	}
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds1->ds_object, &mintxg) == 0) {
+		int err;
+
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+		err = zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+		VERIFY(err == 0 || err == EEXIST);
+		if (err == EEXIST) {
+			/* Both were there to begin with */
+			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    ds1->ds_object, mintxg, tx));
+		}
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+struct enqueue_clones_arg {
+	dmu_tx_t *tx;
+	uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+	struct enqueue_clones_arg *eca = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
+		return (0);
+
+	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+
+		dsl_dataset_rele(ds, FTAG);
+		if (err)
+			return (err);
+		ds = prev;
+	}
+	VERIFY(zap_add_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	dsl_dataset_t *ds;
+	objset_t *os;
+	char *dsname;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	if (dmu_objset_from_ds(ds, &os))
+		goto out;
+
+	/*
+	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
+	 * snapshots can have ZIL block pointers (which may be the same
+	 * BP as in the head), they must be ignored.  So we traverse the
+	 * ZIL here, rather than in scan_recurse(), because the regular
+	 * snapshot block-sharing rules don't apply to it.
+	 */
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
+		dsl_scan_zil(dp, &os->os_zil_header);
+
+	/*
+	 * Iterate over the bps in this ds.
+	 */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+
+	dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+	dsl_dataset_name(ds, dsname);
+	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+	    "pausing=%u",
+	    (longlong_t)dsobj, dsname,
+	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
+	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
+	    (int)scn->scn_pausing);
+	kmem_free(dsname, ZFS_MAXNAMELEN);
+
+	if (scn->scn_pausing)
+		goto out;
+
+	/*
+	 * We've finished this pass over this dataset.
+	 */
+
+	/*
+	 * If we did not completely visit this dataset, do another pass.
+	 */
+	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+		zfs_dbgmsg("incomplete pass; visiting again");
+		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
+		goto out;
+	}
+
+	/*
+	 * Add descendent datasets to work queue.
+	 */
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_next_snap_obj,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
+	}
+	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+		boolean_t usenext = B_FALSE;
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+			uint64_t count;
+			/*
+			 * A bug in a previous version of the code could
+			 * cause upgrade_clones_cb() to not set
+			 * ds_next_snap_obj when it should, leading to a
+			 * missing entry.  Therefore we can only use the
+			 * next_clones_obj when its count is correct.
+			 */
+			int err = zap_count(dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
+			if (err == 0 &&
+			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
+				usenext = B_TRUE;
+		}
+
+		if (usenext) {
+			VERIFY0(zap_join_key(dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj,
+			    scn->scn_phys.scn_queue_obj,
+			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
+		} else {
+			struct enqueue_clones_arg eca;
+			eca.tx = tx;
+			eca.originobj = ds->ds_object;
+
+			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
+		}
+	}
+
+out:
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		/*
+		 * If this is a clone, we don't need to worry about it for now.
+		 */
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_dataset_rele(prev, FTAG);
+			return (0);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+	}
+
+	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+	ddt_entry_t dde;
+	int error;
+	uint64_t n = 0;
+
+	bzero(&dde, sizeof (ddt_entry_t));
+
+	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+		ddt_t *ddt;
+
+		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+			break;
+		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+		    (longlong_t)ddb->ddb_class,
+		    (longlong_t)ddb->ddb_type,
+		    (longlong_t)ddb->ddb_checksum,
+		    (longlong_t)ddb->ddb_cursor);
+
+		/* There should be no pending changes to the dedup table */
+		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+		n++;
+
+		if (dsl_scan_check_pause(scn, NULL))
+			break;
+	}
+
+	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+	    (int)scn->scn_pausing);
+
+	ASSERT(error == 0 || error == ENOENT);
+	ASSERT(error != ENOENT ||
+	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	const ddt_key_t *ddk = &dde->dde_key;
+	ddt_phys_t *ddp = dde->dde_phys;
+	blkptr_t bp;
+	zbookmark_phys_t zb = { 0 };
+	int p;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0 ||
+		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+			continue;
+		ddt_bp_create(checksum, ddk, ddp, &bp);
+
+		scn->scn_visited_this_txg++;
+		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+	}
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	zap_cursor_t *zc;
+	zap_attribute_t *za;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_ddt(scn, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+		/* First do the MOS & ORIGIN */
+
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_visit_rootbp(scn, NULL,
+		    &dp->dp_meta_rootbp, tx);
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+		if (scn->scn_pausing)
+			return;
+
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+			    enqueue_cb, tx, DS_FIND_CHILDREN));
+		} else {
+			dsl_scan_visitds(scn,
+			    dp->dp_origin_snap->ds_object, tx);
+		}
+		ASSERT(!scn->scn_pausing);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
+	    ZB_DESTROYED_OBJSET) {
+		/*
+		 * If we were paused, continue from here.  Note if the
+		 * ds we were paused on was deleted, the zb_objset may
+		 * be -1, so we will skip this and find a new objset
+		 * below.
+		 */
+		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	/*
+	 * In case we were paused right at the end of the ds, zero the
+	 * bookmark so we don't think that we're still trying to resume.
+	 */
+	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	/* keep pulling things out of the zap-object-as-queue */
+	while (zap_cursor_init(zc, dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj),
+	    zap_cursor_retrieve(zc, za) == 0) {
+		dsl_dataset_t *ds;
+		uint64_t dsobj;
+
+		dsobj = strtonum(za->za_name, NULL);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+		/* Set up min/max txg */
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		if (za->za_first_integer != 0) {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    za->za_first_integer);
+		} else {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+		}
+		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+		dsl_dataset_rele(ds, FTAG);
+
+		dsl_scan_visitds(scn, dsobj, tx);
+		zap_cursor_fini(zc);
+		if (scn->scn_pausing)
+			goto out;
+	}
+	zap_cursor_fini(zc);
+out:
+	kmem_free(za, sizeof (zap_attribute_t));
+	kmem_free(zc, sizeof (zap_cursor_t));
+}
+
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
+{
+	uint64_t elapsed_nanosecs;
+
+	if (zfs_recover)
+		return (B_FALSE);
+
+	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+		return (B_TRUE);
+
+	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
+	    txg_sync_waiting(scn->scn_dp)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg;
+
+	if (!scn->scn_is_bptree ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+		if (dsl_scan_free_should_pause(scn))
+			return (SET_ERROR(ERESTART));
+	}
+
+	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+	    dmu_tx_get_txg(tx), bp, 0));
+	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+	scn->scn_visited_this_txg++;
+	return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+	uint64_t used = 0, comp, uncomp;
+
+	if (spa->spa_load_state != SPA_LOAD_NONE)
+		return (B_FALSE);
+	if (spa_shutting_down(spa))
+		return (B_FALSE);
+	if (scn->scn_phys.scn_state == DSS_SCANNING ||
+	    (scn->scn_async_destroying && !scn->scn_async_stalled))
+		return (B_TRUE);
+
+	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+		    &used, &comp, &uncomp);
+	}
+	return (used != 0);
+}
+
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	spa_t *spa = dp->dp_spa;
+	int err = 0;
+
+	/*
+	 * Check for scn_restart_txg before checking spa_load_state, so
+	 * that we can restart an old-style scan while the pool is being
+	 * imported (see dsl_scan_init).
+	 */
+	if (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg) {
+		pool_scan_func_t func = POOL_SCAN_SCRUB;
+		dsl_scan_done(scn, B_FALSE, tx);
+		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+			func = POOL_SCAN_RESILVER;
+		zfs_dbgmsg("restarting scan func=%u txg=%llu",
+		    func, tx->tx_txg);
+		dsl_scan_setup_sync(&func, tx);
+	}
+
+	/*
+	 * If the scan is inactive due to a stalled async destroy, try again.
+	 */
+	if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
+	    spa_sync_pass(dp->dp_spa) > 1)
+		return;
+
+	scn->scn_visited_this_txg = 0;
+	scn->scn_pausing = B_FALSE;
+	scn->scn_sync_start_time = gethrtime();
+	spa->spa_scrub_active = B_TRUE;
+
+	/*
+	 * First process the async destroys.  If we pause, don't do
+	 * any scrubbing or resilvering.  This ensures that there are no
+	 * async destroys while we are scanning, so the scan code doesn't
+	 * have to worry about traversing it.  It is also faster to free the
+	 * blocks than to scrub them.
+	 */
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		scn->scn_is_bptree = B_FALSE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bpobj_iterate(&dp->dp_free_bpobj,
+		    dsl_scan_free_block_cb, scn, tx);
+		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+		if (err != 0 && err != ERESTART)
+			zfs_panic_recover("error %u from bpobj_iterate()", err);
+	}
+
+	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		ASSERT(scn->scn_async_destroying);
+		scn->scn_is_bptree = B_TRUE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bptree_iterate(dp->dp_meta_objset,
+		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+		VERIFY0(zio_wait(scn->scn_zio_root));
+
+		if (err == EIO || err == ECKSUM) {
+			err = 0;
+		} else if (err != 0 && err != ERESTART) {
+			zfs_panic_recover("error %u from "
+			    "traverse_dataset_destroyed()", err);
+		}
+
+		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+			/* finished; deactivate async destroy feature */
+			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY));
+			VERIFY0(zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY0(bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+			scn->scn_async_destroying = B_FALSE;
+			scn->scn_async_stalled = B_FALSE;
+		} else {
+			/*
+			 * If we didn't make progress, mark the async
+			 * destroy as stalled, so that we will not initiate
+			 * a spa_sync() on its behalf.  Note that we only
+			 * check this if we are not finished, because if the
+			 * bptree had no blocks for us to visit, we can
+			 * finish without "making progress".
+			 */
+			scn->scn_async_stalled =
+			    (scn->scn_visited_this_txg == 0);
+		}
+	}
+	if (scn->scn_visited_this_txg) {
+		zfs_dbgmsg("freed %llu blocks in %llums from "
+		    "free_bpobj/bptree txg %llu; err=%u",
+		    (longlong_t)scn->scn_visited_this_txg,
+		    (longlong_t)
+		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+		    (longlong_t)tx->tx_txg, err);
+		scn->scn_visited_this_txg = 0;
+
+		/*
+		 * Write out changes to the DDT that may be required as a
+		 * result of the blocks freed.  This ensures that the DDT
+		 * is clean when a scrub/resilver runs.
+		 */
+		ddt_sync(spa, tx->tx_txg);
+	}
+	if (err != 0)
+		return;
+	if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
+	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+		/*
+		 * We have finished background destroying, but there is still
+		 * some space left in the dp_free_dir. Transfer this leaked
+		 * space to the dp_leak_dir.
+		 */
+		if (dp->dp_leak_dir == NULL) {
+			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+			    LEAK_DIR_NAME, tx);
+			VERIFY0(dsl_pool_open_special_dir(dp,
+			    LEAK_DIR_NAME, &dp->dp_leak_dir));
+			rrw_exit(&dp->dp_config_rwlock, FTAG);
+		}
+		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+	}
+	if (!scn->scn_async_destroying) {
+		/* finished; verify that space accounting went to zero */
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+	}
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_done_txg == tx->tx_txg) {
+		ASSERT(!scn->scn_pausing);
+		/* finished with scan. */
+		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
+		dsl_scan_done(scn, B_TRUE, tx);
+		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
+		dsl_scan_sync_state(scn, tx);
+		return;
+	}
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		zfs_dbgmsg("doing scan sync txg %llu; "
+		    "ddt bm=%llu/%llu/%llu/%llx",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	} else {
+		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+	}
+
+	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+	    NULL, ZIO_FLAG_CANFAIL);
+	dsl_pool_config_enter(dp, FTAG);
+	dsl_scan_visit(scn, tx);
+	dsl_pool_config_exit(dp, FTAG);
+	(void) zio_wait(scn->scn_zio_root);
+	scn->scn_zio_root = NULL;
+
+	zfs_dbgmsg("visited %llu blocks in %llums",
+	    (longlong_t)scn->scn_visited_this_txg,
+	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+
+	if (!scn->scn_pausing) {
+		scn->scn_done_txg = tx->tx_txg + 1;
+		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
+		    tx->tx_txg, scn->scn_done_txg);
+	}
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+	if (txg == 0) {
+		dmu_tx_t *tx;
+		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+		txg = dmu_tx_get_txg(tx);
+		dp->dp_scan->scn_restart_txg = txg;
+		dmu_tx_commit(tx);
+	} else {
+		dp->dp_scan->scn_restart_txg = txg;
+	}
+	zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+	int i;
+
+	/*
+	 * If we resume after a reboot, zab will be NULL; don't record
+	 * incomplete stats in that case.
+	 */
+	if (zab == NULL)
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+		int equal;
+		zfs_blkstat_t *zb;
+
+		if (t & DMU_OT_NEWTYPE)
+			t = DMU_OT_OTHER;
+
+		zb = &zab->zab_type[l][t];
+		zb->zb_count++;
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_2_of_2_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal == 1)
+				zb->zb_ditto_2_of_3_samevdev++;
+			else if (equal == 3)
+				zb->zb_ditto_3_of_3_samevdev++;
+			break;
+		}
+	}
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (zio->io_error && (zio->io_error != ECKSUM ||
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	spa_t *spa = dp->dp_spa;
+	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	boolean_t needs_io = B_FALSE;
+	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+	int scan_delay = 0;
+	int d;
+
+	if (phys_birth <= scn->scn_phys.scn_min_txg ||
+	    phys_birth >= scn->scn_phys.scn_max_txg)
+		return (0);
+
+	count_block(dp->dp_blkstats, bp);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (0);
+
+	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+		zio_flags |= ZIO_FLAG_SCRUB;
+		needs_io = B_TRUE;
+		scan_delay = zfs_scrub_delay;
+	} else {
+		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+		zio_flags |= ZIO_FLAG_RESILVER;
+		needs_io = B_FALSE;
+		scan_delay = zfs_resilver_delay;
+	}
+
+	/* If it's an intent log block, failure is expected. */
+	if (zb->zb_level == ZB_ZIL_LEVEL)
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vdev_t *vd = vdev_lookup_top(spa,
+		    DVA_GET_VDEV(&bp->blk_dva[d]));
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+		/* if it's a resilver, this may not be in the target range */
+		if (!needs_io) {
+			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
+				 */
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    phys_birth, 1);
+			}
+		}
+	}
+
+	if (needs_io && !zfs_no_scrub_io) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+		void *data = zio_data_buf_alloc(size);
+
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= maxinflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		/*
+		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
+		 * then throttle our workload to limit the impact of a scan.
+		 */
+		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+			delay(scan_delay);
+
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
+		    zio_flags, zb));
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+	spa_t *spa = dp->dp_spa;
+
+	/*
+	 * Purge all vdev caches and probe all devices.  We do this here
+	 * rather than in sync context because this requires a writer lock
+	 * on the spa_config lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_vdev_state_enter(spa, SCL_NONE);
+	spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_top_maxinflight, int, 0644);
+MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
+
+module_param(zfs_resilver_delay, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
+
+module_param(zfs_scrub_delay, int, 0644);
+MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
+
+module_param(zfs_scan_idle, int, 0644);
+MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
+
+module_param(zfs_scan_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
+
+module_param(zfs_free_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
+
+module_param(zfs_resilver_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg");
+
+module_param(zfs_no_scrub_io, int, 0644);
+MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O");
+
+module_param(zfs_no_scrub_prefetch, int, 0644);
+MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
+
+module_param(zfs_free_max_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg");
+#endif
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 2e23a341fb13..69908546441d 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1921,7 +1921,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	size_t size;
 	void *data;
 
-	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 	/*
 	 * Note: normally this routine will not be called if
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index b3aa469bf45b..126fd6bee0b9 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -76,8 +76,8 @@ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
 
 	mutex_exit(sm->sm_lock);
 	if (end > bufsize) {
-		dmu_prefetch(sm->sm_os, space_map_object(sm), bufsize,
-		    end - bufsize);
+		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
 	}
 	mutex_enter(sm->sm_lock);
 
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index c5ea392b6a1d..5a08123ebdde 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -162,8 +162,9 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
@@ -939,7 +940,8 @@ fzap_prefetch(zap_name_t *zn)
 	if (zap_idx_to_blk(zap, idx, &blk) != 0)
 		return;
 	bs = FZAP_BLOCK_SHIFT(zap);
-	dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
@@ -1285,9 +1287,10 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 	} else {
 		int b;
 
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
-		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs);
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 
 		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 		    b++) {
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 944f0ad3ddb8..174c918ab0f6 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -2118,7 +2118,8 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
 
 		/* Prefetch znode */
 		if (prefetch) {
-			dmu_prefetch(os, objnum, 0, 0);
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
 		}
 
 		/*
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index a16266a40a94..b02f60abdd63 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -63,6 +63,9 @@ int zio_delay_max = ZIO_DELAY_MAX;
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
+#define	BP_SPANB(indblkshift, level) \
+	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
@@ -3436,39 +3439,129 @@ static zio_pipe_stage_t *zio_pipeline[] = {
 	zio_done
 };
 
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
-    const zbookmark_phys_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
 
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb2->zb_level == 0);
 
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
 
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects.  Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (any value larger than a level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+	/*
+	 * These variables represent the "equivalent" values for the zbookmark,
+	 * after converting zbookmarks inside the meta dnode to their
+	 * normal-object equivalents.
+	 */
+	uint64_t zb1obj, zb2obj;
+	uint64_t zb1L0, zb2L0;
+	uint64_t zb1level, zb2level;
+
+	if (zb1->zb_object == zb2->zb_object &&
+	    zb1->zb_level == zb2->zb_level &&
+	    zb1->zb_blkid == zb2->zb_blkid)
+		return (0);
 
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+	/*
+	 * BP_SPANB calculates the span in blocks.
+	 */
+	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
+		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb1L0 = 0;
+		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb1obj = zb1->zb_object;
+		zb1level = zb1->zb_level;
 	}
 
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb2L0 = 0;
+		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb2obj = zb2->zb_object;
+		zb2level = zb2->zb_level;
+	}
+
+	/* Now that we have a canonical representation, do the comparison. */
+	if (zb1obj != zb2obj)
+		return (zb1obj < zb2obj ? -1 : 1);
+	else if (zb1L0 != zb2L0)
+		return (zb1L0 < zb2L0 ? -1 : 1);
+	else if (zb1level != zb2level)
+		return (zb1level > zb2level ? -1 : 1);
+	/*
+	 * This can (theoretically) happen if the bookmarks have the same object
+	 * and level, but different blkids, if the block sizes are not the same.
+	 * There is presently no way to change the indirect block sizes
+	 */
+	return (0);
+}
+
+/*
+ *  This function checks the following: given that last_block is the place that
+ *  our traversal stopped last time, does that guarantee that we've visited
+ *  every node under subtree_root?  Therefore, we can't just use the raw output
+ *  of zbookmark_compare.  We have to pass in a modified version of
+ *  subtree_root; by incrementing the block id, and then checking whether
+ *  last_block is before or equal to that, we can tell whether or not having
+ *  visited last_block implies that all of subtree_root's children have been
+ *  visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+	zbookmark_phys_t mod_zb = *subtree_root;
+	mod_zb.zb_blkid++;
+	ASSERT(last_block->zb_level == 0);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
 		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
+
+	/*
+	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+	 * data block size in sectors, because that variable is only used if
+	 * the bookmark refers to a block in the meta-dnode.  Since we don't
+	 * know without examining it what object it refers to, and there's no
+	 * harm in passing in this value in other cases, we always pass it in.
+	 *
+	 * We pass in 0 for the indirect block size shift because zb2 must be
+	 * level 0.  The indirect block size is only used to calculate the span
+	 * of the bookmark, but since the bookmark must be level 0, the span is
+	 * always 1, so the math works out.
+	 *
+	 * If you make changes to how the zbookmark_compare code works, be sure
+	 * to make sure that this code still works afterwards.
+	 */
+	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+	    last_block) <= 0);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
diff --git a/module/zfs/zio.c.orig b/module/zfs/zio.c.orig
new file mode 100644
index 000000000000..a16266a40a94
--- /dev/null
+++ b/module/zfs/zio.c.orig
@@ -0,0 +1,3498 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl"
+};
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+int zio_delay_max = ZIO_DELAY_MAX;
+
+#define	ZIO_PIPELINE_CONTINUE		0x100
+#define	ZIO_PIPELINE_STOP		0x101
+
+/*
+ * The following actions directly effect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+
+/*
+ * An allocating zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+int zio_requeue_io_start_cut_in_line = 1;
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
+
+static inline void __zio_execute(zio_t *zio);
+
+void
+zio_init(void)
+{
+	size_t c;
+	vmem_t *data_alloc_arena = NULL;
+
+	zio_cache = kmem_cache_create("zio_cache",
+	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	zio_link_cache = kmem_cache_create("zio_link_cache",
+	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+	/*
+	 * For small buffers, we want a cache for each multiple of
+	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+	 * for each quarter-power of 2.
+	 */
+	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+		size_t p2 = size;
+		size_t align = 0;
+		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
+
+#ifdef _ILP32
+		/*
+		 * Cache size limited to 1M on 32-bit platforms until ARC
+		 * buffers no longer require virtual address space.
+		 */
+		if (size > zfs_max_recordsize)
+			break;
+#endif
+
+		while (!ISP2(p2))
+			p2 &= p2 - 1;
+
+#ifndef _KERNEL
+		/*
+		 * If we are using watchpoints, put each buffer on its own page,
+		 * to eliminate the performance overhead of trapping to the
+		 * kernel when modifying a non-watched buffer that shares the
+		 * page with a watched buffer.
+		 */
+		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+			continue;
+#endif
+		if (size <= 4 * SPA_MINBLOCKSIZE) {
+			align = SPA_MINBLOCKSIZE;
+		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
+			align = MIN(p2 >> 2, PAGESIZE);
+		}
+
+		if (align != 0) {
+			char name[36];
+			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
+			zio_buf_cache[c] = kmem_cache_create(name, size,
+			    align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+			zio_data_buf_cache[c] = kmem_cache_create(name, size,
+			    align, NULL, NULL, NULL, NULL,
+			    data_alloc_arena, cflags);
+		}
+	}
+
+	while (--c != 0) {
+		ASSERT(zio_buf_cache[c] != NULL);
+		if (zio_buf_cache[c - 1] == NULL)
+			zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+		ASSERT(zio_data_buf_cache[c] != NULL);
+		if (zio_data_buf_cache[c - 1] == NULL)
+			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+	}
+
+	zio_inject_init();
+
+	lz4_init();
+}
+
+void
+zio_fini(void)
+{
+	size_t c;
+	kmem_cache_t *last_cache = NULL;
+	kmem_cache_t *last_data_cache = NULL;
+
+	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+#ifdef _ILP32
+		/*
+		 * Cache size limited to 1M on 32-bit platforms until ARC
+		 * buffers no longer require virtual address space.
+		 */
+		if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
+			break;
+#endif
+		if (zio_buf_cache[c] != last_cache) {
+			last_cache = zio_buf_cache[c];
+			kmem_cache_destroy(zio_buf_cache[c]);
+		}
+		zio_buf_cache[c] = NULL;
+
+		if (zio_data_buf_cache[c] != last_data_cache) {
+			last_data_cache = zio_data_buf_cache[c];
+			kmem_cache_destroy(zio_data_buf_cache[c]);
+		}
+		zio_data_buf_cache[c] = NULL;
+	}
+
+	kmem_cache_destroy(zio_link_cache);
+	kmem_cache_destroy(zio_cache);
+
+	zio_inject_fini();
+
+	lz4_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
+ * crashdump if the kernel panics.  This exists so that we will limit the amount
+ * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics)
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	kmem_cache_free(zio_data_buf_cache[c], buf);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+static void
+zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
+	zio_transform_func_t *transform)
+{
+	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+	zt->zt_orig_data = zio->io_data;
+	zt->zt_orig_size = zio->io_size;
+	zt->zt_bufsize = bufsize;
+	zt->zt_transform = transform;
+
+	zt->zt_next = zio->io_transform_stack;
+	zio->io_transform_stack = zt;
+
+	zio->io_data = data;
+	zio->io_size = size;
+}
+
+static void
+zio_pop_transforms(zio_t *zio)
+{
+	zio_transform_t *zt;
+
+	while ((zt = zio->io_transform_stack) != NULL) {
+		if (zt->zt_transform != NULL)
+			zt->zt_transform(zio,
+			    zt->zt_orig_data, zt->zt_orig_size);
+
+		if (zt->zt_bufsize != 0)
+			zio_buf_free(zio->io_data, zt->zt_bufsize);
+
+		zio->io_data = zt->zt_orig_data;
+		zio->io_size = zt->zt_orig_size;
+		zio->io_transform_stack = zt->zt_next;
+
+		kmem_free(zt, sizeof (zio_transform_t));
+	}
+}
+
+/*
+ * ==========================================================================
+ * I/O transform callbacks for subblocks and decompression
+ * ==========================================================================
+ */
+static void
+zio_subblock(zio_t *zio, void *data, uint64_t size)
+{
+	ASSERT(zio->io_size > size);
+
+	if (zio->io_type == ZIO_TYPE_READ)
+		bcopy(zio->io_data, data, size);
+}
+
+static void
+zio_decompress(zio_t *zio, void *data, uint64_t size)
+{
+	if (zio->io_error == 0 &&
+	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+	    zio->io_data, data, zio->io_size, size) != 0)
+		zio->io_error = SET_ERROR(EIO);
+}
+
+/*
+ * ==========================================================================
+ * I/O parent/child relationships and pipeline interlocks
+ * ==========================================================================
+ */
+/*
+ * NOTE - Callers to zio_walk_parents() and zio_walk_children must
+ *        continue calling these functions until they return NULL.
+ *        Otherwise, the next caller will pick up the list walk in
+ *        some indeterminate state.  (Otherwise every caller would
+ *        have to pass in a cookie to keep the state represented by
+ *        io_walk_link, which gets annoying.)
+ */
+zio_t *
+zio_walk_parents(zio_t *cio)
+{
+	zio_link_t *zl = cio->io_walk_link;
+	list_t *pl = &cio->io_parent_list;
+
+	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
+	cio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_child == cio);
+	return (zl->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio)
+{
+	zio_link_t *zl = pio->io_walk_link;
+	list_t *cl = &pio->io_child_list;
+
+	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
+	pio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_parent == pio);
+	return (zl->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
+{
+	zio_t *pio = zio_walk_parents(cio);
+
+	VERIFY(zio_walk_parents(cio) == NULL);
+	return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+	int w;
+
+	/*
+	 * Logical I/Os can have logical, gang, or vdev children.
+	 * Gang I/Os can have gang or vdev children.
+	 * Vdev I/Os can only have vdev children.
+	 * The following ASSERT captures all of these constraints.
+	 */
+	ASSERT(cio->io_child_type <= pio->io_child_type);
+
+	zl->zl_parent = pio;
+	zl->zl_child = cio;
+
+	mutex_enter(&cio->io_lock);
+	mutex_enter(&pio->io_lock);
+
+	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
+	for (w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+	list_insert_head(&pio->io_child_list, zl);
+	list_insert_head(&cio->io_parent_list, zl);
+
+	pio->io_child_count++;
+	cio->io_parent_count++;
+
+	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
+}
+
+static void
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
+{
+	ASSERT(zl->zl_parent == pio);
+	ASSERT(zl->zl_child == cio);
+
+	mutex_enter(&cio->io_lock);
+	mutex_enter(&pio->io_lock);
+
+	list_remove(&pio->io_child_list, zl);
+	list_remove(&cio->io_parent_list, zl);
+
+	pio->io_child_count--;
+	cio->io_parent_count--;
+
+	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
+
+	kmem_cache_free(zio_link_cache, zl);
+}
+
+static boolean_t
+zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
+{
+	uint64_t *countp = &zio->io_children[child][wait];
+	boolean_t waiting = B_FALSE;
+
+	mutex_enter(&zio->io_lock);
+	ASSERT(zio->io_stall == NULL);
+	if (*countp != 0) {
+		zio->io_stage >>= 1;
+		zio->io_stall = countp;
+		waiting = B_TRUE;
+	}
+	mutex_exit(&zio->io_lock);
+
+	return (waiting);
+}
+
+__attribute__((always_inline))
+static inline void
+zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
+{
+	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
+	int *errorp = &pio->io_child_error[zio->io_child_type];
+
+	mutex_enter(&pio->io_lock);
+	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+		*errorp = zio_worst_error(*errorp, zio->io_error);
+	pio->io_reexecute |= zio->io_reexecute;
+	ASSERT3U(*countp, >, 0);
+
+	(*countp)--;
+
+	if (*countp == 0 && pio->io_stall == countp) {
+		pio->io_stall = NULL;
+		mutex_exit(&pio->io_lock);
+		__zio_execute(pio);
+	} else {
+		mutex_exit(&pio->io_lock);
+	}
+}
+
+static void
+zio_inherit_child_errors(zio_t *zio, enum zio_child c)
+{
+	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
+		zio->io_error = zio->io_child_error[c];
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free, etc)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
+    enum zio_stage stage, enum zio_stage pipeline)
+{
+	zio_t *zio;
+
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
+	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
+	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
+	ASSERT(vd || stage == ZIO_STAGE_OPEN);
+
+	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+	bzero(zio, sizeof (zio_t));
+
+	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+
+	list_create(&zio->io_parent_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_parent_node));
+	list_create(&zio->io_child_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_child_node));
+
+	if (vd != NULL)
+		zio->io_child_type = ZIO_CHILD_VDEV;
+	else if (flags & ZIO_FLAG_GANG_CHILD)
+		zio->io_child_type = ZIO_CHILD_GANG;
+	else if (flags & ZIO_FLAG_DDT_CHILD)
+		zio->io_child_type = ZIO_CHILD_DDT;
+	else
+		zio->io_child_type = ZIO_CHILD_LOGICAL;
+
+	if (bp != NULL) {
+		zio->io_bp = (blkptr_t *)bp;
+		zio->io_bp_copy = *bp;
+		zio->io_bp_orig = *bp;
+		if (type != ZIO_TYPE_WRITE ||
+		    zio->io_child_type == ZIO_CHILD_DDT)
+			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
+		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
+			zio->io_logical = zio;
+		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+			pipeline |= ZIO_GANG_STAGES;
+	}
+
+	zio->io_spa = spa;
+	zio->io_txg = txg;
+	zio->io_done = done;
+	zio->io_private = private;
+	zio->io_type = type;
+	zio->io_priority = priority;
+	zio->io_vd = vd;
+	zio->io_offset = offset;
+	zio->io_orig_data = zio->io_data = data;
+	zio->io_orig_size = zio->io_size = size;
+	zio->io_orig_flags = zio->io_flags = flags;
+	zio->io_orig_stage = zio->io_stage = stage;
+	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+
+	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
+	if (zb != NULL)
+		zio->io_bookmark = *zb;
+
+	if (pio != NULL) {
+		if (zio->io_logical == NULL)
+			zio->io_logical = pio->io_logical;
+		if (zio->io_child_type == ZIO_CHILD_GANG)
+			zio->io_gang_leader = pio->io_gang_leader;
+		zio_add_child(pio, zio);
+	}
+
+	taskq_init_ent(&zio->io_tqent);
+
+	return (zio);
+}
+
+static void
+zio_destroy(zio_t *zio)
+{
+	list_destroy(&zio->io_parent_list);
+	list_destroy(&zio->io_child_list);
+	mutex_destroy(&zio->io_lock);
+	cv_destroy(&zio->io_cv);
+	kmem_cache_free(zio_cache, zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+    void *private, enum zio_flag flags)
+{
+	zio_t *zio;
+
+	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
+
+	return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+	return (zio_null(NULL, spa, NULL, done, private, flags));
+}
+
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+	int i;
+
+	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+		    bp, (longlong_t)BP_GET_TYPE(bp));
+	}
+	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
+	}
+	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+		    bp, (longlong_t)BP_GET_COMPRESS(bp));
+	}
+	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+		    bp, (longlong_t)BP_GET_LSIZE(bp));
+	}
+	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+		    bp, (longlong_t)BP_GET_PSIZE(bp));
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
+			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+			    bp, (longlong_t)BPE_GET_ETYPE(bp));
+		}
+	}
+
+	/*
+	 * Pool-specific checks.
+	 *
+	 * Note: it would be nice to verify that the blk_birth and
+	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
+	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
+	 * that are in the log) to be arbitrarily large.
+	 */
+	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+		vdev_t *vd;
+		uint64_t offset, asize;
+		if (vdevid >= spa->spa_root_vdev->vdev_children) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+		}
+		vd = spa->spa_root_vdev->vdev_child[vdevid];
+		if (vd == NULL) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+		}
+		if (vd->vdev_ops == &vdev_hole_ops) {
+			zfs_panic_recover("blkptr at %p DVA %u has hole "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+
+		}
+		if (vd->vdev_ops == &vdev_missing_ops) {
+			/*
+			 * "missing" vdevs are valid during import, but we
+			 * don't have their detailed info (e.g. asize), so
+			 * we can't perform any more checks on them.
+			 */
+			continue;
+		}
+		offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+		asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+		if (BP_IS_GANG(bp))
+			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+		if (offset + asize > vd->vdev_asize) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "OFFSET %llu",
+			    bp, i, (longlong_t)offset);
+		}
+	}
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+    void *data, uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+{
+	zio_t *zio;
+
+	zfs_blkptr_verify(spa, bp);
+
+	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+	    data, size, done, private,
+	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
+	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
+
+	return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+    void *data, uint64_t size, const zio_prop_t *zp,
+    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+{
+	zio_t *zio;
+
+	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
+	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
+	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
+	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
+	    DMU_OT_IS_VALID(zp->zp_type) &&
+	    zp->zp_level < 32 &&
+	    zp->zp_copies > 0 &&
+	    zp->zp_copies <= spa_max_replication(spa));
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+
+	zio->io_ready = ready;
+	zio->io_physdone = physdone;
+	zio->io_prop = *zp;
+
+	/*
+	 * Data can be NULL if we are going to call zio_write_override() to
+	 * provide the already-allocated BP.  But we may need the data to
+	 * verify a dedup hit (if requested).  In this case, don't try to
+	 * dedup (just take the already-allocated BP verbatim).
+	 */
+	if (data == NULL && zio->io_prop.zp_dedup_verify) {
+		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+	}
+
+	return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
+{
+	zio_t *zio;
+
+	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+	return (zio);
+}
+
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+{
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+	/*
+	 * We must reset the io_prop to match the values that existed
+	 * when the bp was first written by dmu_sync() keeping in mind
+	 * that nopwrite and dedup are mutually exclusive.
+	 */
+	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+	zio->io_prop.zp_nopwrite = nopwrite;
+	zio->io_prop.zp_copies = copies;
+	zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+
+	/*
+	 * The check for EMBEDDED is a performance optimization.  We
+	 * process the free here (by ignoring it) rather than
+	 * putting it on the list and then processing it in zio_free_sync().
+	 */
+	if (BP_IS_EMBEDDED(bp))
+		return;
+	metaslab_check_free(spa, bp);
+
+	/*
+	 * Frees that are for the currently-syncing txg, are not going to be
+	 * deferred, and which will not need to do a read (i.e. not GANG or
+	 * DEDUP), can be processed immediately.  Otherwise, put them on the
+	 * in-memory list for later processing.
+	 */
+	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+	    txg != spa->spa_syncing_txg ||
+	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
+		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+	} else {
+		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
+	}
+}
+
+zio_t *
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    enum zio_flag flags)
+{
+	zio_t *zio;
+	enum zio_stage stage = ZIO_FREE_PIPELINE;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(spa_syncing_txg(spa) == txg);
+	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+	metaslab_check_free(spa, bp);
+	arc_freed(spa, bp);
+
+	/*
+	 * GANG and DEDUP blocks can induce a read (for the gang block header,
+	 * or the DDT), so issue them asynchronously so that this thread is
+	 * not tied up.
+	 */
+	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
+		stage |= ZIO_STAGE_ISSUE_ASYNC;
+
+	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
+	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
+
+	return (zio);
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+    zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+	zio_t *zio;
+
+	dprintf_bp(bp, "claiming in txg %llu", txg);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+	/*
+	 * A claim is an allocation of a specific block.  Claims are needed
+	 * to support immediate writes in the intent log.  The issue is that
+	 * immediate writes contain committed data, but in a txg that was
+	 * *not* committed.  Upon opening the pool after an unclean shutdown,
+	 * the intent log claims all blocks that contain immediate write data
+	 * so that the SPA knows they're in use.
+	 *
+	 * All claims *must* be resolved in the first txg -- before the SPA
+	 * starts allocating blocks -- so that nothing is allocated twice.
+	 * If txg == 0 we just verify that the block is claimable.
+	 */
+	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
+	ASSERT(txg == spa_first_txg(spa) || txg == 0);
+	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
+
+	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
+	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+
+	return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+    zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+	zio_t *zio;
+	int c;
+
+	if (vd->vdev_children == 0) {
+		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
+		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+		zio->io_cmd = cmd;
+	} else {
+		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
+
+		for (c = 0; c < vd->vdev_children; c++)
+			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+			    done, private, flags));
+	}
+
+	return (zio);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+	zio_t *zio;
+
+	ASSERT(vd->vdev_children == 0);
+	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+	ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
+	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+	zio->io_prop.zp_checksum = checksum;
+
+	return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+    void *data, int checksum, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+	zio_t *zio;
+
+	ASSERT(vd->vdev_children == 0);
+	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+	ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+	zio->io_prop.zp_checksum = checksum;
+
+	if (zio_checksum_table[checksum].ci_eck) {
+		/*
+		 * zec checksums are necessarily destructive -- they modify
+		 * the end of the write buffer to hold the verifier/checksum.
+		 * Therefore, we must make a local copy in case the data is
+		 * being written to multiple places in parallel.
+		 */
+		void *wbuf = zio_buf_alloc(size);
+		bcopy(data, wbuf, size);
+		zio_push_transform(zio, wbuf, size, size, NULL);
+	}
+
+	return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+	void *data, uint64_t size, int type, zio_priority_t priority,
+	enum zio_flag flags, zio_done_func_t *done, void *private)
+{
+	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
+	zio_t *zio;
+
+	ASSERT(vd->vdev_parent ==
+	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
+
+	if (type == ZIO_TYPE_READ && bp != NULL) {
+		/*
+		 * If we have the bp, then the child should perform the
+		 * checksum and the parent need not.  This pushes error
+		 * detection as close to the leaves as possible and
+		 * eliminates redundant checksums in the interior nodes.
+		 */
+		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+	}
+
+	if (vd->vdev_children == 0)
+		offset += VDEV_LABEL_START_SIZE;
+
+	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+	/*
+	 * If we've decided to do a repair, the write is not speculative --
+	 * even if the original read was.
+	 */
+	if (flags & ZIO_FLAG_IO_REPAIR)
+		flags &= ~ZIO_FLAG_SPECULATIVE;
+
+	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
+	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+
+	zio->io_physdone = pio->io_physdone;
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+		zio->io_logical->io_phys_children++;
+
+	return (zio);
+}
+
+zio_t *
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
+	int type, zio_priority_t priority, enum zio_flag flags,
+	zio_done_func_t *done, void *private)
+{
+	zio_t *zio;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
+	    data, size, done, private, type, priority,
+	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
+	    vd, offset, NULL,
+	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
+
+	return (zio);
+}
+
+void
+zio_flush(zio_t *zio, vdev_t *vd)
+{
+	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+	    NULL, NULL,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
+
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+	ASSERT(zio->io_executor == NULL);
+	ASSERT(zio->io_orig_size == zio->io_size);
+	ASSERT(size <= zio->io_size);
+
+	/*
+	 * We don't shrink for raidz because of problems with the
+	 * reconstruction when reading back less than the block size.
+	 * Note, BP_IS_RAIDZ() assumes no compression.
+	 */
+	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+	if (!BP_IS_RAIDZ(zio->io_bp))
+		zio->io_orig_size = zio->io_size = size;
+}
+
+/*
+ * ==========================================================================
+ * Prepare to read and write logical blocks
+ * ==========================================================================
+ */
+
+static int
+zio_read_bp_init(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
+	    !(zio->io_flags & ZIO_FLAG_RAW)) {
+		uint64_t psize =
+		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
+		void *cbuf = zio_buf_alloc(psize);
+
+		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
+	}
+
+	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		decode_embedded_bp_compressed(bp, zio->io_data);
+	} else {
+		ASSERT(!BP_IS_EMBEDDED(bp));
+	}
+
+	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_bp_init(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_prop_t *zp = &zio->io_prop;
+	enum zio_compress compress = zp->zp_compress;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t lsize = zio->io_size;
+	uint64_t psize = lsize;
+	int pass = 1;
+
+	/*
+	 * If our children haven't all reached the ready stage,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+		return (ZIO_PIPELINE_STOP);
+
+	if (!IO_IS_ALLOCATING(zio))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+	if (zio->io_bp_override) {
+		ASSERT(bp->blk_birth != zio->io_txg);
+		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+		*bp = *zio->io_bp_override;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+		if (BP_IS_EMBEDDED(bp))
+			return (ZIO_PIPELINE_CONTINUE);
+
+		/*
+		 * If we've been overridden and nopwrite is set then
+		 * set the flag accordingly to indicate that a nopwrite
+		 * has already occurred.
+		 */
+		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+			ASSERT(!zp->zp_dedup);
+			zio->io_flags |= ZIO_FLAG_NOPWRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		ASSERT(!zp->zp_nopwrite);
+
+		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+		    zp->zp_dedup_verify);
+
+		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+			BP_SET_DEDUP(bp, 1);
+			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+	}
+
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+		/*
+		 * We're rewriting an existing block, which means we're
+		 * working on behalf of spa_sync().  For spa_sync() to
+		 * converge, it must eventually be the case that we don't
+		 * have to allocate new blocks.  But compression changes
+		 * the blocksize, which forces a reallocate, and makes
+		 * convergence take longer.  Therefore, after the first
+		 * few passes, stop compressing to ensure convergence.
+		 */
+		pass = spa_sync_pass(spa);
+
+		ASSERT(zio->io_txg == spa_syncing_txg(spa));
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+		ASSERT(!BP_GET_DEDUP(bp));
+
+		if (pass >= zfs_sync_pass_dont_compress)
+			compress = ZIO_COMPRESS_OFF;
+
+		/* Make sure someone doesn't change their mind on overwrites */
+		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
+		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+	}
+
+	if (compress != ZIO_COMPRESS_OFF) {
+		void *cbuf = zio_buf_alloc(lsize);
+		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		if (psize == 0 || psize == lsize) {
+			compress = ZIO_COMPRESS_OFF;
+			zio_buf_free(cbuf, lsize);
+		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+			encode_embedded_bp_compressed(bp,
+			    cbuf, compress, lsize, psize);
+			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+			BP_SET_TYPE(bp, zio->io_prop.zp_type);
+			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+			zio_buf_free(cbuf, lsize);
+			bp->blk_birth = zio->io_txg;
+			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+			ASSERT(spa_feature_is_active(spa,
+			    SPA_FEATURE_EMBEDDED_DATA));
+			return (ZIO_PIPELINE_CONTINUE);
+		} else {
+			/*
+			 * Round up compressed size up to the ashift
+			 * of the smallest-ashift device, and zero the tail.
+			 * This ensures that the compressed size of the BP
+			 * (and thus compressratio property) are correct,
+			 * in that we charge for the padding used to fill out
+			 * the last sector.
+			 */
+			size_t rounded;
+
+			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+
+			rounded = (size_t)P2ROUNDUP(psize,
+			    1ULL << spa->spa_min_ashift);
+			if (rounded >= lsize) {
+				compress = ZIO_COMPRESS_OFF;
+				zio_buf_free(cbuf, lsize);
+				psize = lsize;
+			} else {
+				bzero((char *)cbuf + psize, rounded - psize);
+				psize = rounded;
+				zio_push_transform(zio, cbuf,
+				    psize, lsize, NULL);
+			}
+		}
+	}
+
+	/*
+	 * The final pass of spa_sync() must be all rewrites, but the first
+	 * few passes offer a trade-off: allocating blocks defers convergence,
+	 * but newly allocated blocks are sequential, so they can be written
+	 * to disk faster.  Therefore, we allow the first few passes of
+	 * spa_sync() to allocate new blocks, but force rewrites after that.
+	 * There should only be a handful of blocks after pass 1 in any case.
+	 */
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+	    BP_GET_PSIZE(bp) == psize &&
+	    pass >= zfs_sync_pass_rewrite) {
+		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+		ASSERT(psize != 0);
+		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
+		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
+	} else {
+		BP_ZERO(bp);
+		zio->io_pipeline = ZIO_WRITE_PIPELINE;
+	}
+
+	if (psize == 0) {
+		if (zio->io_bp_orig.blk_birth != 0 &&
+		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, zp->zp_type);
+			BP_SET_LEVEL(bp, zp->zp_level);
+			BP_SET_BIRTH(bp, zio->io_txg, 0);
+		}
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+	} else {
+		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
+		BP_SET_LSIZE(bp, lsize);
+		BP_SET_TYPE(bp, zp->zp_type);
+		BP_SET_LEVEL(bp, zp->zp_level);
+		BP_SET_PSIZE(bp, psize);
+		BP_SET_COMPRESS(bp, compress);
+		BP_SET_CHECKSUM(bp, zp->zp_checksum);
+		BP_SET_DEDUP(bp, zp->zp_dedup);
+		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+		if (zp->zp_dedup) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+		}
+		if (zp->zp_nopwrite) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+		}
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+		if (BP_GET_DEDUP(bp))
+			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Execute the I/O pipeline
+ * ==========================================================================
+ */
+
+static void
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
+{
+	spa_t *spa = zio->io_spa;
+	zio_type_t t = zio->io_type;
+	int flags = (cutinline ? TQ_FRONT : 0);
+
+	/*
+	 * If we're a config writer or a probe, the normal issue and
+	 * interrupt threads may all be blocked waiting for the config lock.
+	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
+	 */
+	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
+		t = ZIO_TYPE_NULL;
+
+	/*
+	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
+	 */
+	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
+		t = ZIO_TYPE_NULL;
+
+	/*
+	 * If this is a high priority I/O, then use the high priority taskq if
+	 * available.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_NOW &&
+	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+		q++;
+
+	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+
+	/*
+	 * NB: We are assuming that the zio can only be dispatched
+	 * to a single taskq at a time.  It would be a grievous error
+	 * to dispatch the zio to another taskq at the same time.
+	 */
+	ASSERT(taskq_empty_ent(&zio->io_tqent));
+	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+	    flags, &zio->io_tqent);
+}
+
+static boolean_t
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
+{
+	kthread_t *executor = zio->io_executor;
+	spa_t *spa = zio->io_spa;
+	zio_type_t t;
+
+	for (t = 0; t < ZIO_TYPES; t++) {
+		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+		uint_t i;
+		for (i = 0; i < tqs->stqs_count; i++) {
+			if (taskq_member(tqs->stqs_taskq[i], executor))
+				return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static int
+zio_issue_async(zio_t *zio)
+{
+	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+
+	return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
+}
+
+/*
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread.  In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait_io().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
+ */
+static zio_pipe_stage_t *zio_pipeline[];
+
+/*
+ * zio_execute() is a wrapper around the static function
+ * __zio_execute() so that we can force  __zio_execute() to be
+ * inlined.  This reduces stack overhead which is important
+ * because __zio_execute() is called recursively in several zio
+ * code paths.  zio_execute() itself cannot be inlined because
+ * it is externally visible.
+ */
+void
+zio_execute(zio_t *zio)
+{
+	fstrans_cookie_t cookie;
+
+	cookie = spl_fstrans_mark();
+	__zio_execute(zio);
+	spl_fstrans_unmark(cookie);
+}
+
+/*
+ * Used to determine if in the current context the stack is sized large
+ * enough to allow zio_execute() to be called recursively.  A minimum
+ * stack size of 16K is required to avoid needing to re-dispatch the zio.
+ */
+boolean_t
+zio_execute_stack_check(zio_t *zio)
+{
+#if !defined(HAVE_LARGE_STACKS)
+	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+
+	/* Executing in txg_sync_thread() context. */
+	if (dp && curthread == dp->dp_tx.tx_sync_thread)
+		return (B_TRUE);
+
+	/* Pool initialization outside of zio_taskq context. */
+	if (dp && spa_is_initializing(dp->dp_spa) &&
+	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
+	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
+		return (B_TRUE);
+#endif /* HAVE_LARGE_STACKS */
+
+	return (B_FALSE);
+}
+
+__attribute__((always_inline))
+static inline void
+__zio_execute(zio_t *zio)
+{
+	zio->io_executor = curthread;
+
+	while (zio->io_stage < ZIO_STAGE_DONE) {
+		enum zio_stage pipeline = zio->io_pipeline;
+		enum zio_stage stage = zio->io_stage;
+		int rv;
+
+		ASSERT(!MUTEX_HELD(&zio->io_lock));
+		ASSERT(ISP2(stage));
+		ASSERT(zio->io_stall == NULL);
+
+		do {
+			stage <<= 1;
+		} while ((stage & pipeline) == 0);
+
+		ASSERT(stage <= ZIO_STAGE_DONE);
+
+		/*
+		 * If we are in interrupt context and this pipeline stage
+		 * will grab a config lock that is held across I/O,
+		 * or may wait for an I/O that needs an interrupt thread
+		 * to complete, issue async to avoid deadlock.
+		 *
+		 * For VDEV_IO_START, we cut in line so that the io will
+		 * be sent to disk promptly.
+		 */
+		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+			    zio_requeue_io_start_cut_in_line : B_FALSE;
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+			return;
+		}
+
+		/*
+		 * If the current context doesn't have large enough stacks
+		 * the zio must be issued asynchronously to prevent overflow.
+		 */
+		if (zio_execute_stack_check(zio)) {
+			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+			    zio_requeue_io_start_cut_in_line : B_FALSE;
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+			return;
+		}
+
+		zio->io_stage = stage;
+		rv = zio_pipeline[highbit64(stage) - 1](zio);
+
+		if (rv == ZIO_PIPELINE_STOP)
+			return;
+
+		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
+	}
+}
+
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+	int error;
+
+	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+	ASSERT(zio->io_executor == NULL);
+
+	zio->io_waiter = curthread;
+
+	__zio_execute(zio);
+
+	mutex_enter(&zio->io_lock);
+	while (zio->io_executor != NULL)
+		cv_wait_io(&zio->io_cv, &zio->io_lock);
+	mutex_exit(&zio->io_lock);
+
+	error = zio->io_error;
+	zio_destroy(zio);
+
+	return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+	ASSERT(zio->io_executor == NULL);
+
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+	    zio_unique_parent(zio) == NULL) {
+		zio_t *pio;
+
+		/*
+		 * This is a logical async I/O with no parent to wait for it.
+		 * We add it to the spa_async_root_zio "Godfather" I/O which
+		 * will ensure they complete prior to unloading the pool.
+		 */
+		spa_t *spa = zio->io_spa;
+		kpreempt_disable();
+		pio = spa->spa_async_zio_root[CPU_SEQID];
+		kpreempt_enable();
+
+		zio_add_child(pio, zio);
+	}
+
+	__zio_execute(zio);
+}
+
+/*
+ * ==========================================================================
+ * Reexecute or suspend/resume failed I/O
+ * ==========================================================================
+ */
+
+static void
+zio_reexecute(zio_t *pio)
+{
+	zio_t *cio, *cio_next;
+	int c, w;
+
+	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+	ASSERT(pio->io_gang_leader == NULL);
+	ASSERT(pio->io_gang_tree == NULL);
+
+	pio->io_flags = pio->io_orig_flags;
+	pio->io_stage = pio->io_orig_stage;
+	pio->io_pipeline = pio->io_orig_pipeline;
+	pio->io_reexecute = 0;
+	pio->io_flags |= ZIO_FLAG_REEXECUTED;
+	pio->io_error = 0;
+	for (w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_state[w] = 0;
+	for (c = 0; c < ZIO_CHILD_TYPES; c++)
+		pio->io_child_error[c] = 0;
+
+	if (IO_IS_ALLOCATING(pio))
+		BP_ZERO(pio->io_bp);
+
+	/*
+	 * As we reexecute pio's children, new children could be created.
+	 * New children go to the head of pio's io_child_list, however,
+	 * so we will (correctly) not reexecute them.  The key is that
+	 * the remainder of pio's io_child_list, from 'cio_next' onward,
+	 * cannot be affected by any side effects of reexecuting 'cio'.
+	 */
+	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio);
+		mutex_enter(&pio->io_lock);
+		for (w = 0; w < ZIO_WAIT_TYPES; w++)
+			pio->io_children[cio->io_child_type][w]++;
+		mutex_exit(&pio->io_lock);
+		zio_reexecute(cio);
+	}
+
+	/*
+	 * Now that all children have been reexecuted, execute the parent.
+	 * We don't reexecute "The Godfather" I/O here as it's the
+	 * responsibility of the caller to wait on him.
+	 */
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+		__zio_execute(pio);
+}
+
+void
+zio_suspend(spa_t *spa, zio_t *zio)
+{
+	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
+		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
+		    "failure and the failure mode property for this pool "
+		    "is set to panic.", spa_name(spa));
+
+	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
+	    "failure and has been suspended.\n", spa_name(spa));
+
+	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
+
+	mutex_enter(&spa->spa_suspend_lock);
+
+	if (spa->spa_suspend_zio_root == NULL)
+		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+
+	spa->spa_suspended = B_TRUE;
+
+	if (zio != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+		ASSERT(zio != spa->spa_suspend_zio_root);
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+		ASSERT(zio_unique_parent(zio) == NULL);
+		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+		zio_add_child(spa->spa_suspend_zio_root, zio);
+	}
+
+	mutex_exit(&spa->spa_suspend_lock);
+}
+
+int
+zio_resume(spa_t *spa)
+{
+	zio_t *pio;
+
+	/*
+	 * Reexecute all previously suspended i/o.
+	 */
+	mutex_enter(&spa->spa_suspend_lock);
+	spa->spa_suspended = B_FALSE;
+	cv_broadcast(&spa->spa_suspend_cv);
+	pio = spa->spa_suspend_zio_root;
+	spa->spa_suspend_zio_root = NULL;
+	mutex_exit(&spa->spa_suspend_lock);
+
+	if (pio == NULL)
+		return (0);
+
+	zio_reexecute(pio);
+	return (zio_wait(pio));
+}
+
+void
+zio_resume_wait(spa_t *spa)
+{
+	mutex_enter(&spa->spa_suspend_lock);
+	while (spa_suspended(spa))
+		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
+	mutex_exit(&spa->spa_suspend_lock);
+}
+
+/*
+ * ==========================================================================
+ * Gang blocks.
+ *
+ * A gang block is a collection of small blocks that looks to the DMU
+ * like one large block.  When zio_dva_allocate() cannot find a block
+ * of the requested size, due to either severe fragmentation or the pool
+ * being nearly full, it calls zio_write_gang_block() to construct the
+ * block from smaller fragments.
+ *
+ * A gang block consists of a gang header (zio_gbh_phys_t) and up to
+ * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
+ * an indirect block: it's an array of block pointers.  It consumes
+ * only one sector and hence is allocatable regardless of fragmentation.
+ * The gang header's bps point to its gang members, which hold the data.
+ *
+ * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
+ * as the verifier to ensure uniqueness of the SHA256 checksum.
+ * Critically, the gang block bp's blk_cksum is the checksum of the data,
+ * not the gang header.  This ensures that data block signatures (needed for
+ * deduplication) are independent of how the block is physically stored.
+ *
+ * Gang blocks can be nested: a gang member may itself be a gang block.
+ * Thus every gang block is a tree in which root and all interior nodes are
+ * gang headers, and the leaves are normal blocks that contain user data.
+ * The root of the gang tree is called the gang leader.
+ *
+ * To perform any operation (read, rewrite, free, claim) on a gang block,
+ * zio_gang_assemble() first assembles the gang tree (minus data leaves)
+ * in the io_gang_tree field of the original logical i/o by recursively
+ * reading the gang leader and all gang headers below it.  This yields
+ * an in-core tree containing the contents of every gang header and the
+ * bps for every constituent of the gang block.
+ *
+ * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
+ * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
+ * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
+ * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
+ * zio_read_gang() is a wrapper around zio_read() that omits reading gang
+ * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
+ * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
+ * of the gang header plus zio_checksum_compute() of the data to update the
+ * gang header's blk_cksum as described above.
+ *
+ * The two-phase assemble/issue model solves the problem of partial failure --
+ * what if you'd freed part of a gang block but then couldn't read the
+ * gang header for another part?  Assembling the entire gang tree first
+ * ensures that all the necessary gang header I/O has succeeded before
+ * starting the actual work of free, claim, or write.  Once the gang tree
+ * is assembled, free and claim are in-memory operations that cannot fail.
+ *
+ * In the event that a gang write fails, zio_dva_unallocate() walks the
+ * gang tree to immediately free (i.e. insert back into the space map)
+ * everything we've allocated.  This ensures that we don't get ENOSPC
+ * errors during repeated suspend/resume cycles due to a flaky device.
+ *
+ * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
+ * the gang tree, we won't modify the block, so we can safely defer the free
+ * (knowing that the block is still intact).  If we *can* assemble the gang
+ * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
+ * each constituent bp and we can allocate a new block on the next sync pass.
+ *
+ * In all cases, the gang tree allows complete recovery from partial failure.
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+{
+	if (gn != NULL)
+		return (pio);
+
+	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
+	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+	    &pio->io_bookmark));
+}
+
+zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+{
+	zio_t *zio;
+
+	if (gn != NULL) {
+		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+		/*
+		 * As we rewrite each gang header, the pipeline will compute
+		 * a new gang block header checksum for it; but no one will
+		 * compute a new data checksum, so we do that here.  The one
+		 * exception is the gang leader: the pipeline already computed
+		 * its data checksum because that stage precedes gang assembly.
+		 * (Presently, nothing actually uses interior data checksums;
+		 * this is just good hygiene.)
+		 */
+		if (gn != pio->io_gang_leader->io_gang_tree) {
+			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
+			    data, BP_GET_PSIZE(bp));
+		}
+		/*
+		 * If we are here to damage data for testing purposes,
+		 * leave the GBH alone so that we can detect the damage.
+		 */
+		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+	} else {
+		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+	}
+
+	return (zio);
+}
+
+/* ARGSUSED */
+zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+{
+	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+	    ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+/* ARGSUSED */
+zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+{
+	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
+	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
+	NULL,
+	zio_read_gang,
+	zio_rewrite_gang,
+	zio_free_gang,
+	zio_claim_gang,
+	NULL
+};
+
+static void zio_gang_tree_assemble_done(zio_t *zio);
+
+static zio_gang_node_t *
+zio_gang_node_alloc(zio_gang_node_t **gnpp)
+{
+	zio_gang_node_t *gn;
+
+	ASSERT(*gnpp == NULL);
+
+	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+	*gnpp = gn;
+
+	return (gn);
+}
+
+static void
+zio_gang_node_free(zio_gang_node_t **gnpp)
+{
+	zio_gang_node_t *gn = *gnpp;
+	int g;
+
+	for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
+		ASSERT(gn->gn_child[g] == NULL);
+
+	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+	kmem_free(gn, sizeof (*gn));
+	*gnpp = NULL;
+}
+
+static void
+zio_gang_tree_free(zio_gang_node_t **gnpp)
+{
+	zio_gang_node_t *gn = *gnpp;
+	int g;
+
+	if (gn == NULL)
+		return;
+
+	for (g = 0; g < SPA_GBH_NBLKPTRS; g++)
+		zio_gang_tree_free(&gn->gn_child[g]);
+
+	zio_gang_node_free(gnpp);
+}
+
+static void
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
+{
+	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+
+	ASSERT(gio->io_gang_leader == gio);
+	ASSERT(BP_IS_GANG(bp));
+
+	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
+	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
+	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+}
+
+static void
+zio_gang_tree_assemble_done(zio_t *zio)
+{
+	zio_t *gio = zio->io_gang_leader;
+	zio_gang_node_t *gn = zio->io_private;
+	blkptr_t *bp = zio->io_bp;
+	int g;
+
+	ASSERT(gio == zio_unique_parent(zio));
+	ASSERT(zio->io_child_count == 0);
+
+	if (zio->io_error)
+		return;
+
+	if (BP_SHOULD_BYTESWAP(bp))
+		byteswap_uint64_array(zio->io_data, zio->io_size);
+
+	ASSERT(zio->io_data == gn->gn_gbh);
+	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+	for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+		if (!BP_IS_GANG(gbp))
+			continue;
+		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
+	}
+}
+
+static void
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
+{
+	zio_t *gio = pio->io_gang_leader;
+	zio_t *zio;
+	int g;
+
+	ASSERT(BP_IS_GANG(bp) == !!gn);
+	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
+
+	/*
+	 * If you're a gang header, your data is in gn->gn_gbh.
+	 * If you're a gang member, your data is in 'data' and gn == NULL.
+	 */
+	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
+
+	if (gn != NULL) {
+		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+			if (BP_IS_HOLE(gbp))
+				continue;
+			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
+			data = (char *)data + BP_GET_PSIZE(gbp);
+		}
+	}
+
+	if (gn == gio->io_gang_tree)
+		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
+
+	if (zio != pio)
+		zio_nowait(zio);
+}
+
+static int
+zio_gang_assemble(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	zio->io_gang_leader = zio;
+
+	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_gang_issue(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
+		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
+	else
+		zio_gang_tree_free(&zio->io_gang_tree);
+
+	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static void
+zio_write_gang_member_ready(zio_t *zio)
+{
+	zio_t *pio = zio_unique_parent(zio);
+	dva_t *cdva = zio->io_bp->blk_dva;
+	dva_t *pdva = pio->io_bp->blk_dva;
+	uint64_t asize;
+	int d;
+	ASSERTV(zio_t *gio = zio->io_gang_leader);
+
+	if (BP_IS_HOLE(zio->io_bp))
+		return;
+
+	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+
+	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
+	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
+	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+
+	mutex_enter(&pio->io_lock);
+	for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
+		ASSERT(DVA_GET_GANG(&pdva[d]));
+		asize = DVA_GET_ASIZE(&pdva[d]);
+		asize += DVA_GET_ASIZE(&cdva[d]);
+		DVA_SET_ASIZE(&pdva[d], asize);
+	}
+	mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_write_gang_block(zio_t *pio)
+{
+	spa_t *spa = pio->io_spa;
+	blkptr_t *bp = pio->io_bp;
+	zio_t *gio = pio->io_gang_leader;
+	zio_t *zio;
+	zio_gang_node_t *gn, **gnpp;
+	zio_gbh_phys_t *gbh;
+	uint64_t txg = pio->io_txg;
+	uint64_t resid = pio->io_size;
+	uint64_t lsize;
+	int copies = gio->io_prop.zp_copies;
+	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+	zio_prop_t zp;
+	int g, error;
+
+	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
+	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+	if (error) {
+		pio->io_error = error;
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	if (pio == gio) {
+		gnpp = &gio->io_gang_tree;
+	} else {
+		gnpp = pio->io_private;
+		ASSERT(pio->io_ready == zio_write_gang_member_ready);
+	}
+
+	gn = zio_gang_node_alloc(gnpp);
+	gbh = gn->gn_gbh;
+	bzero(gbh, SPA_GANGBLOCKSIZE);
+
+	/*
+	 * Create the gang header.
+	 */
+	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
+	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+	/*
+	 * Create and nowait the gang children.
+	 */
+	for (g = 0; resid != 0; resid -= lsize, g++) {
+		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
+		    SPA_MINBLOCKSIZE);
+		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
+
+		zp.zp_checksum = gio->io_prop.zp_checksum;
+		zp.zp_compress = ZIO_COMPRESS_OFF;
+		zp.zp_type = DMU_OT_NONE;
+		zp.zp_level = 0;
+		zp.zp_copies = gio->io_prop.zp_copies;
+		zp.zp_dedup = B_FALSE;
+		zp.zp_dedup_verify = B_FALSE;
+		zp.zp_nopwrite = B_FALSE;
+
+		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
+		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
+		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+		    &pio->io_bookmark));
+	}
+
+	/*
+	 * Set pio's pipeline to just wait for zio to finish.
+	 */
+	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+	/*
+	 * We didn't allocate this bp, so make sure it doesn't get unmarked.
+	 */
+	pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
+
+	zio_nowait(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * The zio_nop_write stage in the pipeline determines if allocating
+ * a new bp is necessary.  By leveraging a cryptographically secure checksum,
+ * such as SHA256, we can compare the checksums of the new data and the old
+ * to determine if allocating a new block is required.  The nopwrite
+ * feature can handle writes in either syncing or open context (i.e. zil
+ * writes) and as a result is mutually exclusive with dedup.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	zio_prop_t *zp = &zio->io_prop;
+
+	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(zp->zp_nopwrite);
+	ASSERT(!zp->zp_dedup);
+	ASSERT(zio->io_bp_override == NULL);
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Check to see if the original bp and the new bp have matching
+	 * characteristics (i.e. same checksum, compression algorithms, etc).
+	 * If they don't then just continue with the pipeline which will
+	 * allocate a new bp.
+	 */
+	if (BP_IS_HOLE(bp_orig) ||
+	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	/*
+	 * If the checksums match then reset the pipeline so that we
+	 * avoid allocating a new bp and issuing any I/O.
+	 */
+	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+		    sizeof (uint64_t)) == 0);
+
+		*bp = *bp_orig;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		zio->io_flags |= ZIO_FLAG_NOPWRITE;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp;
+	zio_t *pio = zio_unique_parent(zio);
+
+	mutex_enter(&pio->io_lock);
+	ddp = ddt_phys_select(dde, bp);
+	if (zio->io_error == 0)
+		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
+	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+		dde->dde_repair_data = zio->io_data;
+	else
+		zio_buf_free(zio->io_data, zio->io_size);
+	mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	int p;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+		ddt_phys_t *ddp = dde->dde_phys;
+		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+		blkptr_t blk;
+
+		ASSERT(zio->io_vsd == NULL);
+		zio->io_vsd = dde;
+
+		if (ddp_self == NULL)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+				continue;
+			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+			    &blk);
+			zio_nowait(zio_read(zio, zio->io_spa, &blk,
+			    zio_buf_alloc(zio->io_size), zio->io_size,
+			    zio_ddt_child_read_done, dde, zio->io_priority,
+			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+			    &zio->io_bookmark));
+		}
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	zio_nowait(zio_read(zio, zio->io_spa, bp,
+	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_read_done(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	if (zio->io_child_error[ZIO_CHILD_DDT]) {
+		ddt_t *ddt = ddt_select(zio->io_spa, bp);
+		ddt_entry_t *dde = zio->io_vsd;
+		if (ddt == NULL) {
+			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+		if (dde == NULL) {
+			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+			return (ZIO_PIPELINE_STOP);
+		}
+		if (dde->dde_repair_data != NULL) {
+			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+			zio->io_child_error[ZIO_CHILD_DDT] = 0;
+		}
+		ddt_repair_done(ddt, dde);
+		zio->io_vsd = NULL;
+	}
+
+	ASSERT(zio->io_vsd == NULL);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+	spa_t *spa = zio->io_spa;
+	int p;
+
+	/*
+	 * Note: we compare the original data, not the transformed data,
+	 * because when zio->io_bp is an override bp, we will not have
+	 * pushed the I/O transforms.  That's an important optimization
+	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+	 */
+	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+		zio_t *lio = dde->dde_lead_zio[p];
+
+		if (lio != NULL) {
+			return (lio->io_orig_size != zio->io_orig_size ||
+			    bcmp(zio->io_orig_data, lio->io_orig_data,
+			    zio->io_orig_size) != 0);
+		}
+	}
+
+	for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+		ddt_phys_t *ddp = &dde->dde_phys[p];
+
+		if (ddp->ddp_phys_birth != 0) {
+			arc_buf_t *abuf = NULL;
+			arc_flags_t aflags = ARC_FLAG_WAIT;
+			blkptr_t blk = *zio->io_bp;
+			int error;
+
+			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+			ddt_exit(ddt);
+
+			error = arc_read(NULL, spa, &blk,
+			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+			    &aflags, &zio->io_bookmark);
+
+			if (error == 0) {
+				if (arc_buf_size(abuf) != zio->io_orig_size ||
+				    bcmp(abuf->b_data, zio->io_orig_data,
+				    zio->io_orig_size) != 0)
+					error = SET_ERROR(EEXIST);
+				VERIFY(arc_buf_remove_ref(abuf, &abuf));
+			}
+
+			ddt_enter(ddt);
+			return (error != 0);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+	int p = zio->io_prop.zp_copies;
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+	zio_t *pio;
+
+	if (zio->io_error)
+		return;
+
+	ddt_enter(ddt);
+
+	ASSERT(dde->dde_lead_zio[p] == zio);
+
+	ddt_phys_fill(ddp, zio->io_bp);
+
+	while ((pio = zio_walk_parents(zio)) != NULL)
+		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+	ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+	int p = zio->io_prop.zp_copies;
+	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+
+	ddt_enter(ddt);
+
+	ASSERT(ddp->ddp_refcnt == 0);
+	ASSERT(dde->dde_lead_zio[p] == zio);
+	dde->dde_lead_zio[p] = NULL;
+
+	if (zio->io_error == 0) {
+		while (zio_walk_parents(zio) != NULL)
+			ddt_phys_addref(ddp);
+	} else {
+		ddt_phys_clear(ddp);
+	}
+
+	ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+	int p = DDT_PHYS_DITTO;
+	blkptr_t *bp = zio->io_bp;
+	ddt_t *ddt = ddt_select(zio->io_spa, bp);
+	ddt_entry_t *dde = zio->io_private;
+	ddt_phys_t *ddp = &dde->dde_phys[p];
+	ddt_key_t *ddk = &dde->dde_key;
+	ASSERTV(zio_prop_t *zp = &zio->io_prop);
+
+	ddt_enter(ddt);
+
+	ASSERT(ddp->ddp_refcnt == 0);
+	ASSERT(dde->dde_lead_zio[p] == zio);
+	dde->dde_lead_zio[p] = NULL;
+
+	if (zio->io_error == 0) {
+		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+		if (ddp->ddp_phys_birth != 0)
+			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+		ddt_phys_fill(ddp, bp);
+	}
+
+	ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t txg = zio->io_txg;
+	zio_prop_t *zp = &zio->io_prop;
+	int p = zp->zp_copies;
+	int ditto_copies;
+	zio_t *cio = NULL;
+	zio_t *dio = NULL;
+	ddt_t *ddt = ddt_select(spa, bp);
+	ddt_entry_t *dde;
+	ddt_phys_t *ddp;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+	ddt_enter(ddt);
+	dde = ddt_lookup(ddt, bp, B_TRUE);
+	ddp = &dde->dde_phys[p];
+
+	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+		/*
+		 * If we're using a weak checksum, upgrade to a strong checksum
+		 * and try again.  If we're already using a strong checksum,
+		 * we can't resolve it, so just convert to an ordinary write.
+		 * (And automatically e-mail a paper to Nature?)
+		 */
+		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+			zp->zp_checksum = spa_dedup_checksum(spa);
+			zio_pop_transforms(zio);
+			zio->io_stage = ZIO_STAGE_OPEN;
+			BP_ZERO(bp);
+		} else {
+			zp->zp_dedup = B_FALSE;
+		}
+		zio->io_pipeline = ZIO_WRITE_PIPELINE;
+		ddt_exit(ddt);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+	if (ditto_copies > ddt_ditto_copies_present(dde) &&
+	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+		zio_prop_t czp = *zp;
+
+		czp.zp_copies = ditto_copies;
+
+		/*
+		 * If we arrived here with an override bp, we won't have run
+		 * the transform stack, so we won't have the data we need to
+		 * generate a child i/o.  So, toss the override bp and restart.
+		 * This is safe, because using the override bp is just an
+		 * optimization; and it's rare, so the cost doesn't matter.
+		 */
+		if (zio->io_bp_override) {
+			zio_pop_transforms(zio);
+			zio->io_stage = ZIO_STAGE_OPEN;
+			zio->io_pipeline = ZIO_WRITE_PIPELINE;
+			zio->io_bp_override = NULL;
+			BP_ZERO(bp);
+			ddt_exit(ddt);
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		    zio->io_orig_size, &czp, NULL, NULL,
+		    zio_ddt_ditto_write_done, dde, zio->io_priority,
+		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+	}
+
+	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+		if (ddp->ddp_phys_birth != 0)
+			ddt_bp_fill(ddp, bp, txg);
+		if (dde->dde_lead_zio[p] != NULL)
+			zio_add_child(zio, dde->dde_lead_zio[p]);
+		else
+			ddt_phys_addref(ddp);
+	} else if (zio->io_bp_override) {
+		ASSERT(bp->blk_birth == txg);
+		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+		ddt_phys_fill(ddp, bp);
+		ddt_phys_addref(ddp);
+	} else {
+		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
+		    zio_ddt_child_write_done, dde, zio->io_priority,
+		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+		dde->dde_lead_zio[p] = cio;
+	}
+
+	ddt_exit(ddt);
+
+	if (cio)
+		zio_nowait(cio);
+	if (dio)
+		zio_nowait(dio);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+ddt_entry_t *freedde; /* for debugging */
+
+static int
+zio_ddt_free(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	ddt_t *ddt = ddt_select(spa, bp);
+	ddt_entry_t *dde;
+	ddt_phys_t *ddp;
+
+	ASSERT(BP_GET_DEDUP(bp));
+	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+	ddt_enter(ddt);
+	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+	if (dde) {
+		ddp = ddt_phys_select(dde, bp);
+		if (ddp)
+			ddt_phys_decref(ddp);
+	}
+	ddt_exit(ddt);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+static int
+zio_dva_allocate(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	metaslab_class_t *mc = spa_normal_class(spa);
+	blkptr_t *bp = zio->io_bp;
+	int error;
+	int flags = 0;
+
+	if (zio->io_gang_leader == NULL) {
+		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+		zio->io_gang_leader = zio;
+	}
+
+	ASSERT(BP_IS_HOLE(bp));
+	ASSERT0(BP_GET_NDVAS(bp));
+	ASSERT3U(zio->io_prop.zp_copies, >, 0);
+	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
+	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+	/*
+	 * The dump device does not support gang blocks so allocation on
+	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+	 * the "fast" gang feature.
+	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+	error = metaslab_alloc(spa, mc, zio->io_size, bp,
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
+
+	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
+		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
+			return (zio_write_gang_block(zio));
+		zio->io_error = error;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_dva_free(zio_t *zio)
+{
+	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_dva_claim(zio_t *zio)
+{
+	int error;
+
+	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+	if (error)
+		zio->io_error = error;
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Undo an allocation.  This is used by zio_done() when an I/O fails
+ * and we want to give back the block we just allocated.
+ * This handles both normal blocks and gang blocks.
+ */
+static void
+zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
+{
+	int g;
+
+	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+	ASSERT(zio->io_bp_override == NULL);
+
+	if (!BP_IS_HOLE(bp))
+		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+
+	if (gn != NULL) {
+		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+			zio_dva_unallocate(zio, gn->gn_child[g],
+			    &gn->gn_gbh->zg_blkptr[g]);
+		}
+	}
+}
+
+/*
+ * Try to allocate an intent log block.  Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
+    boolean_t use_slog)
+{
+	int error = 1;
+
+	ASSERT(txg > spa_syncing_txg(spa));
+
+	/*
+	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
+	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
+	 * when allocating them.
+	 */
+	if (use_slog) {
+		error = metaslab_alloc(spa, spa_log_class(spa), size,
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
+	}
+
+	if (error) {
+		error = metaslab_alloc(spa, spa_normal_class(spa), size,
+		    new_bp, 1, txg, NULL,
+		    METASLAB_FASTWRITE);
+	}
+
+	if (error == 0) {
+		BP_SET_LSIZE(new_bp, size);
+		BP_SET_PSIZE(new_bp, size);
+		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+		BP_SET_CHECKSUM(new_bp,
+		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
+		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+		BP_SET_LEVEL(new_bp, 0);
+		BP_SET_DEDUP(new_bp, 0);
+		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+	}
+
+	return (error);
+}
+
+/*
+ * Free an intent log block.
+ */
+void
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
+{
+	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
+	ASSERT(!BP_IS_GANG(bp));
+
+	zio_free(spa, txg, bp);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
+static int
+zio_vdev_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	uint64_t align;
+	spa_t *spa = zio->io_spa;
+
+	ASSERT(zio->io_error == 0);
+	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+
+	if (vd == NULL) {
+		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+
+		/*
+		 * The mirror_ops handle multiple DVAs in a single BP.
+		 */
+		vdev_mirror_ops.vdev_op_io_start(zio);
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	/*
+	 * We keep track of time-sensitive I/Os so that the scan thread
+	 * can quickly react to certain workloads.  In particular, we care
+	 * about non-scrubbing, top-level reads and writes with the following
+	 * characteristics:
+	 *	- synchronous writes of user data to non-slog devices
+	 *	- any reads of user data
+	 * When these conditions are met, adjust the timestamp of spa_last_io
+	 * which allows the scan thread to adjust its workload accordingly.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+	    vd == vd->vdev_top && !vd->vdev_islog &&
+	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+	    zio->io_txg != spa_syncing_txg(spa)) {
+		uint64_t old = spa->spa_last_io;
+		uint64_t new = ddi_get_lbolt64();
+		if (old != new)
+			(void) atomic_cas_64(&spa->spa_last_io, old, new);
+	}
+
+	align = 1ULL << vd->vdev_top->vdev_ashift;
+
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+	    P2PHASE(zio->io_size, align) != 0) {
+		/* Transform logical writes to be a full physical block size. */
+		uint64_t asize = P2ROUNDUP(zio->io_size, align);
+		char *abuf = zio_buf_alloc(asize);
+		ASSERT(vd == vd->vdev_top);
+		if (zio->io_type == ZIO_TYPE_WRITE) {
+			bcopy(zio->io_data, abuf, zio->io_size);
+			bzero(abuf + zio->io_size, asize - zio->io_size);
+		}
+		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+	}
+
+	/*
+	 * If this is not a physical io, make sure that it is properly aligned
+	 * before proceeding.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+		ASSERT0(P2PHASE(zio->io_offset, align));
+		ASSERT0(P2PHASE(zio->io_size, align));
+	} else {
+		/*
+		 * For physical writes, we allow 512b aligned writes and assume
+		 * the device will perform a read-modify-write as necessary.
+		 */
+		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
+		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+	}
+
+	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+	/*
+	 * If this is a repair I/O, and there's no self-healing involved --
+	 * that is, we're just resilvering what we expect to resilver --
+	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
+	 * This prevents spurious resilvering with nested replication.
+	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
+	 * A is out of date, we'll read from C+D, then use the data to
+	 * resilver A+B -- but we don't actually want to resilver B, just A.
+	 * The top-level mirror has no way to know this, so instead we just
+	 * discard unnecessary repairs as we work our way down the vdev tree.
+	 * The same logic applies to any form of nested replication:
+	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+	    zio->io_txg != 0 &&	/* not a delegated i/o */
+	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		zio_vdev_io_bypass(zio);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	if (vd->vdev_ops->vdev_op_leaf &&
+	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+
+		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
+			return (ZIO_PIPELINE_CONTINUE);
+
+		if ((zio = vdev_queue_io(zio)) == NULL)
+			return (ZIO_PIPELINE_STOP);
+
+		if (!vdev_accessible(vd, zio)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return (ZIO_PIPELINE_STOP);
+		}
+	}
+
+	vd->vdev_ops->vdev_op_io_start(zio);
+	return (ZIO_PIPELINE_STOP);
+}
+
+static int
+zio_vdev_io_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
+	boolean_t unexpected_error = B_FALSE;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+
+		vdev_queue_io_done(zio);
+
+		if (zio->io_type == ZIO_TYPE_WRITE)
+			vdev_cache_write(zio);
+
+		if (zio_injection_enabled && zio->io_error == 0)
+			zio->io_error = zio_handle_device_injection(vd,
+			    zio, EIO);
+
+		if (zio_injection_enabled && zio->io_error == 0)
+			zio->io_error = zio_handle_label_injection(zio, EIO);
+
+		if (zio->io_error) {
+			if (!vdev_accessible(vd, zio)) {
+				zio->io_error = SET_ERROR(ENXIO);
+			} else {
+				unexpected_error = B_TRUE;
+			}
+		}
+	}
+
+	ops->vdev_op_io_done(zio);
+
+	if (unexpected_error)
+		VERIFY(vdev_probe(vd, zio) == NULL);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+    const void *good_buf)
+{
+	/* no processing needed */
+	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+	void *buf = zio_buf_alloc(zio->io_size);
+
+	bcopy(zio->io_data, buf, zio->io_size);
+
+	zcr->zcr_cbinfo = zio->io_size;
+	zcr->zcr_cbdata = buf;
+	zcr->zcr_finish = zio_vsd_default_cksum_finish;
+	zcr->zcr_free = zio_buf_free;
+}
+
+static int
+zio_vdev_io_assess(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
+
+	if (zio->io_vsd != NULL) {
+		zio->io_vsd_ops->vsd_free(zio);
+		zio->io_vsd = NULL;
+	}
+
+	if (zio_injection_enabled && zio->io_error == 0)
+		zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+	/*
+	 * If the I/O failed, determine whether we should attempt to retry it.
+	 *
+	 * On retry, we cut in line in the issue queue, since we don't want
+	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
+	 */
+	if (zio->io_error && vd == NULL &&
+	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
+		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
+		zio->io_error = 0;
+		zio->io_flags |= ZIO_FLAG_IO_RETRY |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
+		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+		    zio_requeue_io_start_cut_in_line);
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	/*
+	 * If we got an error on a leaf device, convert it to ENXIO
+	 * if the device is not accessible at all.
+	 */
+	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    !vdev_accessible(vd, zio))
+		zio->io_error = SET_ERROR(ENXIO);
+
+	/*
+	 * If we can't write to an interior vdev (mirror or RAID-Z),
+	 * set vdev_cant_write so that we stop trying to allocate from it.
+	 */
+	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
+	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
+		vd->vdev_cant_write = B_TRUE;
+	}
+
+	if (zio->io_error)
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    zio->io_physdone != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+		zio->io_physdone(zio->io_logical);
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+	zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+	ASSERT(zio->io_error == 0);
+
+	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static int
+zio_checksum_generate(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	enum zio_checksum checksum;
+
+	if (bp == NULL) {
+		/*
+		 * This is zio_write_phys().
+		 * We're either generating a label checksum, or none at all.
+		 */
+		checksum = zio->io_prop.zp_checksum;
+
+		if (checksum == ZIO_CHECKSUM_OFF)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
+	} else {
+		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
+			ASSERT(!IO_IS_ALLOCATING(zio));
+			checksum = ZIO_CHECKSUM_GANG_HEADER;
+		} else {
+			checksum = BP_GET_CHECKSUM(bp);
+		}
+	}
+
+	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_checksum_verify(zio_t *zio)
+{
+	zio_bad_cksum_t info;
+	blkptr_t *bp = zio->io_bp;
+	int error;
+
+	ASSERT(zio->io_vd != NULL);
+
+	if (bp == NULL) {
+		/*
+		 * This is zio_read_phys().
+		 * We're either verifying a label checksum, or nothing at all.
+		 */
+		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
+			return (ZIO_PIPELINE_CONTINUE);
+
+		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+	}
+
+	if ((error = zio_checksum_error(zio, &info)) != 0) {
+		zio->io_error = error;
+		if (error == ECKSUM &&
+		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+			zfs_ereport_start_checksum(zio->io_spa,
+			    zio->io_vd, zio, zio->io_offset,
+			    zio->io_size, NULL, &info);
+		}
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+}
+
+/*
+ * ==========================================================================
+ * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success.  ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
+ * ==========================================================================
+ */
+int
+zio_worst_error(int e1, int e2)
+{
+	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+	int r1, r2;
+
+	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+		if (e1 == zio_error_rank[r1])
+			break;
+
+	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+		if (e2 == zio_error_rank[r2])
+			break;
+
+	return (r1 > r2 ? e1 : e2);
+}
+
+/*
+ * ==========================================================================
+ * I/O completion
+ * ==========================================================================
+ */
+static int
+zio_ready(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	zio_t *pio, *pio_next;
+
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
+		return (ZIO_PIPELINE_STOP);
+
+	if (zio->io_ready) {
+		ASSERT(IO_IS_ALLOCATING(zio));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
+		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
+
+		zio->io_ready(zio);
+	}
+
+	if (bp != NULL && bp != &zio->io_bp_copy)
+		zio->io_bp_copy = *bp;
+
+	if (zio->io_error)
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_READY] = 1;
+	pio = zio_walk_parents(zio);
+	mutex_exit(&zio->io_lock);
+
+	/*
+	 * As we notify zio's parents, new parents could be added.
+	 * New parents go to the head of zio's io_parent_list, however,
+	 * so we will (correctly) not notify them.  The remainder of zio's
+	 * io_parent_list, from 'pio_next' onward, cannot change because
+	 * all parents must wait for us to be done before they can be done.
+	 */
+	for (; pio != NULL; pio = pio_next) {
+		pio_next = zio_walk_parents(zio);
+		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+	}
+
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
+		if (BP_IS_GANG(bp)) {
+			zio->io_flags &= ~ZIO_FLAG_NODATA;
+		} else {
+			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+		}
+	}
+
+	if (zio_injection_enabled &&
+	    zio->io_spa->spa_syncing_txg == zio->io_txg)
+		zio_handle_ignored_writes(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_done(zio_t *zio)
+{
+	zio_t *pio, *pio_next;
+	int c, w;
+
+	/*
+	 * If our children haven't all completed,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+		return (ZIO_PIPELINE_STOP);
+
+	for (c = 0; c < ZIO_CHILD_TYPES; c++)
+		for (w = 0; w < ZIO_WAIT_TYPES; w++)
+			ASSERT(zio->io_children[c][w] == 0);
+
+	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
+		ASSERT(zio->io_bp->blk_pad[0] == 0);
+		ASSERT(zio->io_bp->blk_pad[1] == 0);
+		ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
+		    sizeof (blkptr_t)) == 0 ||
+		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
+		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
+		    zio->io_bp_override == NULL &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+			ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp));
+			ASSERT3U(zio->io_prop.zp_copies, <=,
+			    BP_GET_NDVAS(zio->io_bp));
+			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+			    (BP_COUNT_GANG(zio->io_bp) ==
+			    BP_GET_NDVAS(zio->io_bp)));
+		}
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+	}
+
+	/*
+	 * If there were child vdev/gang/ddt errors, they apply to us now.
+	 */
+	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
+	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+	/*
+	 * If the I/O on the transformed data was successful, generate any
+	 * checksum reports now while we still have the transformed data.
+	 */
+	if (zio->io_error == 0) {
+		while (zio->io_cksum_report != NULL) {
+			zio_cksum_report_t *zcr = zio->io_cksum_report;
+			uint64_t align = zcr->zcr_align;
+			uint64_t asize = P2ROUNDUP(zio->io_size, align);
+			char *abuf = zio->io_data;
+
+			if (asize != zio->io_size) {
+				abuf = zio_buf_alloc(asize);
+				bcopy(zio->io_data, abuf, zio->io_size);
+				bzero(abuf+zio->io_size, asize-zio->io_size);
+			}
+
+			zio->io_cksum_report = zcr->zcr_next;
+			zcr->zcr_next = NULL;
+			zcr->zcr_finish(zcr, abuf);
+			zfs_ereport_free_checksum(zcr);
+
+			if (asize != zio->io_size)
+				zio_buf_free(abuf, asize);
+		}
+	}
+
+	zio_pop_transforms(zio);	/* note: may set zio->io_error */
+
+	vdev_stat_update(zio, zio->io_size);
+
+	/*
+	 * If this I/O is attached to a particular vdev is slow, exceeding
+	 * 30 seconds to complete, post an error described the I/O delay.
+	 * We ignore these errors if the device is currently unavailable.
+	 */
+	if (zio->io_delay >= MSEC_TO_TICK(zio_delay_max)) {
+		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa,
+			    zio->io_vd, zio, 0, 0);
+	}
+
+	if (zio->io_error) {
+		/*
+		 * If this I/O is attached to a particular vdev,
+		 * generate an error message describing the I/O failure
+		 * at the block level.  We ignore these errors if the
+		 * device is currently unavailable.
+		 */
+		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+			!vdev_is_dead(zio->io_vd))
+			zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa,
+						zio->io_vd, zio, 0, 0);
+
+		if ((zio->io_error == EIO || !(zio->io_flags &
+		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+		    zio == zio->io_logical) {
+			/*
+			 * For logical I/O requests, tell the SPA to log the
+			 * error and generate a logical data ereport.
+			 */
+			spa_log_error(zio->io_spa, zio);
+			zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa,
+			    NULL, zio, 0, 0);
+		}
+	}
+
+	if (zio->io_error && zio == zio->io_logical) {
+		/*
+		 * Determine whether zio should be reexecuted.  This will
+		 * propagate all the way to the root via zio_notify_parent().
+		 */
+		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+		if (IO_IS_ALLOCATING(zio) &&
+		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+			if (zio->io_error != ENOSPC)
+				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+			else
+				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+		}
+
+		if ((zio->io_type == ZIO_TYPE_READ ||
+		    zio->io_type == ZIO_TYPE_FREE) &&
+		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
+		    zio->io_error == ENXIO &&
+		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
+			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
+			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+		/*
+		 * Here is a possibly good place to attempt to do
+		 * either combinatorial reconstruction or error correction
+		 * based on checksums.  It also might be a good place
+		 * to send out preliminary ereports before we suspend
+		 * processing.
+		 */
+	}
+
+	/*
+	 * If there were logical child errors, they apply to us now.
+	 * We defer this until now to avoid conflating logical child
+	 * errors with errors that happened to the zio itself when
+	 * updating vdev stats and reporting FMA events above.
+	 */
+	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
+
+	if ((zio->io_error || zio->io_reexecute) &&
+	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
+		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
+
+	zio_gang_tree_free(&zio->io_gang_tree);
+
+	/*
+	 * Godfather I/Os should never suspend.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+		zio->io_reexecute = 0;
+
+	if (zio->io_reexecute) {
+		/*
+		 * This is a logical I/O that wants to reexecute.
+		 *
+		 * Reexecute is top-down.  When an i/o fails, if it's not
+		 * the root, it simply notifies its parent and sticks around.
+		 * The parent, seeing that it still has children in zio_done(),
+		 * does the same.  This percolates all the way up to the root.
+		 * The root i/o will reexecute or suspend the entire tree.
+		 *
+		 * This approach ensures that zio_reexecute() honors
+		 * all the original i/o dependency relationships, e.g.
+		 * parents not executing until children are ready.
+		 */
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+		zio->io_gang_leader = NULL;
+
+		mutex_enter(&zio->io_lock);
+		zio->io_state[ZIO_WAIT_DONE] = 1;
+		mutex_exit(&zio->io_lock);
+
+		/*
+		 * "The Godfather" I/O monitors its children but is
+		 * not a true parent to them. It will track them through
+		 * the pipeline but severs its ties whenever they get into
+		 * trouble (e.g. suspended). This allows "The Godfather"
+		 * I/O to return status without blocking.
+		 */
+		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+			zio_link_t *zl = zio->io_walk_link;
+			pio_next = zio_walk_parents(zio);
+
+			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+				zio_remove_child(pio, zio, zl);
+				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+			}
+		}
+
+		if ((pio = zio_unique_parent(zio)) != NULL) {
+			/*
+			 * We're not a root i/o, so there's nothing to do
+			 * but notify our parent.  Don't propagate errors
+			 * upward since we haven't permanently failed yet.
+			 */
+			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
+			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+			/*
+			 * We'd fail again if we reexecuted now, so suspend
+			 * until conditions improve (e.g. device comes online).
+			 */
+			zio_suspend(zio->io_spa, zio);
+		} else {
+			/*
+			 * Reexecution is potentially a huge amount of work.
+			 * Hand it off to the otherwise-unused claim taskq.
+			 */
+			ASSERT(taskq_empty_ent(&zio->io_tqent));
+			spa_taskq_dispatch_ent(zio->io_spa,
+			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
+			    (task_func_t *)zio_reexecute, zio, 0,
+			    &zio->io_tqent);
+		}
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	ASSERT(zio->io_child_count == 0);
+	ASSERT(zio->io_reexecute == 0);
+	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
+
+	/*
+	 * Report any checksum errors, since the I/O is complete.
+	 */
+	while (zio->io_cksum_report != NULL) {
+		zio_cksum_report_t *zcr = zio->io_cksum_report;
+		zio->io_cksum_report = zcr->zcr_next;
+		zcr->zcr_next = NULL;
+		zcr->zcr_finish(zcr, NULL);
+		zfs_ereport_free_checksum(zcr);
+	}
+
+	if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
+	    !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
+	    !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
+		metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
+	}
+
+	/*
+	 * It is the responsibility of the done callback to ensure that this
+	 * particular zio is no longer discoverable for adoption, and as
+	 * such, cannot acquire any new parents.
+	 */
+	if (zio->io_done)
+		zio->io_done(zio);
+
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_DONE] = 1;
+	mutex_exit(&zio->io_lock);
+
+	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+		zio_link_t *zl = zio->io_walk_link;
+		pio_next = zio_walk_parents(zio);
+		zio_remove_child(pio, zio, zl);
+		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+	}
+
+	if (zio->io_waiter != NULL) {
+		mutex_enter(&zio->io_lock);
+		zio->io_executor = NULL;
+		cv_broadcast(&zio->io_cv);
+		mutex_exit(&zio->io_lock);
+	} else {
+		zio_destroy(zio);
+	}
+
+	return (ZIO_PIPELINE_STOP);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline definition
+ * ==========================================================================
+ */
+static zio_pipe_stage_t *zio_pipeline[] = {
+	NULL,
+	zio_read_bp_init,
+	zio_free_bp_init,
+	zio_issue_async,
+	zio_write_bp_init,
+	zio_checksum_generate,
+	zio_nop_write,
+	zio_ddt_read_start,
+	zio_ddt_read_done,
+	zio_ddt_write,
+	zio_ddt_free,
+	zio_gang_assemble,
+	zio_gang_issue,
+	zio_dva_allocate,
+	zio_dva_free,
+	zio_dva_claim,
+	zio_ready,
+	zio_vdev_io_start,
+	zio_vdev_io_done,
+	zio_vdev_io_assess,
+	zio_checksum_verify,
+	zio_done
+};
+
+/* dnp is the dnode for zb1->zb_object */
+boolean_t
+zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
+    const zbookmark_phys_t *zb2)
+{
+	uint64_t zb1nextL0, zb2thisobj;
+
+	ASSERT(zb1->zb_objset == zb2->zb_objset);
+	ASSERT(zb2->zb_level == 0);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	zb1nextL0 = (zb1->zb_blkid + 1) <<
+	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+	zb2thisobj = zb2->zb_object ? zb2->zb_object :
+	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
+
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t nextobj = zb1nextL0 *
+		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+		return (nextobj <= zb2thisobj);
+	}
+
+	if (zb1->zb_object < zb2thisobj)
+		return (B_TRUE);
+	if (zb1->zb_object > zb2thisobj)
+		return (B_FALSE);
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
+		return (B_FALSE);
+	return (zb1nextL0 <= zb2->zb_blkid);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+EXPORT_SYMBOL(zio_type_name);
+EXPORT_SYMBOL(zio_buf_alloc);
+EXPORT_SYMBOL(zio_data_buf_alloc);
+EXPORT_SYMBOL(zio_buf_free);
+EXPORT_SYMBOL(zio_data_buf_free);
+
+module_param(zio_delay_max, int, 0644);
+MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event");
+
+module_param(zio_requeue_io_start_cut_in_line, int, 0644);
+MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O");
+
+module_param(zfs_sync_pass_deferred_free, int, 0644);
+MODULE_PARM_DESC(zfs_sync_pass_deferred_free,
+	"Defer frees starting in this pass");
+
+module_param(zfs_sync_pass_dont_compress, int, 0644);
+MODULE_PARM_DESC(zfs_sync_pass_dont_compress,
+	"Don't compress starting in this pass");
+
+module_param(zfs_sync_pass_rewrite, int, 0644);
+MODULE_PARM_DESC(zfs_sync_pass_rewrite,
+	"Rewrite new bps starting in this pass");
+#endif
diff --git a/module/zfs/zvol.c.orig b/module/zfs/zvol.c.orig
new file mode 100644
index 000000000000..574d3c306bc9
--- /dev/null
+++ b/module/zfs/zvol.c.orig
@@ -0,0 +1,1645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ *
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot and module load.  No user command
+ * needs to be run before opening and using a device.
+ */
+
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+#include <linux/blkdev_compat.h>
+
+unsigned int zvol_inhibit_dev = 0;
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+
+static kmutex_t zvol_state_lock;
+static list_t zvol_state_list;
+static char *zvol_tag = "zvol_tag";
+
+/*
+ * The in-core state of each volume.
+ */
+typedef struct zvol_state {
+	char			zv_name[MAXNAMELEN];	/* name */
+	uint64_t		zv_volsize;		/* advertised space */
+	uint64_t		zv_volblocksize;	/* volume block size */
+	objset_t		*zv_objset;	/* objset handle */
+	uint32_t		zv_flags;	/* ZVOL_* flags */
+	uint32_t		zv_open_count;	/* open counts */
+	uint32_t		zv_changed;	/* disk changed */
+	zilog_t			*zv_zilog;	/* ZIL handle */
+	znode_t			zv_znode;	/* for range locking */
+	dmu_buf_t		*zv_dbuf;	/* bonus handle */
+	dev_t			zv_dev;		/* device id */
+	struct gendisk		*zv_disk;	/* generic disk */
+	struct request_queue	*zv_queue;	/* request queue */
+	list_node_t		zv_next;	/* next zvol_state_t linkage */
+} zvol_state_t;
+
+#define	ZVOL_RDONLY	0x1
+
+/*
+ * Find the next available range of ZVOL_MINORS minor numbers.  The
+ * zvol_state_list is kept in ascending minor order so we simply need
+ * to scan the list for the first gap in the sequence.  This allows us
+ * to recycle minor number as devices are created and removed.
+ */
+static int
+zvol_find_minor(unsigned *minor)
+{
+	zvol_state_t *zv;
+
+	*minor = 0;
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) {
+		if (MINOR(zv->zv_dev) != MINOR(*minor))
+			break;
+	}
+
+	/* All minors are in use */
+	if (*minor >= (1 << MINORBITS))
+		return (SET_ERROR(ENXIO));
+
+	return (0);
+}
+
+/*
+ * Find a zvol_state_t given the full major+minor dev_t.
+ */
+static zvol_state_t *
+zvol_find_by_dev(dev_t dev)
+{
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv)) {
+		if (zv->zv_dev == dev)
+			return (zv);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Find a zvol_state_t given the name provided at zvol_alloc() time.
+ */
+static zvol_state_t *
+zvol_find_by_name(const char *name)
+{
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv)) {
+		if (strncmp(zv->zv_name, name, MAXNAMELEN) == 0)
+			return (zv);
+	}
+
+	return (NULL);
+}
+
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+boolean_t
+zvol_is_zvol(const char *device)
+{
+	struct block_device *bdev;
+	unsigned int major;
+
+	bdev = lookup_bdev(device);
+	if (IS_ERR(bdev))
+		return (B_FALSE);
+
+	major = MAJOR(bdev->bd_dev);
+	bdput(bdev);
+
+	if (major == zvol_major)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
+ */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	zfs_creat_t *zct = arg;
+	nvlist_t *nvprops = zct->zct_props;
+	int error;
+	uint64_t volblocksize, volsize;
+
+	VERIFY(nvlist_lookup_uint64(nvprops,
+	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+	if (nvlist_lookup_uint64(nvprops,
+	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+	/*
+	 * These properties must be removed from the list so the generic
+	 * property setting step won't apply to them.
+	 */
+	VERIFY(nvlist_remove_all(nvprops,
+	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+	(void) nvlist_remove_all(nvprops,
+	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+	    DMU_OT_NONE, 0, tx);
+	ASSERT(error == 0);
+
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+	ASSERT(error == 0);
+}
+
+/*
+ * ZFS_IOC_OBJSET_STATS entry point.
+ */
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+	int error;
+	dmu_object_info_t *doi;
+	uint64_t val;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+	if (error)
+		return (SET_ERROR(error));
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+
+	if (error == 0) {
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+		    doi->doi_data_block_size);
+	}
+
+	kmem_free(doi, sizeof (dmu_object_info_t));
+
+	return (SET_ERROR(error));
+}
+
+static void
+zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
+{
+	struct block_device *bdev;
+
+	bdev = bdget_disk(zv->zv_disk, 0);
+	if (bdev == NULL)
+		return;
+/*
+ * 2.6.28 API change
+ * Added check_disk_size_change() helper function.
+ */
+#ifdef HAVE_CHECK_DISK_SIZE_CHANGE
+	set_capacity(zv->zv_disk, volsize >> 9);
+	zv->zv_volsize = volsize;
+	check_disk_size_change(zv->zv_disk, bdev);
+#else
+	zv->zv_volsize = volsize;
+	zv->zv_changed = 1;
+	(void) check_disk_change(bdev);
+#endif /* HAVE_CHECK_DISK_SIZE_CHANGE */
+
+	bdput(bdev);
+}
+
+/*
+ * Sanity check volume size.
+ */
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+	if (volsize == 0)
+		return (SET_ERROR(EINVAL));
+
+	if (volsize % blocksize != 0)
+		return (SET_ERROR(EINVAL));
+
+#ifdef _ILP32
+	if (volsize - 1 > MAXOFFSET_T)
+		return (SET_ERROR(EOVERFLOW));
+#endif
+	return (0);
+}
+
+/*
+ * Ensure the zap is flushed then inform the VFS of the capacity change.
+ */
+static int
+zvol_update_volsize(uint64_t volsize, objset_t *os)
+{
+	dmu_tx_t *tx;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (SET_ERROR(error));
+	}
+
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
+	    &volsize, tx);
+	dmu_tx_commit(tx);
+
+	if (error == 0)
+		error = dmu_free_long_range(os,
+		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
+
+	return (error);
+}
+
+static int
+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+	zvol_size_changed(zv, volsize);
+
+	/*
+	 * We should post a event here describing the expansion.  However,
+	 * the zfs_ereport_post() interface doesn't nicely support posting
+	 * events for zvols, it assumes events relate to vdevs or zios.
+	 */
+
+	return (0);
+}
+
+/*
+ * Set ZFS_PROP_VOLSIZE set entry point.
+ */
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+	zvol_state_t *zv = NULL;
+	objset_t *os = NULL;
+	int error;
+	dmu_object_info_t *doi;
+	uint64_t readonly;
+	boolean_t owned = B_FALSE;
+
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+	if (error != 0)
+		return (SET_ERROR(error));
+	if (readonly)
+		return (SET_ERROR(EROFS));
+
+	mutex_enter(&zvol_state_lock);
+	zv = zvol_find_by_name(name);
+
+	if (zv == NULL || zv->zv_objset == NULL) {
+		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+		    FTAG, &os)) != 0) {
+			mutex_exit(&zvol_state_lock);
+			return (SET_ERROR(error));
+		}
+		owned = B_TRUE;
+		if (zv != NULL)
+			zv->zv_objset = os;
+	} else {
+		os = zv->zv_objset;
+	}
+
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
+	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
+		goto out;
+
+	error = zvol_update_volsize(volsize, os);
+	kmem_free(doi, sizeof (dmu_object_info_t));
+
+	if (error == 0 && zv != NULL)
+		error = zvol_update_live_volsize(zv, volsize);
+out:
+	if (owned) {
+		dmu_objset_disown(os, FTAG);
+		if (zv != NULL)
+			zv->zv_objset = NULL;
+	}
+	mutex_exit(&zvol_state_lock);
+	return (error);
+}
+
+/*
+ * Sanity check volume block size.
+ */
+int
+zvol_check_volblocksize(const char *name, uint64_t volblocksize)
+{
+	/* Record sizes above 128k need the feature to be enabled */
+	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
+		spa_t *spa;
+		int error;
+
+		if ((error = spa_open(name, &spa, FTAG)) != 0)
+			return (error);
+
+		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ENOTSUP));
+		}
+
+		/*
+		 * We don't allow setting the property above 1MB,
+		 * unless the tunable has been changed.
+		 */
+		if (volblocksize > zfs_max_recordsize)
+			return (SET_ERROR(EDOM));
+
+		spa_close(spa, FTAG);
+	}
+
+	if (volblocksize < SPA_MINBLOCKSIZE ||
+	    volblocksize > SPA_MAXBLOCKSIZE ||
+	    !ISP2(volblocksize))
+		return (SET_ERROR(EDOM));
+
+	return (0);
+}
+
+/*
+ * Set ZFS_PROP_VOLBLOCKSIZE set entry point.
+ */
+int
+zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+{
+	zvol_state_t *zv;
+	dmu_tx_t *tx;
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+
+	zv = zvol_find_by_name(name);
+	if (zv == NULL) {
+		error = SET_ERROR(ENXIO);
+		goto out;
+	}
+
+	if (zv->zv_flags & ZVOL_RDONLY) {
+		error = SET_ERROR(EROFS);
+		goto out;
+	}
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+	} else {
+		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+		    volblocksize, 0, tx);
+		if (error == ENOTSUP)
+			error = SET_ERROR(EBUSY);
+		dmu_tx_commit(tx);
+		if (error == 0)
+			zv->zv_volblocksize = volblocksize;
+	}
+out:
+	mutex_exit(&zvol_state_lock);
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure
+ */
+static int
+zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
+{
+	objset_t *os = zv->zv_objset;
+	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
+	uint64_t off = lr->lr_offset;
+	uint64_t len = lr->lr_length;
+	dmu_tx_t *tx;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+	} else {
+		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+		dmu_tx_commit(tx);
+	}
+
+	return (SET_ERROR(error));
+}
+
+static int
+zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
+{
+	return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE is needed for zvol.
+ */
+zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = {
+	(zil_replay_func_t)zvol_replay_err,	/* no such transaction type */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_CREATE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_MKDIR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_MKXATTR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_SYMLINK */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_REMOVE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_RMDIR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_LINK */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_RENAME */
+	(zil_replay_func_t)zvol_replay_write,	/* TX_WRITE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_TRUNCATE */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_SETATTR */
+	(zil_replay_func_t)zvol_replay_err,	/* TX_ACL */
+};
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+static void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
+    uint64_t size, int sync)
+{
+	uint32_t blocksize = zv->zv_volblocksize;
+	zilog_t *zilog = zv->zv_zilog;
+	boolean_t slogging;
+	ssize_t immediate_write_sz;
+
+	if (zil_replaying(zilog, tx))
+		return;
+
+	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		? 0 : zvol_immediate_write_sz;
+	slogging = spa_has_slogs(zilog->zl_spa) &&
+		(zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+
+	while (size) {
+		itx_t *itx;
+		lr_write_t *lr;
+		ssize_t len;
+		itx_wr_state_t write_state;
+
+		/*
+		 * Unlike zfs_log_write() we can be called with
+		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
+		 */
+		if (blocksize > immediate_write_sz && !slogging &&
+		    size >= blocksize && offset % blocksize == 0) {
+			write_state = WR_INDIRECT; /* uses dmu_sync */
+			len = blocksize;
+		} else if (sync) {
+			write_state = WR_COPIED;
+			len = MIN(ZIL_MAX_LOG_DATA, size);
+		} else {
+			write_state = WR_NEED_COPY;
+			len = MIN(ZIL_MAX_LOG_DATA, size);
+		}
+
+		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+		    (write_state == WR_COPIED ? len : 0));
+		lr = (lr_write_t *)&itx->itx_lr;
+		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		    ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
+			zil_itx_destroy(itx);
+			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+			lr = (lr_write_t *)&itx->itx_lr;
+			write_state = WR_NEED_COPY;
+		}
+
+		itx->itx_wr_state = write_state;
+		if (write_state == WR_NEED_COPY)
+			itx->itx_sod += len;
+		lr->lr_foid = ZVOL_OBJ;
+		lr->lr_offset = offset;
+		lr->lr_length = len;
+		lr->lr_blkoff = 0;
+		BP_ZERO(&lr->lr_blkptr);
+
+		itx->itx_private = zv;
+		itx->itx_sync = sync;
+
+		(void) zil_itx_assign(zilog, itx, tx);
+
+		offset += len;
+		size -= len;
+	}
+}
+
+static int
+zvol_write(struct bio *bio)
+{
+	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	int error = 0;
+	dmu_tx_t *tx;
+	rl_t *rl;
+
+	if (bio->bi_rw & VDEV_REQ_FLUSH)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+	/*
+	 * Some requests are just for flush and nothing else.
+	 */
+	if (size == 0)
+		goto out;
+
+	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER);
+
+	tx = dmu_tx_create(zv->zv_objset);
+	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size);
+
+	/* This will only fail for ENOSPC */
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_range_unlock(rl);
+		goto out;
+	}
+
+	error = dmu_write_bio(zv->zv_objset, ZVOL_OBJ, bio, tx);
+	if (error == 0)
+		zvol_log_write(zv, tx, offset, size,
+		    !!(bio->bi_rw & VDEV_REQ_FUA));
+
+	dmu_tx_commit(tx);
+	zfs_range_unlock(rl);
+
+	if ((bio->bi_rw & VDEV_REQ_FUA) ||
+	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+out:
+	return (error);
+}
+
+static int
+zvol_discard(struct bio *bio)
+{
+	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+	uint64_t start = BIO_BI_SECTOR(bio) << 9;
+	uint64_t size = BIO_BI_SIZE(bio);
+	uint64_t end = start + size;
+	int error;
+	rl_t *rl;
+
+	if (end > zv->zv_volsize)
+		return (SET_ERROR(EIO));
+
+	/*
+	 * Align the request to volume block boundaries when REQ_SECURE is
+	 * available, but not requested. If we don't, then this will force
+	 * dnode_free_range() to zero out the unaligned parts, which is slow
+	 * (read-modify-write) and useless since we are not freeing any space
+	 * by doing so. Kernels that do not support REQ_SECURE (2.6.32 through
+	 * 2.6.35) will not receive this optimization.
+	 */
+#ifdef REQ_SECURE
+	if (!(bio->bi_rw & REQ_SECURE)) {
+		start = P2ROUNDUP(start, zv->zv_volblocksize);
+		end = P2ALIGN(end, zv->zv_volblocksize);
+		size = end - start;
+	}
+#endif
+
+	if (start >= end)
+		return (0);
+
+	rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
+
+	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size);
+
+	/*
+	 * TODO: maybe we should add the operation to the log.
+	 */
+
+	zfs_range_unlock(rl);
+
+	return (error);
+}
+
+static int
+zvol_read(struct bio *bio)
+{
+	zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
+	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+	uint64_t len = BIO_BI_SIZE(bio);
+	int error;
+	rl_t *rl;
+
+	if (len == 0)
+		return (0);
+
+
+	rl = zfs_range_lock(&zv->zv_znode, offset, len, RL_READER);
+
+	error = dmu_read_bio(zv->zv_objset, ZVOL_OBJ, bio);
+
+	zfs_range_unlock(rl);
+
+	/* convert checksum errors into IO errors */
+	if (error == ECKSUM)
+		error = SET_ERROR(EIO);
+
+	return (error);
+}
+
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+{
+	zvol_state_t *zv = q->queuedata;
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+	uint64_t offset = BIO_BI_SECTOR(bio);
+	unsigned int sectors = bio_sectors(bio);
+	int rw = bio_data_dir(bio);
+#ifdef HAVE_GENERIC_IO_ACCT
+	unsigned long start = jiffies;
+#endif
+	int error = 0;
+
+	if (bio_has_data(bio) && offset + sectors >
+	    get_capacity(zv->zv_disk)) {
+		printk(KERN_INFO
+		    "%s: bad access: block=%llu, count=%lu\n",
+		    zv->zv_disk->disk_name,
+		    (long long unsigned)offset,
+		    (long unsigned)sectors);
+		error = SET_ERROR(EIO);
+		goto out1;
+	}
+
+	generic_start_io_acct(rw, sectors, &zv->zv_disk->part0);
+
+	if (rw == WRITE) {
+		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+			error = SET_ERROR(EROFS);
+			goto out2;
+		}
+
+		if (bio->bi_rw & VDEV_REQ_DISCARD) {
+			error = zvol_discard(bio);
+			goto out2;
+		}
+
+		error = zvol_write(bio);
+	} else
+		error = zvol_read(bio);
+
+out2:
+	generic_end_io_acct(rw, &zv->zv_disk->part0, start);
+out1:
+	BIO_END_IO(bio, -error);
+	spl_fstrans_unmark(cookie);
+#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
+	return (0);
+#elif defined(HAVE_MAKE_REQUEST_FN_RET_QC)
+	return (BLK_QC_T_NONE);
+#endif
+}
+
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_range_unlock(zgd->zgd_rl);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+static int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+	zvol_state_t *zv = arg;
+	objset_t *os = zv->zv_objset;
+	uint64_t object = ZVOL_OBJ;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	blkptr_t *bp = &lr->lr_blkptr;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ASSERT(zio != NULL);
+	ASSERT(size != 0);
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_zilog = zv->zv_zilog;
+	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+	} else {
+		size = zv->zv_volblocksize;
+		offset = P2ALIGN_TYPED(offset, size, uint64_t);
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = &lr->lr_blkptr;
+
+			ASSERT(db != NULL);
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zvol_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	zvol_get_done(zgd, error);
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * The zvol_state_t's are inserted in increasing MINOR(dev_t) order.
+ */
+static void
+zvol_insert(zvol_state_t *zv_insert)
+{
+	zvol_state_t *zv = NULL;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv)) {
+		if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev))
+			break;
+	}
+
+	list_insert_before(&zvol_state_list, zv, zv_insert);
+}
+
+/*
+ * Simply remove the zvol from to list of zvols.
+ */
+static void
+zvol_remove(zvol_state_t *zv_remove)
+{
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	list_remove(&zvol_state_list, zv_remove);
+}
+
+static int
+zvol_first_open(zvol_state_t *zv)
+{
+	objset_t *os;
+	uint64_t volsize;
+	int locked = 0;
+	int error;
+	uint64_t ro;
+
+	/*
+	 * In all other cases the spa_namespace_lock is taken before the
+	 * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
+	 * function calls fops->open() with the bdev->bd_mutex lock held.
+	 *
+	 * To avoid a potential lock inversion deadlock we preemptively
+	 * try to take the spa_namespace_lock().  Normally it will not
+	 * be contended and this is safe because spa_open_common() handles
+	 * the case where the caller already holds the spa_namespace_lock.
+	 *
+	 * When it is contended we risk a lock inversion if we were to
+	 * block waiting for the lock.  Luckily, the __blkdev_get()
+	 * function allows us to return -ERESTARTSYS which will result in
+	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+	 * called again.  This process can be repeated safely until both
+	 * locks are acquired.
+	 */
+	if (!mutex_owned(&spa_namespace_lock)) {
+		locked = mutex_tryenter(&spa_namespace_lock);
+		if (!locked)
+			return (-SET_ERROR(ERESTARTSYS));
+	}
+
+	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+	if (error)
+		goto out_mutex;
+
+	/* lie and say we're read-only */
+	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os);
+	if (error)
+		goto out_mutex;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error) {
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
+	}
+
+	zv->zv_objset = os;
+	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+	if (error) {
+		dmu_objset_disown(os, zvol_tag);
+		goto out_mutex;
+	}
+
+	set_capacity(zv->zv_disk, volsize >> 9);
+	zv->zv_volsize = volsize;
+	zv->zv_zilog = zil_open(os, zvol_get_data);
+
+	if (ro || dmu_objset_is_snapshot(os) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
+		set_disk_ro(zv->zv_disk, 1);
+		zv->zv_flags |= ZVOL_RDONLY;
+	} else {
+		set_disk_ro(zv->zv_disk, 0);
+		zv->zv_flags &= ~ZVOL_RDONLY;
+	}
+
+out_mutex:
+	if (locked)
+		mutex_exit(&spa_namespace_lock);
+
+	return (SET_ERROR(-error));
+}
+
+static void
+zvol_last_close(zvol_state_t *zv)
+{
+	zil_close(zv->zv_zilog);
+	zv->zv_zilog = NULL;
+
+	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
+	zv->zv_dbuf = NULL;
+
+	/*
+	 * Evict cached data
+	 */
+	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+	    !(zv->zv_flags & ZVOL_RDONLY))
+		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	(void) dmu_objset_evict_dbufs(zv->zv_objset);
+
+	dmu_objset_disown(zv->zv_objset, zvol_tag);
+	zv->zv_objset = NULL;
+}
+
+static int
+zvol_open(struct block_device *bdev, fmode_t flag)
+{
+	zvol_state_t *zv = bdev->bd_disk->private_data;
+	int error = 0, drop_mutex = 0;
+
+	/*
+	 * If the caller is already holding the mutex do not take it
+	 * again, this will happen as part of zvol_create_minor().
+	 * Once add_disk() is called the device is live and the kernel
+	 * will attempt to open it to read the partition information.
+	 */
+	if (!mutex_owned(&zvol_state_lock)) {
+		mutex_enter(&zvol_state_lock);
+		drop_mutex = 1;
+	}
+
+	ASSERT3P(zv, !=, NULL);
+
+	if (zv->zv_open_count == 0) {
+		error = zvol_first_open(zv);
+		if (error)
+			goto out_mutex;
+	}
+
+	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		error = -EROFS;
+		goto out_open_count;
+	}
+
+	zv->zv_open_count++;
+
+out_open_count:
+	if (zv->zv_open_count == 0)
+		zvol_last_close(zv);
+
+out_mutex:
+	if (drop_mutex)
+		mutex_exit(&zvol_state_lock);
+
+	check_disk_change(bdev);
+
+	return (SET_ERROR(error));
+}
+
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+static void
+#else
+static int
+#endif
+zvol_release(struct gendisk *disk, fmode_t mode)
+{
+	zvol_state_t *zv = disk->private_data;
+	int drop_mutex = 0;
+
+	if (!mutex_owned(&zvol_state_lock)) {
+		mutex_enter(&zvol_state_lock);
+		drop_mutex = 1;
+	}
+
+	if (zv->zv_open_count > 0) {
+		zv->zv_open_count--;
+		if (zv->zv_open_count == 0)
+			zvol_last_close(zv);
+	}
+
+	if (drop_mutex)
+		mutex_exit(&zvol_state_lock);
+
+#ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
+	return (0);
+#endif
+}
+
+static int
+zvol_ioctl(struct block_device *bdev, fmode_t mode,
+    unsigned int cmd, unsigned long arg)
+{
+	zvol_state_t *zv = bdev->bd_disk->private_data;
+	int error = 0;
+
+	if (zv == NULL)
+		return (SET_ERROR(-ENXIO));
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		break;
+	case BLKZNAME:
+		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+		break;
+
+	default:
+		error = -ENOTTY;
+		break;
+
+	}
+
+	return (SET_ERROR(error));
+}
+
+#ifdef CONFIG_COMPAT
+static int
+zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+    unsigned cmd, unsigned long arg)
+{
+	return (zvol_ioctl(bdev, mode, cmd, arg));
+}
+#else
+#define	zvol_compat_ioctl	NULL
+#endif
+
+static int zvol_media_changed(struct gendisk *disk)
+{
+	zvol_state_t *zv = disk->private_data;
+
+	return (zv->zv_changed);
+}
+
+static int zvol_revalidate_disk(struct gendisk *disk)
+{
+	zvol_state_t *zv = disk->private_data;
+
+	zv->zv_changed = 0;
+	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+
+	return (0);
+}
+
+/*
+ * Provide a simple virtual geometry for legacy compatibility.  For devices
+ * smaller than 1 MiB a small head and sector count is used to allow very
+ * tiny devices.  For devices over 1 Mib a standard head and sector count
+ * is used to keep the cylinders count reasonable.
+ */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	zvol_state_t *zv = bdev->bd_disk->private_data;
+	sector_t sectors = get_capacity(zv->zv_disk);
+
+	if (sectors > 2048) {
+		geo->heads = 16;
+		geo->sectors = 63;
+	} else {
+		geo->heads = 2;
+		geo->sectors = 4;
+	}
+
+	geo->start = 0;
+	geo->cylinders = sectors / (geo->heads * geo->sectors);
+
+	return (0);
+}
+
+static struct kobject *
+zvol_probe(dev_t dev, int *part, void *arg)
+{
+	zvol_state_t *zv;
+	struct kobject *kobj;
+
+	mutex_enter(&zvol_state_lock);
+	zv = zvol_find_by_dev(dev);
+	kobj = zv ? get_disk(zv->zv_disk) : NULL;
+	mutex_exit(&zvol_state_lock);
+
+	return (kobj);
+}
+
+#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
+static struct block_device_operations zvol_ops = {
+	.open			= zvol_open,
+	.release		= zvol_release,
+	.ioctl			= zvol_ioctl,
+	.compat_ioctl		= zvol_compat_ioctl,
+	.media_changed		= zvol_media_changed,
+	.revalidate_disk	= zvol_revalidate_disk,
+	.getgeo			= zvol_getgeo,
+	.owner			= THIS_MODULE,
+};
+
+#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
+
+static int
+zvol_open_by_inode(struct inode *inode, struct file *file)
+{
+	return (zvol_open(inode->i_bdev, file->f_mode));
+}
+
+static int
+zvol_release_by_inode(struct inode *inode, struct file *file)
+{
+	return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
+}
+
+static int
+zvol_ioctl_by_inode(struct inode *inode, struct file *file,
+    unsigned int cmd, unsigned long arg)
+{
+	if (file == NULL || inode == NULL)
+		return (SET_ERROR(-EINVAL));
+
+	return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zvol_compat_ioctl_by_inode(struct file *file,
+    unsigned int cmd, unsigned long arg)
+{
+	if (file == NULL)
+		return (SET_ERROR(-EINVAL));
+
+	return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
+	    file->f_mode, cmd, arg));
+}
+#else
+#define	zvol_compat_ioctl_by_inode	NULL
+#endif
+
+static struct block_device_operations zvol_ops = {
+	.open			= zvol_open_by_inode,
+	.release		= zvol_release_by_inode,
+	.ioctl			= zvol_ioctl_by_inode,
+	.compat_ioctl		= zvol_compat_ioctl_by_inode,
+	.media_changed		= zvol_media_changed,
+	.revalidate_disk	= zvol_revalidate_disk,
+	.getgeo			= zvol_getgeo,
+	.owner			= THIS_MODULE,
+};
+#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static zvol_state_t *
+zvol_alloc(dev_t dev, const char *name)
+{
+	zvol_state_t *zv;
+
+	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+
+	list_link_init(&zv->zv_next);
+
+	zv->zv_queue = blk_alloc_queue(GFP_ATOMIC);
+	if (zv->zv_queue == NULL)
+		goto out_kmem;
+
+	blk_queue_make_request(zv->zv_queue, zvol_request);
+
+#ifdef HAVE_BLK_QUEUE_FLUSH
+	blk_queue_flush(zv->zv_queue, VDEV_REQ_FLUSH | VDEV_REQ_FUA);
+#else
+	blk_queue_ordered(zv->zv_queue, QUEUE_ORDERED_DRAIN, NULL);
+#endif /* HAVE_BLK_QUEUE_FLUSH */
+
+	zv->zv_disk = alloc_disk(ZVOL_MINORS);
+	if (zv->zv_disk == NULL)
+		goto out_queue;
+
+	zv->zv_queue->queuedata = zv;
+	zv->zv_dev = dev;
+	zv->zv_open_count = 0;
+	strlcpy(zv->zv_name, name, MAXNAMELEN);
+
+	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
+	    sizeof (rl_t), offsetof(rl_t, r_node));
+	zv->zv_znode.z_is_zvol = TRUE;
+
+	zv->zv_disk->major = zvol_major;
+	zv->zv_disk->first_minor = (dev & MINORMASK);
+	zv->zv_disk->fops = &zvol_ops;
+	zv->zv_disk->private_data = zv;
+	zv->zv_disk->queue = zv->zv_queue;
+	snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d",
+	    ZVOL_DEV_NAME, (dev & MINORMASK));
+
+	return (zv);
+
+out_queue:
+	blk_cleanup_queue(zv->zv_queue);
+out_kmem:
+	kmem_free(zv, sizeof (zvol_state_t));
+
+	return (NULL);
+}
+
+/*
+ * Cleanup then free a zvol_state_t which was created by zvol_alloc().
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+	avl_destroy(&zv->zv_znode.z_range_avl);
+	mutex_destroy(&zv->zv_znode.z_range_lock);
+
+	del_gendisk(zv->zv_disk);
+	blk_cleanup_queue(zv->zv_queue);
+	put_disk(zv->zv_disk);
+
+	kmem_free(zv, sizeof (zvol_state_t));
+}
+
+static int
+__zvol_snapdev_hidden(const char *name)
+{
+	uint64_t snapdev;
+	char *parent;
+	char *atp;
+	int error = 0;
+
+	parent = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	(void) strlcpy(parent, name, MAXPATHLEN);
+
+	if ((atp = strrchr(parent, '@')) != NULL) {
+		*atp = '\0';
+		error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL);
+		if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN))
+			error = SET_ERROR(ENODEV);
+	}
+
+	kmem_free(parent, MAXPATHLEN);
+
+	return (SET_ERROR(error));
+}
+
+static int
+__zvol_create_minor(const char *name, boolean_t ignore_snapdev)
+{
+	zvol_state_t *zv;
+	objset_t *os;
+	dmu_object_info_t *doi;
+	uint64_t volsize;
+	uint64_t len;
+	unsigned minor = 0;
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	zv = zvol_find_by_name(name);
+	if (zv) {
+		error = SET_ERROR(EEXIST);
+		goto out;
+	}
+
+	if (ignore_snapdev == B_FALSE) {
+		error = __zvol_snapdev_hidden(name);
+		if (error)
+			goto out;
+	}
+
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+	if (error)
+		goto out_doi;
+
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = zvol_find_minor(&minor);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	if (zv == NULL) {
+		error = SET_ERROR(EAGAIN);
+		goto out_dmu_objset_disown;
+	}
+
+	if (dmu_objset_is_snapshot(os))
+		zv->zv_flags |= ZVOL_RDONLY;
+
+	zv->zv_volblocksize = doi->doi_data_block_size;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+
+	set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+
+	blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9);
+	blk_queue_max_segments(zv->zv_queue, UINT16_MAX);
+	blk_queue_max_segment_size(zv->zv_queue, UINT_MAX);
+	blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize);
+	blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize);
+	blk_queue_max_discard_sectors(zv->zv_queue,
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+	blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue);
+#ifdef QUEUE_FLAG_NONROT
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue);
+#endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue);
+#endif
+
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+
+	/*
+	 * When udev detects the addition of the device it will immediately
+	 * invoke blkid(8) to determine the type of content on the device.
+	 * Prefetching the blocks commonly scanned by blkid(8) will speed
+	 * up this process.
+	 */
+	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+	if (len > 0) {
+		dmu_prefetch(os, ZVOL_OBJ, 0, len);
+		dmu_prefetch(os, ZVOL_OBJ, volsize - len, len);
+	}
+
+	zv->zv_objset = NULL;
+out_dmu_objset_disown:
+	dmu_objset_disown(os, zvol_tag);
+out_doi:
+	kmem_free(doi, sizeof (dmu_object_info_t));
+out:
+
+	if (error == 0) {
+		zvol_insert(zv);
+		add_disk(zv->zv_disk);
+	}
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume.  Once this function returns the block
+ * device is live and ready for use.
+ */
+int
+zvol_create_minor(const char *name)
+{
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+	error = __zvol_create_minor(name, B_FALSE);
+	mutex_exit(&zvol_state_lock);
+
+	return (SET_ERROR(error));
+}
+
+static int
+__zvol_remove_minor(const char *name)
+{
+	zvol_state_t *zv;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	zv = zvol_find_by_name(name);
+	if (zv == NULL)
+		return (SET_ERROR(ENXIO));
+
+	if (zv->zv_open_count > 0)
+		return (SET_ERROR(EBUSY));
+
+	zvol_remove(zv);
+	zvol_free(zv);
+
+	return (0);
+}
+
+/*
+ * Remove a block device minor node for the specified volume.
+ */
+int
+zvol_remove_minor(const char *name)
+{
+	int error;
+
+	mutex_enter(&zvol_state_lock);
+	error = __zvol_remove_minor(name);
+	mutex_exit(&zvol_state_lock);
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * Rename a block device minor mode for the specified volume.
+ */
+static void
+__zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+	int readonly = get_disk_ro(zv->zv_disk);
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+
+	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+
+	/*
+	 * The block device's read-only state is briefly changed causing
+	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
+	 * the name change and fixes the symlinks.  This does not change
+	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
+	 * changes.  This would normally be done using kobject_uevent() but
+	 * that is a GPL-only symbol which is why we need this workaround.
+	 */
+	set_disk_ro(zv->zv_disk, !readonly);
+	set_disk_ro(zv->zv_disk, readonly);
+}
+
+static int
+zvol_create_minors_cb(const char *dsname, void *arg)
+{
+	(void) zvol_create_minor(dsname);
+
+	return (0);
+}
+
+/*
+ * Create minors for specified dataset including children and snapshots.
+ */
+int
+zvol_create_minors(const char *name)
+{
+	int error = 0;
+
+	if (!zvol_inhibit_dev)
+		error = dmu_objset_find((char *)name, zvol_create_minors_cb,
+		    NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * Remove minors for specified dataset including children and snapshots.
+ */
+void
+zvol_remove_minors(const char *name)
+{
+	zvol_state_t *zv, *zv_next;
+	int namelen = ((name) ? strlen(name) : 0);
+
+	if (zvol_inhibit_dev)
+		return;
+
+	mutex_enter(&zvol_state_lock);
+
+	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&zvol_state_list, zv);
+
+		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
+		    (strncmp(zv->zv_name, name, namelen) == 0 &&
+		    zv->zv_name[namelen] == '/')) {
+			zvol_remove(zv);
+			zvol_free(zv);
+		}
+	}
+
+	mutex_exit(&zvol_state_lock);
+}
+
+/*
+ * Rename minors for specified dataset including children and snapshots.
+ */
+void
+zvol_rename_minors(const char *oldname, const char *newname)
+{
+	zvol_state_t *zv, *zv_next;
+	int oldnamelen, newnamelen;
+	char *name;
+
+	if (zvol_inhibit_dev)
+		return;
+
+	oldnamelen = strlen(oldname);
+	newnamelen = strlen(newname);
+	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+	mutex_enter(&zvol_state_lock);
+
+	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&zvol_state_list, zv);
+
+		if (strcmp(zv->zv_name, oldname) == 0) {
+			__zvol_rename_minor(zv, newname);
+		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+		    (zv->zv_name[oldnamelen] == '/' ||
+		    zv->zv_name[oldnamelen] == '@')) {
+			snprintf(name, MAXNAMELEN, "%s%c%s", newname,
+			    zv->zv_name[oldnamelen],
+			    zv->zv_name + oldnamelen + 1);
+			__zvol_rename_minor(zv, name);
+		}
+	}
+
+	mutex_exit(&zvol_state_lock);
+
+	kmem_free(name, MAXNAMELEN);
+}
+
+static int
+snapdev_snapshot_changed_cb(const char *dsname, void *arg) {
+	uint64_t snapdev = *(uint64_t *) arg;
+
+	if (strchr(dsname, '@') == NULL)
+		return (0);
+
+	switch (snapdev) {
+		case ZFS_SNAPDEV_VISIBLE:
+			mutex_enter(&zvol_state_lock);
+			(void) __zvol_create_minor(dsname, B_TRUE);
+			mutex_exit(&zvol_state_lock);
+			break;
+		case ZFS_SNAPDEV_HIDDEN:
+			(void) zvol_remove_minor(dsname);
+			break;
+	}
+
+	return (0);
+}
+
+int
+zvol_set_snapdev(const char *dsname, uint64_t snapdev) {
+	(void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb,
+		&snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+	/* caller should continue to modify snapdev property */
+	return (-1);
+}
+
+int
+zvol_init(void)
+{
+	int error;
+
+	list_create(&zvol_state_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_next));
+
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	error = register_blkdev(zvol_major, ZVOL_DRIVER);
+	if (error) {
+		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+		goto out;
+	}
+
+	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
+	    THIS_MODULE, zvol_probe, NULL, NULL);
+
+	return (0);
+
+out:
+	mutex_destroy(&zvol_state_lock);
+	list_destroy(&zvol_state_list);
+
+	return (SET_ERROR(error));
+}
+
+void
+zvol_fini(void)
+{
+	zvol_remove_minors(NULL);
+	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+	mutex_destroy(&zvol_state_lock);
+	list_destroy(&zvol_state_list);
+}
+
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
diff --git a/module/zfs/zvol.c.rej b/module/zfs/zvol.c.rej
new file mode 100644
index 000000000000..7db973a8b00b
--- /dev/null
+++ b/module/zfs/zvol.c.rej
@@ -0,0 +1,11 @@
+--- module/zfs/zvol.c
++++ module/zfs/zvol.c
+@@ -255,7 +255,7 @@ zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ 	zvol_extent_t *ze;
+ 	int bs = ma->ma_zv->zv_volblocksize;
+ 
+-	if (BP_IS_HOLE(bp) ||
++	if (bp == NULL || BP_IS_HOLE(bp) ||
+ 	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+ 		return (0);
+