From 9c89682218cd48165a0519aa14ef5f20743fca3f Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Mon, 27 Feb 2017 11:27:56 -0700 Subject: [PATCH] Port of draid from coral-beta-draid branch Up to and including b6ca80d723b4b9139d3133114a17d019149eb1a4 Signed-off-by: Isaac Huang --- .gitignore | 1 + cmd/Makefile.am | 2 +- cmd/draidcfg/.gitignore | 1 + cmd/draidcfg/Makefile.am | 20 + cmd/draidcfg/draid_permutation.c | 763 +++++++++++++++ cmd/draidcfg/draid_permutation.h | 41 + cmd/draidcfg/draidcfg.c | 343 +++++++ cmd/zdb/zdb.c | 3 +- cmd/zpool/zpool_main.c | 23 +- cmd/zpool/zpool_vdev.c | 91 +- configure.ac | 1 + include/libzfs.h | 6 +- include/sys/Makefile.am | 2 + include/sys/dsl_scan.h | 2 + include/sys/fs/zfs.h | 12 + include/sys/metaslab_impl.h | 1 + include/sys/nvpair.h | 1 + include/sys/spa_scan.h | 47 + include/sys/vdev.h | 1 + include/sys/vdev_draid_impl.h | 105 ++ include/sys/vdev_impl.h | 32 + include/sys/vdev_raidz_impl.h | 3 + lib/libzfs/libzfs_import.c | 72 +- lib/libzfs/libzfs_pool.c | 15 +- lib/libzpool/Makefile.am | 2 + module/nvpair/fnvpair.c | 19 +- module/zcommon/zfs_namecheck.c | 4 +- module/zfs/Makefile.in | 2 + module/zfs/dsl_scan.c | 101 +- module/zfs/metaslab.c | 74 +- module/zfs/spa.c | 92 +- module/zfs/spa_scan.c | 383 ++++++++ module/zfs/vdev.c | 61 +- module/zfs/vdev_draid.c | 1551 ++++++++++++++++++++++++++++++ module/zfs/vdev_label.c | 34 +- module/zfs/vdev_mirror.c | 45 +- module/zfs/vdev_raidz.c | 117 ++- module/zfs/vdev_raidz.h | 33 + module/zfs/zio.c | 20 +- 39 files changed, 3988 insertions(+), 138 deletions(-) create mode 100644 cmd/draidcfg/.gitignore create mode 100644 cmd/draidcfg/Makefile.am create mode 100644 cmd/draidcfg/draid_permutation.c create mode 100644 cmd/draidcfg/draid_permutation.h create mode 100644 cmd/draidcfg/draidcfg.c create mode 100644 include/sys/spa_scan.h create mode 100644 include/sys/vdev_draid_impl.h create mode 100644 module/zfs/spa_scan.c create mode 100644 module/zfs/vdev_draid.c create mode 100644 
module/zfs/vdev_raidz.h diff --git a/.gitignore b/.gitignore index 1eb3b9b4d74a..eb2046196fda 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ *.mod.c *~ *.swp +*.orig .deps .libs .dirstamp diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 04aa7c6333da..bf5a78dc8886 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -SUBDIRS += arc_summary raidz_test +SUBDIRS += arc_summary raidz_test draidcfg diff --git a/cmd/draidcfg/.gitignore b/cmd/draidcfg/.gitignore new file mode 100644 index 000000000000..ad7c307b04e3 --- /dev/null +++ b/cmd/draidcfg/.gitignore @@ -0,0 +1 @@ +/draidcfg diff --git a/cmd/draidcfg/Makefile.am b/cmd/draidcfg/Makefile.am new file mode 100644 index 000000000000..f587d271860e --- /dev/null +++ b/cmd/draidcfg/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +AM_CPPFLAGS += -DDEBUG + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +bin_PROGRAMS = draidcfg + +draidcfg_SOURCES = \ + draidcfg.c \ + draid_permutation.c \ + draid_permutation.h + +draidcfg_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ + $(top_builddir)/lib/libzfs/libzfs.la +draidcfg_LDADD += -lm diff --git a/cmd/draidcfg/draid_permutation.c b/cmd/draidcfg/draid_permutation.c new file mode 100644 index 000000000000..4753f3f31f66 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.c @@ -0,0 +1,763 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +#define MAX_GROUPSIZE 32 +#define MAX_GROUPS 128 +#define MAX_SPARES 100 +#define MAX_DEVS (MAX_GROUPSIZE * MAX_GROUPS + MAX_SPARES) +#define MAX_ROWS 16384 + +#define UNOPT 0 +#define EVAL_WORST 1 +#define EVAL_MEAN 2 +#define EVAL_RMS 3 + +static int verbose = 0; + +typedef struct +{ + int groupsz; + int ngroups; + int nspares; + int ndevs; + int nrows; + /* each row maps all drives, groups from 0, spares down from ndevs-1 */ + int **rows; + int nbroken; /* # broken drives */ + int *broken; /* which drives are broken */ +} map_t; + +typedef struct +{ + int value; + int order; +} pair_t; + +static void +permute_devs(int *in, int *out, int ndevs) +{ + pair_t tmp[MAX_DEVS]; + int i; + int j; + + if (ndevs == 2) { /* swap */ + i = in[0]; + j = in[1]; + out[0] = j; + out[1] = i; + return; + } + + for (i = 0; i < ndevs; i++) { /* assign random order */ + tmp[i].value = in[i]; + tmp[i].order = mrand48(); + } + + for (i = 1; i < ndevs; i++) /* sort */ + for (j = 0; j < i; j++) + if (tmp[i].order < tmp[j].order) { + pair_t t = tmp[i]; + tmp[i] = tmp[j]; + tmp[j] = t; + } + + for (i = 0; i < ndevs; i++) + out[i] = tmp[i].value; +} + +static void +print_map(map_t *map) +{ + int i; + int j; + + for (i = 0; i < map->nrows; i++) { + for (j = 0; j < map->ndevs; j++) { + if (j == map->ndevs - map->nspares) + printf("S "); + + printf("%2d 
", map->rows[i][j]); + } + printf("\n"); + } +} + +static void +check_map(map_t *map) +{ + int i; + int j; + int nrows = map->nrows; + int ndevs = map->ndevs; + int **rows = map->rows; + int devcounts[MAX_DEVS]; + int brokencounts[MAX_DEVS]; + + ASSERT(map->groupsz <= MAX_GROUPSIZE); + ASSERT(map->ngroups <= MAX_GROUPS); + ASSERT(map->nspares <= MAX_SPARES); + ASSERT(map->ndevs == map->nspares + map->ngroups * map->groupsz); + ASSERT(map->nrows <= MAX_ROWS); + ASSERT(map->nbroken <= MAX_SPARES); + + /* Ensure each dev appears once in every row */ + memset(devcounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < nrows; i++) { + int *row = rows[i]; + + for (j = 0; j < ndevs; j++) { + int dev = row[j]; + + ASSERT(0 <= dev && dev < ndevs); + ASSERT(devcounts[dev] == i); + devcounts[dev] = i+1; + } + } + + /* Ensure broken drives only appear once */ + memset(brokencounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < map->nbroken; i++) { + int dev = map->broken[i]; + + ASSERT(0 <= dev && dev < map->ndevs); /* valid drive */ + ASSERT(brokencounts[dev] == 0); /* not used already */ + brokencounts[dev] = 1; + } +} + +static map_t * +dup_map(map_t *oldmap) +{ + int groupsz = oldmap->groupsz; + int ngroups = oldmap->ngroups; + int nspares = oldmap->nspares; + int ndevs = oldmap->ndevs; + int nrows = oldmap->nrows; + map_t *map = malloc(sizeof (map_t)); + int i; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + memcpy(map->rows[i], oldmap->rows[i], sizeof (int) * ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static map_t * +new_map(int groupsz, int ngroups, int nspares, int nrows) +{ + map_t 
*map = malloc(sizeof (map_t)); + int ndevs = nspares + ngroups * groupsz; + int i; + int j; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + + if (i == 0) + for (j = 0; j < ndevs; j++) + map->rows[i][j] = j; + else + permute_devs(map->rows[i-1], map->rows[i], ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static void +free_map(map_t *map) +{ + int i; + + free(map->broken); + for (i = 0; i < map->nrows; i++) + free(map->rows[i]); + free(map->rows); + free(map); +} + +static inline int +is_broken(map_t *map, int dev) +{ + int i; + + for (i = 0; i < map->nbroken; i++) + if (dev == map->broken[i]) + return (1); + + return (0); +} + +static int +eval_resilver(map_t *map, int print) +{ + /* Evaluate how resilvering I/O will be distributed */ + int i; + int j; + int k; + int spare; + int dev; + int ndevs = map->ndevs; + int nspares = map->nspares; + int ngroups = map->ngroups; + int groupsz = map->groupsz; + int nrows = map->nrows; + int writes[MAX_DEVS]; + int reads[MAX_DEVS]; + int max_reads = 0; + int max_writes = 0; + int max_ios = 0; + + memset(reads, 0, sizeof (int) * ndevs); + memset(writes, 0, sizeof (int) * ndevs); + + /* resilver all rows */ + for (i = 0; i < nrows; i++) { + int *row = map->rows[i]; + + /* resilver all groups with broken drives */ + for (j = 0; j < ngroups; j++) { + int fix = 0; + + /* See if any disk in this group is broken */ + for (k = 0; k < groupsz && !fix; k++) + fix = is_broken(map, row[j*groupsz + k]); + + if (!fix) + continue; + + /* + * This group needs fixing + * Read all the non-broken drives and write all the + * broken drives to their hot spare for this 
row + */ + spare = ndevs - nspares; + for (k = 0; k < groupsz; k++) { + dev = row[j*groupsz + k]; + + if (!is_broken(map, dev)) { + reads[dev]++; + } else { + ASSERT(spare < ndevs); + + while (is_broken(map, row[spare])) { + spare++; + ASSERT(spare < ndevs); + } + writes[row[spare++]]++; + } + } + } + } + + /* find drives with most I/O */ + for (i = 0; i < ndevs; i++) { + if (reads[i] > max_reads) + max_reads = reads[i]; + if (writes[i] > max_writes) + max_writes = writes[i]; + + if (reads[i] + writes[i] > max_ios) + max_ios = reads[i] + writes[i]; + } + + if (print) { + printf("Reads: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)reads[i]*ngroups)/nrows); + printf("\n"); + printf("Writes: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)writes[i]*ngroups)/nrows); + printf("\n"); + } + + return (max_ios); +} + +static double +eval_decluster(map_t *map, int how, int faults, int print) +{ + int f1; + int f2; + int ios; + int worst1 = -1; + int worst2 = -1; + int n = 0; + long sum = 0; + long sumsq = 0; + long max_ios = 0; + double val; + + ASSERT(eval_resilver(map, 0) == 0); /* not broken already */ + ASSERT(faults == 1 || faults == 2); + + map->nbroken = faults; + + for (f1 = 0; f1 < map->ndevs; f1++) { + map->broken[0] = f1; + + if (faults < 2) { + ios = eval_resilver(map, 0); /* eval single failure */ + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + max_ios = ios; + } + } else { /* eval double failure */ + for (f2 = f1 + 1; f2 < map->ndevs; f2++) { + map->broken[1] = f2; /* use 2nd hot spare */ + + ios = eval_resilver(map, 0); + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + worst2 = f2; + max_ios = ios; + } + } + } + } + map->nbroken = 0; + + if (print) { + map->nbroken = faults; + map->broken[0] = worst1; + map->broken[2] = worst2; + + eval_resilver(map, 1); + + map->nbroken = 0; + } + + switch (how) { + case EVAL_WORST: + /* + * imbalance from worst possible drive 
failure + * insensitive to failures handled better + */ + val = max_ios; + break; + case EVAL_MEAN: + /* + * average over all possible drive failures + * sensitive to all possible failures + */ + val = ((double)sum)/n; + break; + case EVAL_RMS: + /* + * root mean square over all possible drive failures + * penalizes higher imbalance more + */ + val = sqrt(((double)sumsq)/n); + break; + default: + ASSERT(0); + } + return ((val/map->nrows)*map->ngroups); +} + +static int +rand_in_range(int min, int count) +{ + return (min + drand48()*count); +} + +static void +permute_map(map_t *map, int temp) +{ + static int prev_temp; + + int nrows = (temp < 1) ? 1 : (temp > 100) ? + map->nrows : rand_in_range(1, (map->nrows * temp)/100); + int row = rand_in_range(0, map->nrows - nrows); + int ncols = map->ndevs; + int col = rand_in_range(0, map->ndevs - ncols); + int i; + + if (verbose > 0 && + temp != prev_temp && + (temp < 10 || (temp % 10 == 0))) + printf("Permute t %3d (%d-%d, %d-%d)\n", + temp, col, ncols, row, nrows); + prev_temp = temp; + + for (i = row; i < row + nrows; i++) + permute_devs(&map->rows[i][col], &map->rows[i][col], ncols); +} + +static map_t * +develop_map(map_t *map) +{ + map_t *dmap = new_map(map->groupsz, map->ngroups, + map->nspares, map->nrows * map->ndevs); + int base; + int dev; + int i; + + for (base = 0; base < map->nrows; base++) + for (dev = 0; dev < map->ndevs; dev++) + for (i = 0; i < map->ndevs; i++) + dmap->rows[base*map->ndevs + dev][i] = + (map->rows[base][i] + dev) % map->ndevs; + + return (dmap); +} + +static map_t * +optimize_map(map_t *map, int eval, int faults) +{ + double temp = 100.0; + double alpha = 0.995; + double epsilon = 0.001; + double val = eval_decluster(map, eval, faults, 0); + int ups = 0; + int downs = 0; + int sames = 0; + int iter = 0; + + while (temp > epsilon) { + map_t *map2 = dup_map(map); + double val2; + double delta; + + permute_map(map2, (int)temp); + + val2 = eval_decluster(map2, eval, faults, 0); + delta = (val2 
- val); + + if (delta < 0 || exp(-10000*delta/temp) > drand48()) { + if (delta > 0) + ups++; + else if (delta < 0) + downs++; + else + sames++; + + free_map(map); + map = map2; + val = val2; + } else { + free_map(map2); + } + + temp *= alpha; + + if ((++iter % 100) == 0) { + if (verbose > 0) + printf("%f (%d ups, %d sames, %d downs)\n", + val, ups, sames, downs); + ups = downs = sames = 0; + } + } + + if (verbose > 0) + printf("%d iters, %d ups %d sames %d downs\n", + iter, ups, sames, downs); + return (map); +} + +static void +print_map_stats(map_t *map, int optimize, int print_ios) +{ + double score = eval_decluster(map, EVAL_WORST, 1, 0); + + printf("%6s (%2d x %2d + %2d) x %5d: %2.3f\n", + (optimize == UNOPT) ? "Unopt" : + (optimize == EVAL_WORST) ? "Worst" : + (optimize == EVAL_MEAN) ? "Mean" : "Rms", + map->ngroups, map->groupsz, map->nspares, map->nrows, score); + + if (map->ndevs < 80 && score >= 1.05) + printf("Warning score %6.3f has over 5 percent imbalance!\n", + score); + else if (score >= 1.1) + printf("Warning score %6.3f has over 10 percent imbalance!\n", + score); + +#ifdef FOOO + printf("Single: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 1, 0), + eval_decluster(map, EVAL_MEAN, 1, 0)); + + printf("Double: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 2, 0), + eval_decluster(map, EVAL_MEAN, 2, 0)); +#endif + + if (print_ios) { + eval_decluster(map, EVAL_WORST, 1, 1); + eval_decluster(map, EVAL_WORST, 2, 1); + } +} + +int +draid_permutation_generate(struct vdev_draid_configuration *cfg) +{ + const int loop = 16; /* HH: make this a parameter */ + const int faults = 1; + const int eval = EVAL_WORST; + + int groupsz = cfg->dcf_data + cfg->dcf_parity; + int nspares = cfg->dcf_spare; + int ngroups = (cfg->dcf_children - nspares) / groupsz; + int nrows; + int i, fd, urand_fd; + long int best_seed; + map_t *best_map; + + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + if (fd == -1) { + perror("Cannot open /dev/random\n"); 
+ return (-1); + } + urand_fd = open("/dev/urandom", O_RDONLY); + + /* HH: fine tune these heuristics */ + if (cfg->dcf_children - nspares > 80) + nrows = 128; /* 81 - ? */ + else if (cfg->dcf_children - nspares > 40) + nrows = 64; /* 41 - 80 */ + else + nrows = 32; /* 1 - 40 */ + + for (i = 0, best_map = NULL; i < loop; i++) { + int rc; + long int seed; + map_t *map, *omap; + + rc = read(fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) { + printf("Not enough entropy at /dev/random: read %d, " + "wanted %lu.\n", rc, sizeof (seed)); + /* urand_fd may not be valid but it does not matter */ + rc = read(urand_fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) + break; + printf("Using /dev/urandom instead.\n"); + } + + srand48(seed); + + map = new_map(groupsz, ngroups, nspares, nrows); + omap = optimize_map(dup_map(map), eval, faults); + if (eval_decluster(omap, eval, faults, 0) > + eval_decluster(map, eval, faults, 0)) { + /* + * optimize_map() may create a worse map, because the + * simulated annealing process may accept worse + * neighbors to avoid getting stuck in local optima + */ + free_map(omap); + } else { + free_map(map); + map = omap; + } + + if (best_map == NULL || + eval_decluster(map, eval, faults, 0) < + eval_decluster(best_map, eval, faults, 0)) { + if (best_map != NULL) + free_map(best_map); + best_map = map; + best_seed = seed; + } else { + free_map(map); + } + } + + close(fd); + close(urand_fd); + if (i != loop) + fprintf(stderr, "Early termination at loop %d. 
Generated " + "permutations may not be optimal!\n", i + 1); + + if (best_map != NULL) { + int j; + map_t *dmap; + uint64_t *perms; + + assert(best_map->nrows == nrows); + assert(best_map->ndevs == cfg->dcf_children); + + perms = malloc(sizeof (*perms) * nrows * best_map->ndevs); + assert(perms != NULL); + + for (i = 0; i < nrows; i++) + for (j = 0; j < best_map->ndevs; j++) + perms[i * best_map->ndevs + j] = + best_map->rows[i][j]; + + cfg->dcf_bases = nrows; + cfg->dcf_base_perms = perms; + + if (verbose > 1) + print_map(best_map); + dmap = develop_map(best_map); + free_map(best_map); + print_map_stats(dmap, eval, 0); + printf("Seed chosen: %lx\n", best_seed); + free_map(dmap); + return (0); + } else { + return (-1); + } +} + +int +debug_main(int argc, char **argv) +{ + int ngroups = 0; + int groupsz = 0; + int nspares = 0; + int nrows = 0; + int optimize = UNOPT; + int faults = 1; + int develop = 0; + map_t *map; + int c; + + while ((c = getopt(argc, argv, "g:d:s:n:vUWMR12D")) != -1) + switch (c) { + case 'D': + develop = 1; + break; + case 'g': + sscanf(optarg, "%d", &ngroups); + break; + case 'd': + sscanf(optarg, "%d", &groupsz); + break; + case 's': + sscanf(optarg, "%d", &nspares); + break; + case 'n': + sscanf(optarg, "%d", &nrows); + break; + case 'v': + verbose++; + break; + case 'U': + optimize = UNOPT; + break; + case 'W': + optimize = EVAL_WORST; + break; + case 'M': + optimize = EVAL_MEAN; + break; + case 'R': + optimize = EVAL_RMS; + break; + case '1': + faults = 1; + break; + case '2': + faults = 2; + break; + default: + fprintf(stderr, "arg???\n"); + return (1); + } + + if (ngroups <= 0 || groupsz <= 0 || nspares <= 0 || nrows <= 0) { + fprintf(stderr, "missing arg???\n"); + return (1); + } + + map = new_map(groupsz, ngroups, nspares, nrows); + if (verbose > 1) + print_map(map); + + if (verbose > 0) + print_map_stats(map, UNOPT, 1); + + if (optimize != UNOPT) { + map = optimize_map(map, optimize, faults); + + if (verbose > 1) + print_map(map); + if 
(verbose > 0) + print_map_stats(map, optimize, 1); + } + + if (develop) { + map_t *dmap = develop_map(map); + + free_map(map); + map = dmap; + } + + print_map_stats(map, optimize, verbose > 0); + return (0); +} diff --git a/cmd/draidcfg/draid_permutation.h b/cmd/draidcfg/draid_permutation.h new file mode 100644 index 000000000000..8562ccf09852 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#ifndef _DRAID_PERMUTATION_H +#define _DRAID_PERMUTATION_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int draid_permutation_generate(struct vdev_draid_configuration *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DRAID_PERMUTATION_H */ diff --git a/cmd/draidcfg/draidcfg.c b/cmd/draidcfg/draidcfg.c new file mode 100644 index 000000000000..0c3a8375d18c --- /dev/null +++ b/cmd/draidcfg/draidcfg.c @@ -0,0 +1,343 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +static struct vdev_draid_configuration * +draidcfg_find(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + /* P D D... P D D... S */ + static const uint64_t bases7[1][7] = {{1, 2, 4, 3, 6, 5, 0}}; + static const uint64_t bases11[1][11] = {{ + 1, 4, 5, 9, 3, 2, 8, 10, 7, 6, 0}}; + static const uint64_t bases19[1][19] = {{ + 1, 5, 6, 11, 17, 9, 7, 16, 4, 10, 12, 3, 15, 18, 14, 13, 8, 2, 0}}; + static const uint64_t bases23[1][23] = {{ + 1, 8, 18, 6, 2, 16, 13, 12, 4, 9, 3, 10, 11, 19, 14, 20, 22, + 15, 5, 17, 21, 7, 0}}; + static const uint64_t bases31[1][31] = {{ + 1, 8, 2, 16, 4, 17, 12, 3, 24, 6, 10, 18, 20, 5, 9, 15, 27, 30, 23, + 29, 7, 25, 14, 19, 28, 26, 22, 21, 13, 11, 0}}; + static const uint64_t bases41[1][41] = {{ + 1, 25, 10, 4, 18, 40, 16, 31, 37, 23, 6, 27, 19, + 24, 26, 35, 14, 22, 17, 15, 36, 39, 32, 21, 33, + 5, 2, 9, 20, 8, 11, 29, 28, 3, 34, 30, 12, 13, 38, 7, 0}}; + + static struct vdev_draid_configuration known_cfgs[6] = { + { + .dcf_data = 2, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 7, + .dcf_bases = 1, .dcf_base_perms = &bases7[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 11, + .dcf_bases = 
1, .dcf_base_perms = &bases11[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 19, + .dcf_bases = 1, .dcf_base_perms = &bases19[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 3, .dcf_spare = 1, .dcf_children = 23, + .dcf_bases = 1, .dcf_base_perms = &bases23[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 31, + .dcf_bases = 1, .dcf_base_perms = &bases31[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 2, .dcf_spare = 1, .dcf_children = 41, + .dcf_bases = 1, .dcf_base_perms = &bases41[0][0] + }, + }; + + int i; + + for (i = 0; i < sizeof (known_cfgs) / sizeof (known_cfgs[0]); i++) { + struct vdev_draid_configuration *cfg = &known_cfgs[i]; + + if (data == cfg->dcf_data && parity == cfg->dcf_parity && + spare == cfg->dcf_spare && children == cfg->dcf_children) + return (cfg); + } + + return (NULL); +} + +static struct vdev_draid_configuration * +draidcfg_create(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + struct vdev_draid_configuration *cfg = calloc(1, sizeof (*cfg)); + + assert(cfg != NULL); + cfg->dcf_data = data; + cfg->dcf_parity = parity; + cfg->dcf_spare = spare; + cfg->dcf_children = children; + + cfg->dcf_bases = 0; + cfg->dcf_base_perms = NULL; + if (draid_permutation_generate(cfg) != 0) { + free(cfg); + return (NULL); + } + + assert(cfg->dcf_bases != 0); + assert(cfg->dcf_base_perms != NULL); + return (cfg); +} + +static inline void +draidcfg_free(struct vdev_draid_configuration *cfg) +{ + free((void *)cfg->dcf_base_perms); + free(cfg); +} + +static int +draidcfg_create_file(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children, const char *path) +{ + FILE *fp; + size_t len; + int ret = 0; + void *packed; + nvlist_t *nvl; + boolean_t freecfg = B_FALSE; + struct vdev_draid_configuration *cfg; + + ASSERT(children != 0); + ASSERT3U(children, <=, VDEV_DRAID_MAX_CHILDREN); + + if (children - 1 > 
VDEV_DRAID_U8_MAX) { + fprintf(stderr, "Configuration for over %u children " + "is not supported\n", VDEV_DRAID_U8_MAX + 1); + return (1); + } + + cfg = draidcfg_find(data, parity, spare, children); + if (cfg == NULL) { + cfg = draidcfg_create(data, parity, spare, children); + if (cfg == NULL) { + fprintf(stderr, "Cannot create " + "supported configuration\n"); + return (1); + } + freecfg = B_TRUE; + } + + fp = fopen(path, "w+"); + if (fp == NULL) { + fprintf(stderr, "Cannot open file %s for write\n", path); + if (freecfg) + draidcfg_free(cfg); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA, data); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_PARITY, parity); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_SPARE, spare); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_CHILDREN, children); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE, cfg->dcf_bases); + + if (children - 1 <= VDEV_DRAID_U8_MAX) { + int i, j; + uint8_t *val = calloc(children * cfg->dcf_bases, sizeof (*val)); + + for (i = 0; i < cfg->dcf_bases; i++) { + for (j = 0; j < children; j++) { + uint64_t c = + cfg->dcf_base_perms[i * children + j]; + + ASSERT3U(c, <, children); + ASSERT3U(c, <=, VDEV_DRAID_U8_MAX); + val[i * children + j] = (uint8_t)c; + } + } + + fnvlist_add_uint8_array(nvl, ZPOOL_CONFIG_DRAIDCFG_PERM, + val, children * cfg->dcf_bases); + free(val); + } else { + ASSERT3U(children, ==, 0); /* not supported yet */ + } + + assert(vdev_draid_config_validate(NULL, nvl)); + + packed = fnvlist_pack_xdr(nvl, &len); + if (fwrite(packed, 1, len, fp) != len) { + ret = 1; + fprintf(stderr, "Cannot write %lu bytes to %s\n", len, path); + } + + fnvlist_pack_free(packed, len); + fnvlist_free(nvl); + if (freecfg) + draidcfg_free(cfg); + fclose(fp); + return (ret); +} + +static void +draidcfg_print(nvlist_t *config) +{ + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b, i; + + n = fnvlist_lookup_uint64(config, 
ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + d = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA); + p = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_PARITY); + s = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_SPARE); + b = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE); + + printf("dRAID%lu vdev of %lu child drives: %lu x (%lu data + " + "%lu parity) and %lu distributed spare\n", + p, n, (n - s) / (d + p), d, p, s); + printf("Using %lu base permutation%s\n", b, b > 1 ? "s" : ""); + + VERIFY0(nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c)); + ASSERT3U(c, ==, b * n); + + for (i = 0; i < b; i++) { + int j; + + printf(" "); + for (j = 0; j < n; j++) + printf("%*u,", n > 99 ? 3 : 2, perm[i * n + j]); + printf("\n"); + } +} + +static inline int usage(void) +{ + printf(gettext("draidcfg [-r] [-n children] [-d data] [-p parity]" + " [-s spare] \n")); + return (1); +} + +int +main(int argc, char **argv) +{ + boolean_t read = B_FALSE; + char *cfg = NULL; + uint64_t data = 0, parity = 0, spare = 0, children = 0; + int c; + + while ((c = getopt(argc, argv, "rn:d:p:s:")) != -1) { + char *endptr; + uint64_t *p = NULL; + + switch (c) { + case 'r': + read = B_TRUE; + break; + case 'n': + p = &children; + case 'd': + if (p == NULL) + p = &data; + case 'p': + if (p == NULL) + p = &parity; + case 's': + if (p == NULL) + p = &spare; + + errno = 0; + *p = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + fprintf(stderr, + gettext("Invalid -%c value: %s\n"), + c, optarg); + return (usage()); + } + break; + case ':': + fprintf(stderr, gettext("Missing argument for " + "'%c' option\n"), optopt); + return (usage()); + case '?': + fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + return (usage()); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + fprintf(stderr, + gettext("Missing configuration file argument\n")); + return (usage()); + } + + cfg = argv[0]; + + if (read) { + nvlist_t *nvl 
= draidcfg_read_file(cfg); + + if (nvl == NULL) { + return (1); + } else { + draidcfg_print(nvl); + nvlist_free(nvl); + return (0); + } + } + + assert(!read); + + if (data == 0 || parity == 0 || spare == 0 || children == 0) { + fprintf(stderr, + gettext("Missing data/parity/spare/children argument\n")); + return (usage()); + } + + if (parity > VDEV_RAIDZ_MAXPARITY) { + fprintf(stderr, gettext("Invalid parity %lu\n"), parity); + return (usage()); + } + + if (children % (data + parity) != spare) { + fprintf(stderr, gettext("Invalid draid configuration\n")); + return (usage()); + } + + return (draidcfg_create_file(data, parity, spare, children, cfg)); +} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b020b378331c..17b16b87b69f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -808,7 +808,8 @@ dump_metaslab(metaslab_t *msp) } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift) || + vd->vdev_ops == &vdev_draid_ops); mutex_enter(&msp->ms_lock); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index cc6c18eed897..bbf55e8ad8ad 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -2265,7 +2266,7 @@ zpool_do_import(int argc, char **argv) char *endptr; /* check options */ - while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:stT:VX")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:stT:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -5537,7 +5538,8 @@ print_scan_status(pool_scan_stat_t *ps) zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == POOL_SCAN_RESILVER); + ps->pss_func == POOL_SCAN_RESILVER || + ps->pss_func == POOL_SCAN_REBUILD); /* * Scan is finished or canceled. 
*/ @@ -5546,16 +5548,20 @@ print_scan_status(pool_scan_stat_t *ps) char *fmt = NULL; if (ps->pss_func == POOL_SCAN_SCRUB) { - fmt = gettext("scrub repaired %s in %lluh%um with " + fmt = gettext("scrub repaired %s in %lluh%um%us with " "%llu errors on %s"); } else if (ps->pss_func == POOL_SCAN_RESILVER) { - fmt = gettext("resilvered %s in %lluh%um with " + fmt = gettext("resilvered %s in %lluh%um%us with " + "%llu errors on %s"); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + fmt = gettext("rebuilt %s in %lluh%um%us with " "%llu errors on %s"); } /* LINTED */ (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), + (uint_t)((end - start) % 60), (u_longlong_t)ps->pss_errors, ctime((time_t *)&end)); return; @@ -5566,6 +5572,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild canceled on %s"), + ctime(&end)); } return; } @@ -5581,6 +5590,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild in progress since %s"), + ctime(&start)); } examined = ps->pss_examined ? 
ps->pss_examined : 1; @@ -5619,6 +5631,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done\n"), processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("\t%s rebuilt, %.2f%% done\n"), + processed_buf, 100 * fraction_done); } } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index c96157eff632..1101a3fb6ccd 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -84,6 +84,8 @@ #include #include "zpool_util.h" #include +#include + /* * For any given vdev specification, we can have multiple errors. The @@ -594,6 +596,7 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * $draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) @@ -636,6 +639,11 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (arg[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + ashift = 12; + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -664,17 +672,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -825,7 +835,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -1398,7 +1409,8 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0 || + strncmp(type, VDEV_TYPE_DRAID, 5) == 0) { const char *p = type + 5; char *end; long nparity; @@ -1418,8 +1430,12 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) - *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + *maxdev = VDEV_DRAID_MAX_CHILDREN; + + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0) + return (VDEV_TYPE_RAIDZ); + else + return (VDEV_TYPE_DRAID); } if (maxdev != NULL) @@ -1488,6 +1504,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; + nvlist_t *draidcfg = NULL; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { @@ -1544,6 +1561,34 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + + if (strcmp(type, VDEV_TYPE_DRAID) == 0 && + strncmp(argv[c], "cfg=", 4) == 0) { + if (draidcfg == NULL) { + draidcfg = + draidcfg_read_file(argv[c] + + 4); + if (draidcfg != NULL) + continue; + fprintf(stderr, + gettext("invalid draid " + 
"configuration '%s'\n"), + argv[c]); + } else { + fprintf(stderr, + gettext("dRAID config " + "specified more than " + "once: %s\n"), argv[c]); + } + + for (c = 0; c < children - 1; c++) + nvlist_free(child[c]); + free(child); + if (draidcfg != NULL) + nvlist_free(draidcfg); + return (NULL); + } + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1598,7 +1643,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); @@ -1610,6 +1656,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); + + if (draidcfg != NULL) { + ASSERT0(strcmp(type, VDEV_TYPE_DRAID)); + + if (!vdev_draid_config_add(nv, + draidcfg)) + fprintf(stderr, + gettext("ignoring invalid " + "draid config\n")); + + nvlist_free(draidcfg); + draidcfg = NULL; + } } } else { /* diff --git a/configure.ac b/configure.ac index 2b3957b3b576..08b1908d204f 100644 --- a/configure.ac +++ b/configure.ac @@ -112,6 +112,7 @@ AC_CONFIG_FILES([ cmd/arc_summary/Makefile cmd/zed/Makefile cmd/raidz_test/Makefile + cmd/draidcfg/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/dracut/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index cd3ae15726e9..9c8bd644d0ba 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -405,7 +405,6 @@ typedef struct importargs { extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); /* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); @@ -824,6 +823,11 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern 
int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * dRAID import support + */ +nvlist_t *draidcfg_read_file(const char *); + /* * Mappings between vdev and FRU. */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 956643801c66..f01c1fc03517 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -55,6 +55,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/space_reftree.h \ $(top_srcdir)/include/sys/spa.h \ $(top_srcdir)/include/sys/spa_impl.h \ + $(top_srcdir)/include/sys/spa_scan.h \ $(top_srcdir)/include/sys/spa_checksum.h \ $(top_srcdir)/include/sys/sysevent.h \ $(top_srcdir)/include/sys/trace.h \ @@ -85,6 +86,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_impl.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ + $(top_srcdir)/include/sys/vdev_draid_impl.h \ $(top_srcdir)/include/sys/xvattr.h \ $(top_srcdir)/include/sys/zap.h \ $(top_srcdir)/include/sys/zap_impl.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 44a11ba57207..9b7571828530 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -118,6 +118,8 @@ typedef struct dsl_scan { boolean_t scn_async_destroying; boolean_t scn_async_stalled; + boolean_t scn_is_sequential; /* sequential rebuild? 
*/ + vdev_t *scn_vd; /* vdev to scan, valid only if scn_is_sequential */ /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index ba64876121f4..4d38b93bbe50 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -641,6 +641,15 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" + +#define ZPOOL_CONFIG_DRAIDCFG "com.intel:draid_config" +#define ZPOOL_CONFIG_DRAIDCFG_DATA "com.intel:draid_data" +#define ZPOOL_CONFIG_DRAIDCFG_PARITY "com.intel:draid_parity" +#define ZPOOL_CONFIG_DRAIDCFG_SPARE "com.intel:draid_spare" +#define ZPOOL_CONFIG_DRAIDCFG_BASE "com.intel:draid_base" +#define ZPOOL_CONFIG_DRAIDCFG_CHILDREN "com.intel:draid_children" +#define ZPOOL_CONFIG_DRAIDCFG_PERM "com.intel:draid_perm" + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. 
This is because a device can be in multiple states, such @@ -669,6 +678,8 @@ typedef struct zpool_rewind_policy { #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -760,6 +771,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_REBUILD, /* sequential SPA scan */ POOL_SCAN_FUNCS } pool_scan_func_t; diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index f8a713a4f1ff..ba0018497e34 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -338,6 +338,7 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + boolean_t ms_rebuilding; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index d2dfad5ca2b3..9c358c0af920 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -281,6 +281,7 @@ nvlist_t *fnvlist_alloc(void); void fnvlist_free(nvlist_t *); size_t fnvlist_size(nvlist_t *); char *fnvlist_pack(nvlist_t *, size_t *); +char *fnvlist_pack_xdr(nvlist_t *, size_t *); void fnvlist_pack_free(char *, size_t); nvlist_t *fnvlist_unpack(char *, size_t); nvlist_t *fnvlist_dup(nvlist_t *); diff --git a/include/sys/spa_scan.h b/include/sys/spa_scan.h new file mode 100644 index 000000000000..df2a3ed3baa4 --- /dev/null +++ b/include/sys/spa_scan.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _SYS_SPA_SCAN_H +#define _SYS_SPA_SCAN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t spa_scan_enabled(const spa_t *); +extern void spa_scan_setup_sync(dmu_tx_t *); +extern void spa_scan_start(spa_t *, vdev_t *, uint64_t); +extern int spa_scan_rebuild_cb(dsl_pool_t *, + const blkptr_t *, const zbookmark_phys_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_SCAN_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 4f54b1707c54..ed5aad66057e 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -107,6 +107,7 @@ extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead_at(vdev_t *vd, uint64_t offset); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h new file mode 100644 index 000000000000..33a251dfcbe1 --- /dev/null +++ b/include/sys/vdev_draid_impl.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _VDEV_DRAID_IMPL_H +#define _VDEV_DRAID_IMPL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zio zio_t; +typedef struct vdev vdev_t; +typedef struct raidz_map raidz_map_t; + +struct vdev_draid_configuration { + uint64_t dcf_data; + uint64_t dcf_parity; + uint64_t dcf_spare; + uint64_t dcf_children; + uint64_t dcf_bases; + abd_t *dcf_zero_abd; + const uint64_t *dcf_base_perms; +}; + +extern boolean_t vdev_draid_ms_mirrored(const vdev_t *, uint64_t); +extern boolean_t vdev_draid_group_degraded(vdev_t *, vdev_t *, + uint64_t, uint64_t, boolean_t); +extern uint64_t vdev_draid_check_block(const vdev_t *vd, uint64_t, uint64_t); +extern uint64_t vdev_draid_get_astart(const vdev_t *, const uint64_t); +extern uint64_t vdev_draid_offset2group(const vdev_t *, uint64_t, boolean_t); +extern uint64_t vdev_draid_group2offset(const vdev_t *, uint64_t, boolean_t); +extern boolean_t vdev_draid_is_remainder_group(const vdev_t *, + uint64_t, boolean_t); +extern uint64_t vdev_draid_get_groupsz(const vdev_t *, boolean_t); +extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); +extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); +extern void vdev_draid_fix_skip_sectors(zio_t *); +extern int vdev_draid_hide_skip_sectors(raidz_map_t *); +extern 
void vdev_draid_restore_skip_sectors(raidz_map_t *, int); +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_is_dead(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern nvlist_t *vdev_draid_spare_read_config(vdev_t *); + +#define VDEV_DRAID_MAX_CHILDREN 255 +#define VDEV_DRAID_U8_MAX ((uint8_t)-1) + +#define VDEV_DRAID_SPARE_PATH_FMT "$"VDEV_TYPE_DRAID"%lu-%lu-s%lu" + +/* trace_printk is GPL only */ +#undef DRAID_USE_TRACE_PRINTK + +#ifdef _KERNEL +#define U64FMT "%llu" +#ifdef DRAID_USE_TRACE_PRINTK +#define draid_print(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +#else +#define draid_print(fmt, ...) printk(fmt, ##__VA_ARGS__) +#endif +#else +#define U64FMT "%lu" +#define draid_print(fmt, ...) printf(fmt, ##__VA_ARGS__) +#endif + +extern int draid_debug_lvl; +extern void vdev_draid_debug_zio(zio_t *, boolean_t); + +#define draid_dbg(lvl, fmt, ...) 
\ + do { \ + if (draid_debug_lvl >= (lvl)) \ + draid_print(fmt, ##__VA_ARGS__); \ + } while (0); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDEV_DRAID_IMPL_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index d7f11a2b885d..8b15d1f3b614 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,7 @@ struct vdev { boolean_t vdev_ishole; /* is a hole in the namespace */ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + nvlist_t *vdev_cfg; /* additional configuration */ /* * The queue depth parameters determine how many async writes are @@ -355,12 +357,39 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; +/* + * Virtual device vector for mirroring. 
+ */ +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +extern mirror_map_t *vdev_mirror_map_alloc(int, boolean_t, boolean_t); +extern const zio_vsd_ops_t vdev_mirror_vsd_ops; + /* * Common size functions */ @@ -368,6 +397,9 @@ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern boolean_t vdev_raidz_need_resilver(vdev_t *, uint64_t, size_t); +extern boolean_t vdev_draid_need_resilver(vdev_t *, uint64_t, size_t); + /* * Global variables */ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 4bd15e3d53c2..39941250e634 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -106,6 +106,7 @@ typedef struct raidz_col { size_t rc_offset; /* device offset */ size_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + abd_t *rc_abd_skip; /* Skip sector */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ unsigned int rc_tried; /* Did we attempt this I/O column? */ @@ -123,10 +124,12 @@ typedef struct raidz_map { size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_skipstart; /* Column index of padding start */ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_skip; /* dRAID skip sectors */ size_t rm_reports; /* # of referencing checksum reports */ unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_ecksuminjected; /* checksum error was injected */ raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + boolean_t rm_declustered; /* dRAID? 
*/ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index ce65840905b7..51dfc4937ddc 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include "libzfs.h" #include "libzfs_impl.h" @@ -862,7 +863,7 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) /* * Determine if the vdev id is a hole in the namespace. */ -boolean_t +static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; @@ -876,6 +877,64 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) return (B_FALSE); } +nvlist_t * +draidcfg_read_file(const char *path) +{ + int fd; + struct stat64 sb; + char *buf; + nvlist_t *config; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, "Cannot open '%s'\n", path); + return (NULL); + } + + if (fstat64(fd, &sb) != 0) { + (void) fprintf(stderr, "Failed to stat '%s'\n", path); + close(fd); + return (NULL); + } + + if (!S_ISREG(sb.st_mode)) { + (void) fprintf(stderr, "Not a regular file '%s'\n", path); + close(fd); + return (NULL); + } + + if ((buf = malloc(sb.st_size)) == NULL) { + (void) fprintf(stderr, "Failed to allocate %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + return (NULL); + } + + if (read(fd, buf, sb.st_size) != sb.st_size) { + (void) fprintf(stderr, "Failed to read %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + free(buf); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "Failed to unpack nvlist\n"); + free(buf); + return (NULL); + } + + free(buf); + + if (!vdev_draid_config_validate(NULL, config)) { + nvlist_free(config); + return (NULL); + } + + return (config); +} + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. 
Once that's done, @@ -1982,17 +2041,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) return (ret); } -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 1bb80298c413..e18ad93e865e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "zfs_namecheck.h" @@ -945,6 +946,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -2040,6 +2042,8 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, verify(strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(type, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, @@ -2152,6 +2156,7 @@ boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); @@ -2703,6 +2708,10 @@ zpool_vdev_attach(zpool_handle_t *zhp, if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); + else if (new_disk[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dspare can only replace a child " + "drive 
in its parent draid vdev")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3528,7 +3537,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * Remove the partition from the path it this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3538,7 +3548,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * If it's a raidz device, we need to stick in the parity level. */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 || + strcmp(path, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 1e95f8064c8f..5fc5b0b9ffbd 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -86,6 +86,7 @@ KERNEL_C = \ spa_history.c \ spa_misc.c \ spa_stats.c \ + spa_scan.c \ space_map.c \ space_reftree.c \ txg.c \ @@ -109,6 +110,7 @@ KERNEL_C = \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_aarch64_neon.c \ vdev_raidz_math_aarch64_neonx2.c \ + vdev_draid.c \ vdev_root.c \ zap.c \ zap_leaf.c \ diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c index a91b9524d8a0..4efb9048f745 100644 --- a/module/nvpair/fnvpair.c +++ b/module/nvpair/fnvpair.c @@ -73,15 +73,26 @@ fnvlist_size(nvlist_t *nvl) * Returns allocated buffer of size *sizep. Caller must free the buffer with * fnvlist_pack_free(). 
*/ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) +static char * +fnvlist_pack_enc(nvlist_t *nvl, size_t *sizep, int encoding) { char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); + VERIFY3U(nvlist_pack(nvl, &packed, sizep, encoding, KM_SLEEP), ==, 0); return (packed); } +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_NATIVE)); +} + +char * +fnvlist_pack_xdr(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_XDR)); +} + /*ARGSUSED*/ void fnvlist_pack_free(char *pack, size_t size) diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index f9c20896d460..9008650d6482 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -326,7 +326,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index f8d54f4dde77..0a7a23113520 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -58,6 +58,7 @@ $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o $(MODULE)-objs += spa_misc.o +$(MODULE)-objs += spa_scan.o $(MODULE)-objs += spa_stats.o $(MODULE)-objs += space_map.o $(MODULE)-objs += space_reftree.o @@ -76,6 +77,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_draid.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f5ef2268d2fd..201c809e46c9 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -24,6 
+24,7 @@ * Copyright 2016 Gary Mills */ +#include #include #include #include @@ -78,6 +79,8 @@ unsigned long zfs_free_max_blocks = 100000; #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) +#define DSL_SCAN_IS_REBUILD(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_REBUILD) /* * Enable/disable the processing of the free_bpobj object. @@ -89,6 +92,7 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + spa_scan_rebuild_cb, /* POOL_SCAN_REBUILD */ }; int @@ -339,7 +343,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) || DSL_SCAN_IS_REBUILD(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, @@ -1526,11 +1530,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + if (scn->scn_is_sequential) + func = POOL_SCAN_REBUILD; + else + func = POOL_SCAN_RESILVER; + } zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); - dsl_scan_setup_sync(&func, tx); + if (func == POOL_SCAN_REBUILD) + spa_scan_setup_sync(tx); + else + dsl_scan_setup_sync(&func, tx); } /* @@ -1553,6 +1564,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; + if (DSL_SCAN_IS_REBUILD(scn)) { + if (scn->scn_visited_this_txg == 19890604) { + ASSERT(!scn->scn_pausing); + /* finished with scan. 
*/ + dsl_scan_done(scn, B_TRUE, tx); + scn->scn_visited_this_txg = 0; + dsl_scan_sync_state(scn, tx); + } + /* Rebuild is mostly handled in the open-context scan thread */ + return; + } + scn->scn_visited_this_txg = 0; scn->scn_pausing = B_FALSE; scn->scn_sync_start_time = gethrtime(); @@ -1754,6 +1777,8 @@ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) } else { dp->dp_scan->scn_restart_txg = txg; } + dp->dp_scan->scn_vd = NULL; + dp->dp_scan->scn_is_sequential = B_FALSE; zfs_dbgmsg("restarting resilver txg=%llu", txg); } @@ -1836,6 +1861,44 @@ dsl_scan_scrub_done(zio_t *zio) mutex_exit(&spa->spa_scrub_lock); } +static int zfs_no_resilver_skip = 1; + +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, + size_t size, uint64_t phys_birth) +{ + vdev_t *vd; + uint64_t offset; + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. 
+ */ + return (B_TRUE); + } + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (zfs_no_resilver_skip != 0) + return (B_TRUE); + + offset = DVA_GET_OFFSET(dva); + if (vd->vdev_ops == &vdev_raidz_ops) + return (vdev_raidz_need_resilver(vd, offset, size)); + + if (vd->vdev_ops == &vdev_draid_ops) + return (vdev_draid_need_resilver(vd, offset, size)); + + return (B_TRUE); +} + static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) @@ -1875,33 +1938,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SPECULATIVE; for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); + const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. 
- */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, + size, phys_birth); } if (needs_io && !zfs_no_scrub_io) { @@ -1981,6 +2030,10 @@ MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); module_param(zfs_resilver_min_time_ms, int, 0644); MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg"); +module_param(zfs_no_resilver_skip, int, 0644); +MODULE_PARM_DESC(zfs_no_resilver_skip, + "Set to disable skipping spurious resilver IO"); + module_param(zfs_no_scrub_io, int, 0644); MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O"); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5e413c06518b..02c265c62e7b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1103,8 +1104,8 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, + uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); @@ -1112,8 +1113,27 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t next_offset; + + if (vd->vdev_ops != &vdev_draid_ops) { + *cursor = offset + size; + return (offset); + } + + next_offset = vdev_draid_check_block(vd, offset, size); + if (next_offset == offset) { + *cursor = offset + size; + return (offset); + } + + offset = P2ROUNDUP(next_offset, align); + if (offset + size <= rs->rs_end) { + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, size)); + *cursor = offset + size; + return (offset); + } } rs = AVL_NEXT(t, rs); } @@ -1126,7 +1146,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ @@ -1150,7 +1170,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_tree->rt_root; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { @@ -1202,7 +1222,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { @@ -1408,6 +1428,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_id 
= id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + if (vd->vdev_ops == &vdev_draid_ops) { + uint64_t astart = vdev_draid_get_astart(vd, ms->ms_start); + + ms->ms_size -= astart - ms->ms_start; + ms->ms_start = astart; + } /* * We only open space map objects that already exist. All others @@ -2706,6 +2732,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2718,7 +2745,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_remove(rt, start, size); if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); @@ -2737,18 +2764,26 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { + vdev_t *vd = mg->mg_vd; metaslab_t *msp = NULL; metaslab_t *search; uint64_t offset = -1ULL; + boolean_t hybrid_mirror = B_FALSE; uint64_t activation_weight; uint64_t target_distance; int i; + if (vd->vdev_ops == &vdev_draid_ops && + psize <= (1ULL << vd->vdev_top->vdev_ashift)) { + hybrid_mirror = B_TRUE; + } + activation_weight = METASLAB_WEIGHT_PRIMARY; for (i = 0; i < d; i++) { - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + if (DVA_GET_VDEV(&dva[i]) == vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; break; } @@ -2789,10 +2824,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } + if (vd->vdev_ops == &vdev_draid_ops && + hybrid_mirror != + 
vdev_draid_ms_mirrored(vd, msp->ms_id)) + continue; + /* * If the selected metaslab is condensing, skip it. */ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_rebuilding) continue; was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; @@ -2868,7 +2908,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we can't manipulate this metaslab until it's committed * to disk. */ - if (msp->ms_condensing) { + if (msp->ms_condensing || msp->ms_rebuilding) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING); mutex_exit(&msp->ms_lock); @@ -2932,12 +2972,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { uint64_t offset; ASSERT(mg->mg_initialized); - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + offset = metaslab_group_alloc_normal(mg, zal, psize, asize, txg, min_distance, dva, d); mutex_enter(&mg->mg_lock); @@ -3130,8 +3171,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, zal, asize, txg, distance, - dva, d); + offset = metaslab_group_alloc(mg, zal, psize, asize, txg, + distance, dva, d); if (offset != -1ULL) { /* @@ -3314,6 +3355,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) } VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 638ec59efd64..5a5ac3be724c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -51,6 +51,7 @@ #include #include 
#include +#include #include #include #include @@ -70,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -3746,6 +3748,72 @@ spa_l2cache_drop(spa_t *spa) } } +static int +spa_add_draid_spare(nvlist_t *nvroot, vdev_t *rvd) +{ + int i, j, n; + nvlist_t **oldspares, **newspares; + uint_t nspares; + vdev_t *c; + struct vdev_draid_configuration *cfg; + + for (i = 0, n = 0; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops == &vdev_draid_ops) { + cfg = c->vdev_tsd; + ASSERT(cfg != NULL); + n += cfg->dcf_spare; + } + } + + if (n == 0) + return (0); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &oldspares, &nspares) != 0) + nspares = 0; + + newspares = kmem_alloc(sizeof (*newspares) * (n + nspares), KM_SLEEP); + for (i = 0; i < nspares; i++) + newspares[i] = fnvlist_dup(oldspares[i]); + + for (i = 0, n = nspares; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops != &vdev_draid_ops) + continue; + + cfg = c->vdev_tsd; + for (j = 0; j < cfg->dcf_spare; j++) { + nvlist_t *ds = fnvlist_alloc(); + char path[64]; + + snprintf(path, sizeof (path), VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned)c->vdev_nparity, + (long unsigned)c->vdev_id, (long unsigned)j); + fnvlist_add_string(ds, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(ds, + ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(ds, + ZPOOL_CONFIG_ASHIFT, c->vdev_ashift); + + newspares[n] = ds; + n++; + } + } + + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, newspares, n); + for (i = 0; i < n; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, sizeof (*newspares) * n); + return (0); +} + /* * Pool Creation */ @@ -3850,6 +3918,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 
if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_add_draid_spare(nvroot, rvd)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) { @@ -4579,6 +4648,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) char *oldvdpath, *newvdpath; int newvd_isspare; int error; + boolean_t rebuild = B_FALSE; ASSERTV(vdev_t *rvd = spa->spa_root_vdev); ASSERT(spa_writeable(spa)); @@ -4610,6 +4680,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); + /* + * dRAID spare can only replace a child drive of its parent + * dRAID vdev + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* * Spares can't replace logs */ @@ -4727,8 +4805,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -4744,12 +4822,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); + if (spa_scan_enabled(spa) && + (tvd->vdev_ops == &vdev_mirror_ops || + newvd->vdev_ops == &vdev_draid_spare_ops)) + rebuild = B_TRUE; /* HH: let zpool cmd choose */ + /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. 
*/ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) + spa_scan_start(spa, oldvd, dtl_max_txg); + else + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); diff --git a/module/zfs/spa_scan.c b/module/zfs/spa_scan.c new file mode 100644 index 000000000000..927911ef3456 --- /dev/null +++ b/module/zfs/spa_scan.c @@ -0,0 +1,383 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void +spa_scan_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + ASSERT(zio->io_bp != NULL); + + abd_free(zio->io_abd); + kmem_free(zio->io_private, sizeof (blkptr_t)); + + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&zio->io_bp->blk_dva[0]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&zio->io_bp->blk_dva[0]); + + mutex_enter(&spa->spa_scrub_lock); + + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + + mutex_exit(&spa->spa_scrub_lock); +} + +static int spa_scan_max_rebuild = 4096; + +static void +spa_scan_rebuild_block(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t asize) +{ + /* HH: maybe bp can be on the stack */ + blkptr_t *bp = kmem_alloc(sizeof (*bp), KM_SLEEP); + dva_t *dva = bp->blk_dva; + uint64_t psize; + spa_t *spa = vd->vdev_spa; + ASSERTV(uint64_t ashift = vd->vdev_top->vdev_ashift); + + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops); + + if (vd->vdev_ops == &vdev_mirror_ops) { + psize = asize; + ASSERT3U(asize, ==, vdev_psize_to_asize(vd, psize)); + } else if (vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift)) { + ASSERT0((asize >> ashift) % (1 + vd->vdev_nparity)); + psize = asize / (1 + vd->vdev_nparity); + } else { + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT0((asize >> ashift) % (cfg->dcf_data + vd->vdev_nparity)); + psize = (asize / (cfg->dcf_data + vd->vdev_nparity)) * + cfg->dcf_data; + } + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > spa_scan_max_rebuild) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + 
DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], 0); + DVA_SET_ASIZE(&dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + zio_nowait(zio_read(pio, spa, bp, + abd_alloc(psize, B_FALSE), psize, spa_scan_done, bp, + ZIO_PRIORITY_SCRUB, ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); +} + +static void +spa_scan_rebuild(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t length) +{ + uint64_t max_asize, chunksz; + + if (vd->vdev_ops == &vdev_draid_ops && + vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift)) + max_asize = SPA_MAXBLOCKSIZE * (1 + vd->vdev_nparity); + else + max_asize = vdev_psize_to_asize(vd, SPA_MAXBLOCKSIZE); + + while (length > 0) { + chunksz = MIN(length, max_asize); + spa_scan_rebuild_block(pio, vd, offset, chunksz); + + length -= chunksz; + offset += chunksz; + } +} + +typedef struct { + vdev_t *ssa_vd; + uint64_t ssa_dtl_max; +} spa_scan_arg_t; + +static void +spa_scan_thread(void *arg) +{ + spa_scan_arg_t *sscan = arg; + vdev_t *vd = sscan->ssa_vd->vdev_top; + spa_t *spa = vd->vdev_spa; + zio_t *pio = zio_root(spa, NULL, NULL, 0); + range_tree_t *allocd_segs; + kmutex_t lock; + uint64_t msi; + int err; + + /* + * Wait for newvd's DTL to propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). 
+ */ + txg_wait_synced(spa->spa_dsl_pool, sscan->ssa_dtl_max); + + mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL); + allocd_segs = range_tree_create(NULL, NULL, &lock); + + for (msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + ASSERT0(range_tree_space(allocd_segs)); + + mutex_enter(&msp->ms_lock); + + while (msp->ms_condensing) { + mutex_exit(&msp->ms_lock); + + zfs_sleep_until(gethrtime() + 100 * MICROSEC); + + mutex_enter(&msp->ms_lock); + } + + VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); + msp->ms_rebuilding = B_TRUE; + + /* + * If the metaslab has ever been allocated from (ms_sm!=NULL), + * read the allocated segments from the space map object + * into svr_allocd_segs. Since we do this while holding + * svr_lock and ms_sync_lock, concurrent frees (which + * would have modified the space map) will wait for us + * to finish loading the spacemap, and then take the + * appropriate action (see free_from_removing_vdev()). + */ + if (msp->ms_sm != NULL) { + space_map_t *sm = NULL; + + /* + * We have to open a new space map here, because + * ms_sm's sm_length and sm_alloc may not reflect + * what's in the object contents, if we are in between + * metaslab_sync() and metaslab_sync_done(). + * + * Note: space_map_open() drops and reacquires the + * caller-provided lock. Therefore we can not provide + * any lock that we are using (e.g. ms_lock, svr_lock). + */ + VERIFY0(space_map_open(&sm, + spa->spa_dsl_pool->dp_meta_objset, + msp->ms_sm->sm_object, msp->ms_sm->sm_start, + msp->ms_sm->sm_size, msp->ms_sm->sm_shift, &lock)); + mutex_enter(&lock); + space_map_update(sm); + VERIFY0(space_map_load(sm, allocd_segs, SM_ALLOC)); + mutex_exit(&lock); + space_map_close(sm); + + /* + * When we are resuming from a paused removal (i.e. + * when importing a pool with a removal in progress), + * discard any state that we have already processed. 
+ * range_tree_clear(svr->svr_allocd_segs, 0, + * start_offset); + */ + } + mutex_exit(&msp->ms_lock); + + zfs_dbgmsg("Scanning %llu segments for metaslab %llu", + avl_numnodes(&allocd_segs->rt_root), msp->ms_id); + + mutex_enter(&lock); + while (range_tree_space(allocd_segs) != 0) { + boolean_t mirror; + uint64_t offset, length; + range_seg_t *rs = avl_first(&allocd_segs->rt_root); + + ASSERT(rs != NULL); + offset = rs->rs_start; + length = rs->rs_end - rs->rs_start; + + range_tree_remove(allocd_segs, offset, length); + mutex_exit(&lock); + + draid_dbg(1, "MS ("U64FMT" at "U64FMT"K) segment: " + U64FMT"K + "U64FMT"K\n", + msp->ms_id, msp->ms_start >> 10, + (offset - msp->ms_start) >> 10, length >> 10); + + if (vd->vdev_ops == &vdev_mirror_ops) { + spa_scan_rebuild(pio, vd, offset, length); + mutex_enter(&lock); + continue; + } + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + mirror = vdev_draid_ms_mirrored(vd, msi); + + while (length > 0) { + uint64_t group, group_left, chunksz; + char *action = "Skipping"; + + /* + * HH: make sure we don't cross redundancy + * group boundary + */ + group = + vdev_draid_offset2group(vd, offset, mirror); + group_left = vdev_draid_group2offset(vd, + group + 1, mirror) - offset; + ASSERT(!vdev_draid_is_remainder_group(vd, + group, mirror)); + ASSERT3U(group_left, <=, + vdev_draid_get_groupsz(vd, mirror)); + + chunksz = MIN(length, group_left); + if (vdev_draid_group_degraded(vd, + sscan->ssa_vd, offset, chunksz, mirror)) { + action = "Fixing"; + spa_scan_rebuild(pio, vd, + offset, chunksz); + } + + draid_dbg(1, "\t%s: "U64FMT"K + "U64FMT + "K (%s)\n", + action, offset >> 10, chunksz >> 10, + mirror ? "mirrored" : "dRAID"); + + length -= chunksz; + offset += chunksz; + } + + mutex_enter(&lock); + } + mutex_exit(&lock); + + mutex_enter(&msp->ms_lock); + + /* HH: wait for rebuild IOs to complete for this metaslab? 
*/ + msp->ms_rebuilding = B_FALSE; + + mutex_exit(&msp->ms_lock); + } + + range_tree_destroy(allocd_segs); + mutex_destroy(&lock); + kmem_free(sscan, sizeof (*sscan)); + + err = zio_wait(pio); + if (err != 0) /* HH: handle error */ + err = SET_ERROR(err); + /* HH: we don't use scn_visited_this_txg anyway */ + spa->spa_dsl_pool->dp_scan->scn_visited_this_txg = 19890604; +} + +void +spa_scan_start(spa_t *spa, vdev_t *oldvd, uint64_t txg) +{ + dsl_scan_t *scan = spa->spa_dsl_pool->dp_scan; + spa_scan_arg_t *sscan_arg; + + scan->scn_vd = oldvd->vdev_top; + scan->scn_restart_txg = txg; + scan->scn_is_sequential = B_TRUE; + + sscan_arg = kmem_alloc(sizeof (*sscan_arg), KM_SLEEP); + sscan_arg->ssa_vd = oldvd; + sscan_arg->ssa_dtl_max = txg; + (void) thread_create(NULL, 0, spa_scan_thread, sscan_arg, 0, NULL, + TS_RUN, defclsyspri); +} + +void +spa_scan_setup_sync(dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(scn->scn_vd != NULL); + ASSERT(scn->scn_is_sequential); + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = POOL_SCAN_REBUILD; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = 0; + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + /* Rebuild only examines blocks on one vdev */ + scn->scn_phys.scn_to_examine = scn->scn_vd->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + + scn->scn_sync_start_time = gethrtime(); + scn->scn_pausing = B_FALSE; + spa->spa_scrub_active = B_TRUE; + spa_scan_stat_init(spa); + + spa->spa_scrub_started = B_TRUE; +} + +int +spa_scan_rebuild_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + /* Rebuild happens in open context and does not use this callback */ + ASSERT0(1); + return (-ENOTSUP); +} + +boolean_t 
+spa_scan_enabled(const spa_t *spa) +{ + if (spa_scan_max_rebuild > 0) + return (B_TRUE); + else + return (B_FALSE); +} + + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(spa_scan_max_rebuild, int, 0644); +MODULE_PARM_DESC(spa_scan_max_rebuild, "Max concurrent SPA rebuild I/Os"); +#endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index caf92899d0c8..4ebbd812ba1b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,8 @@ int metaslabs_per_vdev = 200; static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -139,6 +142,16 @@ vdev_get_min_asize(vdev_t *vd) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); + if (pvd->vdev_ops == &vdev_draid_ops) { + struct vdev_draid_configuration *cfg = pvd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT3U(pvd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(pvd->vdev_children, ==, cfg->dcf_children); + return (pvd->vdev_min_asize / + (pvd->vdev_children - cfg->dcf_spare)); + } + return (pvd->vdev_min_asize); } @@ -350,6 +363,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vd->vdev_cfg = NULL; /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -394,6 +408,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; + nvlist_t *draidcfg = NULL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -448,7 +463,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Set the nparity property for RAID-Z vdevs. 
*/ nparity = -1ULL; - if (ops == &vdev_raidz_ops) { + if (ops == &vdev_raidz_ops || ops == &vdev_draid_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -480,13 +495,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + if (ops == &vdev_draid_ops) { + if (nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_DRAIDCFG, &draidcfg) != 0) + return (SET_ERROR(EINVAL)); + if (!vdev_draid_config_validate(NULL, draidcfg)) + return (SET_ERROR(EINVAL)); + } + vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (ops == &vdev_draid_ops) + vd->vdev_cfg = fnvlist_dup(draidcfg); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, @@ -694,6 +720,9 @@ vdev_free(vdev_t *vd) if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_cfg) + fnvlist_free(vd->vdev_cfg); + txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1061,6 +1090,9 @@ vdev_probe(vdev_t *vd, zio_t *zio) ASSERT(vd->vdev_ops->vdev_op_leaf); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (NULL); + /* * Don't probe the probe. */ @@ -1397,6 +1429,7 @@ vdev_open(vdev_t *vd) * vdev open for business. 
*/ if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); @@ -2603,6 +2636,9 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -2773,6 +2809,15 @@ vdev_is_dead(vdev_t *vd) vd->vdev_ops == &vdev_missing_ops); } +boolean_t +vdev_is_dead_at(vdev_t *vd, uint64_t zio_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) + zio_offset -= VDEV_LABEL_START_SIZE; + + return (vdev_draid_is_dead(vd, zio_offset)); +} + boolean_t vdev_readable(vdev_t *vd) { @@ -3033,7 +3078,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } @@ -3096,19 +3142,22 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { + if (type == ZIO_TYPE_READ && !vdev_is_dead_at(vd, zio->io_offset)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead_at(vd, zio->io_offset)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && + /* HH: todo proper rebuild IO error handling... 
*/ + if (type == ZIO_TYPE_WRITE && vd->vdev_ops != &vdev_draid_spare_ops && + txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCAN_THREAD) || + ((flags & ZIO_FLAG_SCAN_THREAD) && + !spa->spa_dsl_pool->dp_scan->scn_is_sequential) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 000000000000..02c1c83d44de --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,1551 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "vdev_raidz.h" + + +int draid_debug_lvl = 1; + +void +vdev_draid_debug_zio(zio_t *zio, boolean_t mirror) +{ + int c; + + draid_dbg(3, "%s zio: off "U64FMT"K sz "U64FMT"K data %p\n", + mirror ? 
"Mirror" : "dRAID", zio->io_offset >> 10, + zio->io_size >> 10, zio->io_abd); + + if (mirror) { + } else { + raidz_map_t *rm = zio->io_vsd; + + for (c = 0; c < rm->rm_scols; c++) { + char t = 'D'; + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (c >= rm->rm_cols) { + t = 'S'; + } else if (c < rm->rm_firstdatacol) { + switch (c) { + case 0: + t = 'P'; + break; + case 1: + t = 'Q'; + break; + case 2: + t = 'R'; + break; + default: + ASSERT0(c); + } + } + + draid_dbg(3, "%c: dev %lu (%s) off %luK, sz %luK, " + "err %d, skipped %d, tried %d\n", t, rc->rc_devidx, + cvd->vdev_path != NULL ? cvd->vdev_path : "NA", + rc->rc_offset >> 10, rc->rc_size >> 10, + rc->rc_error, rc->rc_skipped, rc->rc_tried); + } + } +} + +/* A child vdev is divided into slices */ +static unsigned int slice_shift = 0; +#define DRAID_SLICESHIFT (SPA_MAXBLOCKSHIFT + slice_shift) +/* 2 ** slice_shift * SPA_MAXBLOCKSIZE */ +#define DRAID_SLICESIZE (1ULL << DRAID_SLICESHIFT) +#define DRAID_SLICEMASK (DRAID_SLICESIZE - 1) + +static int +vdev_draid_get_permutation(uint64_t *p, uint64_t nr, + const struct vdev_draid_configuration *cfg) +{ + uint64_t i; + uint64_t ncols = cfg->dcf_children; + uint64_t off = nr % (cfg->dcf_bases * ncols); + uint64_t base = off / ncols; + uint64_t dev = off % ncols; + + for (i = 0; i < ncols; i++) { + const uint64_t *base_perm = cfg->dcf_base_perms + + (base * ncols); + + p[i] = (base_perm[i] + dev) % ncols; + } + + return (0); +} + +noinline static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio, uint64_t unit_shift, + const struct vdev_draid_configuration *cfg, uint64_t **array) +{ + const uint64_t ndata = cfg->dcf_data; + const uint64_t nparity = cfg->dcf_parity; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. 
*/ + const uint64_t psize = zio->io_size >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + uint64_t o, q, r, c, bc, acols, scols, asize, tot; + uint64_t perm, perm_off, group, group_offset, group_left, abd_off; + raidz_map_t *rm; + uint64_t *permutation; + ASSERTV(vdev_t *vd = zio->io_vd); + + ASSERT(!vdev_draid_ms_mirrored(vd, + zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + ndata), ==, nspare); + ASSERT0(b % (nparity + ndata)); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + /* HH: may not actually need the nspare columns for normal IO */ + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / ((nparity + ndata) * slice); + group_offset = perm_off % ((nparity + ndata) * slice); + ASSERT0(group_offset % (nparity + ndata)); + + group_left = (slice - group_offset / (nparity + ndata)) * ndata; + ASSERT3U(psize, <=, group_left); + + /* The starting byte offset on each child vdev. */ + o = (perm * slice + group_offset / (nparity + ndata)) << unit_shift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = psize / ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = psize - q * ndata; + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = psize + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. 
*/ + acols = bc; + } else { + acols = nparity + ndata; + } + scols = nparity + ndata; + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + rm->rm_declustered = B_TRUE; + + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0, asize = 0; c < scols; c++) { + uint64_t i = group * (nparity + ndata) + c; + + ASSERT3U(i, <, ncols - nspare); + + rm->rm_col[c].rc_devidx = permutation[i]; + rm->rm_col[c].rc_offset = o; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (ndata + nparity) << unit_shift); + rm->rm_nskip = roundup(tot, ndata + nparity) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <, ndata); + + if (rm->rm_nskip == 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) == 0) + rm->rm_abd_skip = NULL; + else + rm->rm_abd_skip = + abd_alloc_linear(rm->rm_nskip << unit_shift, B_TRUE); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + + abd_off = 0; + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, abd_off); + abd_off += rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, abd_off); + abd_off += rm->rm_col[c].rc_size; + } + + if (array == NULL) + 
kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + rm->rm_ops = vdev_raidz_math_get_ops(); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + return (rm); +} + +noinline static mirror_map_t * +vdev_draid_mirror_map_alloc(zio_t *zio, uint64_t unit_shift, + const struct vdev_draid_configuration *cfg, uint64_t **array) +{ + const uint64_t nparity = cfg->dcf_parity; + const uint64_t copies = nparity + 1; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + vdev_t *vd = zio->io_vd; + uint64_t o, c, perm, perm_off, group, group_offset; + mirror_map_t *mm; + uint64_t *permutation; + ASSERTV(const uint64_t psize = zio->io_size >> unit_shift); + + ASSERT(vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + cfg->dcf_data), ==, nspare); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / (copies * slice); + ASSERT3U(group, <, (ncols - nspare) / copies); + group_offset = perm_off % (copies * slice); + ASSERT0(group_offset % copies); + ASSERT3U(psize, <=, slice - group_offset / copies); + /* The starting byte offset on each child vdev. 
*/ + o = (perm * slice + group_offset / copies) << unit_shift; + + mm = vdev_mirror_map_alloc(copies, B_FALSE, B_FALSE); + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0; c < mm->mm_children; c++) { + int idx = group * copies + c; + mirror_child_t *mc = &mm->mm_child[c]; + + /* The remainder group is not usable for IO */ + ASSERT3U(idx, <, ((ncols - nspare) / copies) * copies); + + mc->mc_vd = vd->vdev_child[permutation[idx]]; + mc->mc_offset = o; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} + +static inline void +vdev_draid_assert_vd(const vdev_t *vd) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT(cfg->dcf_zero_abd != NULL); + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(vd->vdev_children, ==, cfg->dcf_children); +} + +uint64_t +vdev_draid_get_groupsz(const vdev_t *vd, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + return (copies << DRAID_SLICESHIFT); +} + +#define DRAID_PERM_ASIZE(vd) (((vd)->vdev_children - \ + ((struct vdev_draid_configuration *)(vd)->vdev_tsd)->dcf_spare) \ + << DRAID_SLICESHIFT) + +uint64_t +vdev_draid_offset2group(const vdev_t *vd, uint64_t offset, boolean_t mirror) +{ + uint64_t perm, perm_off, group, copies, groups_per_perm; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + perm = offset / DRAID_PERM_ASIZE(vd); + perm_off = offset % DRAID_PERM_ASIZE(vd); + group = perm_off / vdev_draid_get_groupsz(vd, mirror); + + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + return (perm * groups_per_perm + group); +} + +uint64_t +vdev_draid_group2offset(const vdev_t *vd, uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm, offset; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + offset = DRAID_PERM_ASIZE(vd) * (group / groups_per_perm); + offset += + vdev_draid_get_groupsz(vd, mirror) * (group % groups_per_perm); + return (offset); +} + +boolean_t +vdev_draid_is_remainder_group(const vdev_t *vd, + uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + if ((vd->vdev_children - cfg->dcf_spare) % copies == 0) + return (B_FALSE); + + /* Currently only mirror can have remainder group */ + ASSERT(mirror); + + /* The last group in each permutation is the remainder */ + if (group % groups_per_perm == groups_per_perm - 1) + return (B_TRUE); + else + return (B_FALSE); +} + +uint64_t +vdev_draid_get_astart(const vdev_t *vd, const uint64_t start) +{ + uint64_t astart, perm_off, copies; + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_is_remainder_group(vd, group, mirror)) + return (start); + + perm_off = start % DRAID_PERM_ASIZE(vd); + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + astart = roundup(perm_off, copies << vd->vdev_ashift); + astart += start - perm_off; + + ASSERT3U(astart, >=, start); + return (astart); +} + +uint64_t +vdev_draid_check_block(const vdev_t *vd, uint64_t start, uint64_t size) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + uint64_t end = start + size - 1; + + ASSERT3U(size, <, vdev_draid_get_groupsz(vd, mirror)); + ASSERT3U(start >> vd->vdev_ms_shift, ==, end >> vd->vdev_ms_shift); + + /* + * A block is good if it: + * - does not cross group boundary, AND + * - does not use a remainder group + */ + if (group == vdev_draid_offset2group(vd, end, mirror) && + !vdev_draid_is_remainder_group(vd, group, mirror)) { + ASSERT3U(start, ==, vdev_draid_get_astart(vd, start)); + return (start); + } + + group++; + if (vdev_draid_is_remainder_group(vd, group, mirror)) + group++; + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + return (vdev_draid_group2offset(vd, group, mirror)); +} + +boolean_t +vdev_draid_ms_mirrored(const vdev_t *vd, uint64_t ms_id) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + /* HH: dedicate 1/20 ms for hybrid mirror */ + if ((ms_id % 20) == 19) + return (B_TRUE); + else + return (B_FALSE); +} + +static vdev_t *vdev_dspare_get_child(vdev_t *vd, uint64_t offset); + +/* + * dRAID spare does not fit into the DTL model. While it has child vdevs, + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Moreover, DTLs of a child vdev before the spare + * becomes active are invalid, because the spare blocks were not in use yet. + * + * Here we are essentially doing a vdev_dtl_reassess() on the fly, by replacing + * a dRAID spare with the child vdev under the offset. Note that it is a + * recursive process because the child vdev can be another dRAID spare, and so + * on. 
+ */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) +{ + int c; + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, offset, txg, size)) + return (B_FALSE); + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_readable(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_readable(vd->vdev_child[c], offset)) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +vdev_draid_is_dead(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_is_dead(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (!vdev_draid_is_dead(vd->vdev_child[c], offset)) + return (B_FALSE); + + return (B_TRUE); +} + +static boolean_t +vdev_draid_guid_exists(vdev_t *vd, uint64_t guid, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_guid == guid) + return (B_TRUE); + + if (vd->vdev_ops->vdev_op_leaf) + return (B_FALSE); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_guid_exists(vd->vdev_child[c], guid, offset)) + return (B_TRUE); + + return (B_FALSE); +} + +static boolean_t +vdev_draid_vd_degraded(vdev_t *vd, const vdev_t *oldvd, uint64_t offset) +{ + if (oldvd == NULL) /* Resilver */ + 
return (!vdev_dtl_empty(vd, DTL_PARTIAL)); + + /* Rebuild */ + ASSERT(oldvd->vdev_ops->vdev_op_leaf); + ASSERT(oldvd->vdev_ops != &vdev_draid_spare_ops); + + return (vdev_draid_guid_exists(vd, oldvd->vdev_guid, offset)); +} + +boolean_t +vdev_draid_group_degraded(vdev_t *vd, vdev_t *oldvd, + uint64_t offset, uint64_t size, boolean_t mirror) +{ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t group = vdev_draid_offset2group(vd, offset, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + boolean_t degraded = B_FALSE; + zio_t *zio; + int c, dummy_data; + uint64_t *perm; + char buf[128]; + + vdev_draid_assert_vd(vd); + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + + zio = kmem_alloc(sizeof (*zio), KM_SLEEP); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_size = MAX(SPA_MINBLOCKSIZE, 1ULL << ashift); + zio->io_abd = abd_get_from_buf(&dummy_data, zio->io_size); + + buf[0] = '\0'; + if (mirror) { + mirror_map_t *mm = + vdev_draid_mirror_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(mm->mm_children, ==, cfg->dcf_parity + 1); + + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + char *status = ""; + + if (vdev_draid_vd_degraded(mc->mc_vd, + oldvd, mc->mc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", mc->mc_vd->vdev_id, status); + } + } else { + raidz_map_t *rm = vdev_draid_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(rm->rm_scols, ==, cfg->dcf_parity + cfg->dcf_data); + + for (c = 0; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + char *status = ""; + + if (vdev_draid_vd_degraded(cvd, oldvd, rc->rc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", cvd->vdev_id, status); + } + } + + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), "spares: "); + for (c = 0; c < 
cfg->dcf_spare; c++) + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT" ", perm[cfg->dcf_children - 1 - c]); + draid_dbg(4, "%s %s at "U64FMT"K of "U64FMT"K: %s\n", + degraded ? "Degraded" : "Healthy", + mirror ? "mirror" : "draid", + offset >> 10, size >> 10, buf); + + kmem_free(perm, sizeof (perm[0]) * cfg->dcf_children); + (*zio->io_vsd_ops->vsd_free)(zio); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (*zio)); + return (degraded); +} + +boolean_t +vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) +{ + int i; + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b; + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); +#endif + return (B_FALSE); + } + + if (n - 1 > VDEV_DRAID_U8_MAX) { +#ifndef _KERNEL + fprintf(stderr, "%s configuration too invalid: %lu\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); +#endif + return (B_FALSE); + } + if (vd != NULL && n != vd->vdev_children) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PARITY); +#endif + return (B_FALSE); + } + + if (vd != NULL && p != vd->vdev_nparity) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_DATA); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_SPARE); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_BASE); +#endif + return (B_FALSE); + } + + if (n == 0 || 
d == 0 || p == 0 || s == 0 || b == 0) { +#ifndef _KERNEL + fprintf(stderr, "Zero n/d/p/s/b\n"); +#endif + return (B_FALSE); + } + + if (p > VDEV_RAIDZ_MAXPARITY) { +#ifndef _KERNEL + fprintf(stderr, gettext("Invalid parity %lu\n"), p); +#endif + return (B_FALSE); + } + + if ((n - s) % (p + d) != 0) { +#ifndef _KERNEL + fprintf(stderr, "%lu mod %lu is not 0\n", n - s, p + d); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PERM); +#endif + return (B_FALSE); + } + + if (c != b * n) { +#ifndef _KERNEL + fprintf(stderr, + "Permutation array has %u items, but %lu expected\n", + c, b * n); +#endif + return (B_FALSE); + } + + for (i = 0; i < b; i++) { + int j, k; + for (j = 0; j < n; j++) { + uint64_t val = perm[i * n + j]; + + if (val >= n) { +#ifndef _KERNEL + fprintf(stderr, + "Invalid value %lu in permutation %d\n", + val, i); +#endif + return (B_FALSE); + } + + for (k = 0; k < j; k++) { + if (val == perm[i * n + k]) { +#ifndef _KERNEL + fprintf(stderr, + "Duplicated value %lu in " + "permutation %d\n", + val, i); +#endif + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) +{ + char *type; + uint64_t parity; + nvlist_t **children = NULL; + uint_t c = 0; + + if (draidcfg == NULL) + return (B_FALSE); + + type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_DRAID) != 0) + return (B_FALSE); + + parity = fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); + if (parity != fnvlist_lookup_uint64(draidcfg, + ZPOOL_CONFIG_DRAIDCFG_PARITY)) + return (B_FALSE); + + VERIFY0(nvlist_lookup_nvlist_array(top, + ZPOOL_CONFIG_CHILDREN, &children, &c)); + if (c != + fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) + return (B_FALSE); + + /* HH: todo: check permutation array csum */ + 
fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); + return (B_TRUE); +} + +static struct vdev_draid_configuration * +vdev_draid_config_create(vdev_t *vd) +{ +/* + * HH: should probably allocate draid_zero_page page aligned, when need to deal + * with ashift larger than a page + */ +#ifdef _KERNEL +#define draid_zero_page empty_zero_page +#else +static char draid_zero_page[PAGE_SIZE]; +#endif + + int i, j; + uint_t c; + uint64_t children; + uint8_t *perms = NULL; + uint64_t *base_perms; + nvlist_t *nvl = vd->vdev_cfg; + struct vdev_draid_configuration *cfg; + + ASSERT(nvl != NULL); + + if (!vdev_draid_config_validate(vd, nvl)) + return (NULL); + + cfg = kmem_alloc(sizeof (*cfg), KM_SLEEP); + cfg->dcf_children = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + cfg->dcf_data = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA); + cfg->dcf_parity = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_PARITY); + cfg->dcf_spare = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_SPARE); + cfg->dcf_bases = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE); + + VERIFY0(nvlist_lookup_uint8_array(nvl, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perms, &c)); + + base_perms = kmem_alloc(sizeof (uint64_t) * c, KM_SLEEP); + for (i = 0, children = cfg->dcf_children; i < cfg->dcf_bases; i++) + for (j = 0; j < children; j++) + base_perms[i * children + j] = perms[i * children + j]; + cfg->dcf_base_perms = base_perms; + + ASSERT3U(1ULL << vd->vdev_top->vdev_ashift, <=, PAGE_SIZE); + cfg->dcf_zero_abd = abd_get_from_buf(draid_zero_page, PAGE_SIZE); + return (cfg); +} + +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) +{ + vdev_t *cvd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = 
VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* vd->vdev_tsd must be set before vdev_open_children(vd) */ + if (cfg == NULL) { + cfg = vdev_draid_config_create(vd); + if (cfg == NULL) + return (SET_ERROR(EINVAL)); + vd->vdev_tsd = cfg; + } else { + ASSERT(vd->vdev_reopening); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + /* HH: asize becomes tricky with hybrid mirror */ + *asize *= vd->vdev_children - cfg->dcf_spare; + *max_asize *= vd->vdev_children - cfg->dcf_spare; + /* HH: because of the draid_zero_page array */ + ASSERT3U(*ashift, <=, PAGE_SHIFT); + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_draid_close(vdev_t *vd) +{ + int c; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); + + if (vd->vdev_reopening || cfg == NULL) + return; + + abd_put(cfg->dcf_zero_abd); + kmem_free((void *)cfg->dcf_base_perms, + sizeof (uint64_t) * cfg->dcf_bases * cfg->dcf_children); + kmem_free(cfg, sizeof (*cfg)); + vd->vdev_tsd = NULL; +} + +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t nparity = vd->vdev_nparity; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + asize = ((psize - 1) >> ashift) + 1; + + if (asize == 1) { /* mirror */ + asize += nparity; + } else { /* draid */ + asize = roundup(asize, cfg->dcf_data); + asize += nparity * (asize / cfg->dcf_data); + ASSERT0(asize % (nparity + cfg->dcf_data)); + } + + return (asize << 
ashift); +} + +boolean_t +vdev_draid_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift); + + /* A block cannot cross redundancy group boundary */ + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, vdev_draid_asize(vd, psize))); + + return (vdev_draid_group_degraded(vd, NULL, offset, psize, mirror)); +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. 
+ */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) { + (void) vdev_draid_mirror_map_alloc(zio, ashift, cfg, NULL); + + ASSERT(zio->io_vsd != NULL); + ASSERT(zio->io_size <= (1ULL << ashift) || + ((zio->io_flags & ZIO_FLAG_RESILVER) && + zio->io_spa->spa_dsl_pool->dp_scan->scn_is_sequential)); + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } + + rm = vdev_draid_map_alloc(zio, ashift, cfg, NULL); + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Unlike raidz, it's mandatory to fill skip sectors with zero. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + 1ULL << ashift, zio->io_type, zio->io_priority, + 0, NULL, NULL)); /* HH: handle skip write error */ + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * HH: sequential resilver must do IO at redundancy group boundary, i.e. 
+ * rm->rm_nskip must be 0 + */ + ASSERT((zio->io_flags & ZIO_FLAG_RESILVER) == 0 || + !zio->io_spa->spa_dsl_pool->dp_scan->scn_is_sequential || + rm->rm_nskip == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + /* + * Check skip sectors for scrub/resilver. For sequential rebuild, + * this is a no-op because rm->rm_nskip is always zero. 
+ */ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + abd_t *abd; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, + rc->rc_offset + rc->rc_size)) { + rc->rc_abd_skip = NULL; + continue; + } + + abd = abd_get_offset(rm->rm_abd_skip, i << ashift); + *((int *)abd_to_buf(abd)) = 1; + rc->rc_abd_skip = abd; + + /* Skip sector to be written in vdev_draid_io_done() */ + if (vdev_draid_missing(cvd, + rc->rc_offset + rc->rc_size, zio->io_txg, 1)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, abd, + 1ULL << ashift, ZIO_TYPE_READ, + zio->io_priority, 0, NULL, NULL)); + } + } + + zio_execute(zio); +} + +int +vdev_draid_hide_skip_sectors(raidz_map_t *rm) +{ + int c, cols; + size_t size = rm->rm_col[0].rc_size; + + ASSERT(rm->rm_declustered); + + for (c = rm->rm_cols; c < rm->rm_scols; c++) { + void *buf; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_size); + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd == NULL); + + rc->rc_size = size; + rc->rc_abd = abd_alloc_linear(size, B_TRUE); + buf = abd_to_buf(rc->rc_abd); + bzero(buf, size); + } + + cols = rm->rm_cols; + rm->rm_cols = rm->rm_scols; + return (cols); +} + +void +vdev_draid_restore_skip_sectors(raidz_map_t *rm, int cols) +{ + int c; + + ASSERT(rm->rm_declustered); + ASSERT3U(cols, >, rm->rm_firstdatacol); + ASSERT3U(cols, <=, rm->rm_scols); + + for (c = cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd != NULL); + + abd_free(rc->rc_abd); + rc->rc_size = 0; + rc->rc_abd = NULL; + } + + rm->rm_cols = cols; +} + +void +vdev_draid_fix_skip_sectors(zio_t *zio) +{ + int c, i; + char *zero; + vdev_t *vd = 
zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t size = 1ULL << vd->vdev_top->vdev_ashift; + + ASSERT(rm->rm_declustered); + vdev_draid_assert_vd(vd); + + if (rm->rm_abd_skip == NULL) + return; + + zero = abd_to_buf(cfg->dcf_zero_abd); + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + char *skip; + boolean_t good_skip; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + if (rc->rc_abd_skip == NULL) + continue; + + skip = abd_to_buf(rc->rc_abd_skip); + good_skip = (memcmp(skip, zero, size) == 0); + abd_put(rc->rc_abd_skip); + rc->rc_abd_skip = NULL; + + if (good_skip || !spa_writeable(zio->io_spa)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } +} + +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) + vdev_mirror_ops.vdev_op_io_done(zio); /* hybrid mirror */ + else + vdev_raidz_ops.vdev_op_io_done(zio); /* declustered raidz */ +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_draid_ops = { + vdev_draid_open, + vdev_draid_close, + vdev_draid_asize, + vdev_draid_io_start, + vdev_draid_io_done, + vdev_draid_state_change, + NULL, + NULL, + VDEV_TYPE_DRAID, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +#include + +typedef struct { + vdev_t *dsp_draid; + uint64_t dsp_id; +} vdev_dspare_t; + 
+static vdev_t * +vdev_dspare_get_child(vdev_t *vd, uint64_t offset) +{ + vdev_t *draid; + uint64_t *permutation, spareidx; + vdev_dspare_t *dspare = vd->vdev_tsd; + struct vdev_draid_configuration *cfg; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT3U(offset, <, + vd->vdev_psize - VDEV_LABEL_START_SIZE - VDEV_LABEL_END_SIZE); + ASSERT(dspare != NULL); + draid = dspare->dsp_draid; + vdev_draid_assert_vd(draid); + cfg = draid->vdev_tsd; + ASSERT3U(dspare->dsp_id, <, cfg->dcf_spare); + + permutation = kmem_alloc(sizeof (permutation[0]) * draid->vdev_children, + KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, + offset >> DRAID_SLICESHIFT, cfg)); + spareidx = permutation[draid->vdev_children - 1 - dspare->dsp_id]; + ASSERT3U(spareidx, <, draid->vdev_children); + kmem_free(permutation, sizeof (permutation[0]) * draid->vdev_children); + + return (draid->vdev_child[spareidx]); +} + +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT(dspare != NULL); + ASSERT(dspare->dsp_draid != NULL); + + return (dspare->dsp_draid); +} + +nvlist_t * +vdev_draid_spare_read_config(vdev_t *vd) +{ + int i; + uint64_t guid; + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + nvlist_t *nv = fnvlist_alloc(); + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + + if (vd->vdev_isspare) + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_ACTIVE); + else + 
fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE); + + for (i = 0, guid = vd->vdev_guid; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + /* HH: ZPOOL_CONFIG_UNSPARE and ZPOOL_CONFIG_RESILVER_TXG? */ + return (nv); +} + +static int +vdev_dspare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + uint64_t draid_id, nparity, spare_id; + uint64_t asize, max_asize; + vdev_t *draid; + vdev_dspare_t *dspare; + struct vdev_draid_configuration *cfg; + + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dspare = vd->vdev_tsd; + draid = dspare->dsp_draid; + cfg = draid->vdev_tsd; + goto skip_open; + } + + if (sscanf(vd->vdev_path, VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned *)&nparity, (long unsigned *)&draid_id, + (long unsigned *)&spare_id) != 3) + return (SET_ERROR(EINVAL)); + + if (draid_id >= vd->vdev_spa->spa_root_vdev->vdev_children) + return (SET_ERROR(EINVAL)); + + draid = vd->vdev_spa->spa_root_vdev->vdev_child[draid_id]; + if (draid->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(EINVAL)); + if (draid->vdev_nparity != nparity) + return (SET_ERROR(EINVAL)); + + cfg = draid->vdev_tsd; + ASSERT(cfg != NULL); + if (spare_id >= cfg->dcf_spare) + return (SET_ERROR(EINVAL)); + + dspare = kmem_alloc(sizeof (*dspare), KM_SLEEP); + dspare->dsp_draid = draid; + dspare->dsp_id = spare_id; + vd->vdev_tsd = dspare; + +skip_open: + vdev_draid_assert_vd(draid); + + asize = draid->vdev_asize / (draid->vdev_children - cfg->dcf_spare); + max_asize = draid->vdev_max_asize / + (draid->vdev_children - cfg->dcf_spare); + + *ashift = draid->vdev_ashift; + *psize = asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + *max_psize = max_asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + return (0); +} + +static void 
+vdev_dspare_close(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + if (vd->vdev_reopening || dspare == NULL) + return; + + vd->vdev_tsd = NULL; + kmem_free(dspare, sizeof (*dspare)); +} + +static uint64_t +vdev_dspare_asize(vdev_t *vd, uint64_t psize) +{ + /* HH: this function should never get called */ + ASSERT0(psize); + return (0); +} + +static void +vdev_dspare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + pio->io_error = zio->io_error; +} + +static void +vdev_dspare_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + uint64_t offset = zio->io_offset; + + /* HH: if dspare gets a FLUSH, so do all children of the draid vdev */ + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + /* + * HH: at pool creation, dspare gets some writes with + * ZIO_FLAG_SPECULATIVE and ZIO_FLAG_NODATA. + * Need to understand and handle them right. + */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) { + ASSERT(zio->io_flags & ZIO_FLAG_PHYSICAL); + + /* + * HH: dspare should not get any label IO as it is pretending + * to be a leaf disk. Later should catch and fix all places + * that still does label IO to dspare. + */ + zio->io_error = SET_ERROR(ENODATA); + zio_interrupt(zio); + return; + } + + offset -= VDEV_LABEL_START_SIZE; /* See zio_vdev_child_io() */ + cvd = vdev_dspare_get_child(vd, offset); + if (zio->io_type == ZIO_TYPE_READ && !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + /* + * Parent vdev should have avoided reading from me in the first + * place, unless this is a mirror scrub. + */ + draid_dbg(1, "Read from dead spare %s:%s:%s at "U64FMT"\n", + vd->vdev_path, + cvd->vdev_ops->vdev_op_type, + cvd->vdev_path != NULL ? 
cvd->vdev_path : "NA", + offset); + return; + } + + /* dspare IO does not cross slice boundary */ + ASSERT3U(offset >> DRAID_SLICESHIFT, ==, + (offset + zio->io_size - 1) >> DRAID_SLICESHIFT); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_dspare_child_done, zio)); + zio_execute(zio); +} + +static void +vdev_dspare_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_draid_spare_ops = { + vdev_dspare_open, + vdev_dspare_close, + vdev_dspare_asize, + vdev_dspare_io_start, + vdev_dspare_io_done, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID_SPARE, + B_TRUE +}; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(draid_debug_lvl, int, 0644); +MODULE_PARM_DESC(draid_debug_lvl, "dRAID debugging verbose level"); +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 54c54237bfd5..9e65205bb29d 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -384,8 +385,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + ASSERT(vd->vdev_ops == &vdev_raidz_ops || + vd->vdev_ops == &vdev_draid_ops); /* * Make sure someone hasn't managed to sneak a fancy new vdev @@ -405,6 +406,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } + if (vd->vdev_cfg != NULL) { + ASSERT(vd->vdev_ops == &vdev_draid_ops); + ASSERT(vdev_draid_config_validate(vd, vd->vdev_cfg)); + + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_DRAIDCFG, vd->vdev_cfg); + } + if (vd->vdev_wholedisk != -1ULL) fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); @@ -601,6 +609,9 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + if 
(vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_read_config(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -869,6 +880,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + if (vd->vdev_ops == &vdev_draid_spare_ops) { + error = 0; + goto skip; + } + /* * Initialize its label. */ @@ -990,6 +1006,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) abd_free(ub_abd); abd_free(vp_abd); +skip: /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it @@ -1079,7 +1096,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1160,6 +1178,13 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) if (!vd->vdev_ops->vdev_op_leaf) return; + /* + * HH: no need to sync ub on dspare - if dspare gets a ub sync, so + * does the parent draid vdev + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; @@ -1264,6 +1289,9 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) if (!vd->vdev_ops->vdev_op_leaf) return; + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 256431e6b334..7aeaab885209 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -30,32 +30,11 @@ #include #include #include +#include #include #include #include -/* - * Virtual device vector for mirroring. 
- */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_replacing; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; static int vdev_mirror_shift = 21; @@ -85,7 +64,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) { mirror_map_t *mm; @@ -108,7 +87,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +const zio_vsd_ops_t vdev_mirror_vsd_ops = { vdev_mirror_map_free, zio_vsd_default_cksum_report }; @@ -331,6 +310,12 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + return (vdev_draid_readable(mc->mc_vd, mc->mc_offset)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read * prefering vdevs based on determined load. 
@@ -356,14 +341,15 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_draid_missing(mc->mc_vd, mc->mc_offset, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; @@ -420,7 +406,12 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); + if (zio->io_vsd != NULL) { /* dRAID hybrid mirror */ + ASSERT3P(zio->io_vd->vdev_ops, ==, &vdev_draid_ops); + mm = zio->io_vsd; + } else { + mm = vdev_mirror_map_init(zio); + } if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index c073f1374fa5..c127f329f3fc 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include #include #include +#include /* * Virtual device vector for RAID-Z. 
@@ -145,6 +146,11 @@ vdev_raidz_map_free(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) abd_put(rm->rm_col[c].rc_abd); + if (rm->rm_abd_skip != NULL) { + ASSERT(rm->rm_declustered); + abd_free(rm->rm_abd_skip); + } + if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); @@ -317,7 +323,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) ASSERT3U(offset, ==, size); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { +const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_map_free_vsd, vdev_raidz_cksum_report }; @@ -392,6 +398,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; + rm->rm_abd_skip = NULL; + rm->rm_declustered = B_FALSE; asize = 0; @@ -609,6 +617,22 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } } } + + if (!rm->rm_declustered) + return; + + /* IO doesn't span all child vdevs. */ + for (; c < rm->rm_scols; c++) { + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + /* + * Treat skip sectors as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (i = 0; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + } + } } static void @@ -660,6 +684,24 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) } } } + + if (!rm->rm_declustered) + return; + + /* IO doesn't span all child vdevs. */ + for (; c < rm->rm_scols; c++) { + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + + /* + * Treat skip sectors as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (i = 0; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); + } + } } /* @@ -1485,8 +1527,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c, ret; - int code; + int i, c, code; + int cols = 0; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1521,25 +1563,32 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); + if (rm->rm_declustered) + cols = vdev_draid_hide_skip_sectors(rm); + dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); - if (ret != RAIDZ_ORIGINAL_IMPL) - return (ret); + code = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (code != RAIDZ_ORIGINAL_IMPL) + goto out; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_P]) { + code = vdev_raidz_reconstruct_p(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_q(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); break; @@ -1548,8 +1597,10 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_pq(rm, dt, 2); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); @@ -1559,6 +1610,9 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code 
> 0); +out: + if (rm->rm_declustered) + vdev_draid_restore_skip_sectors(rm, cols); return (code); } @@ -1631,7 +1685,7 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1641,6 +1695,38 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t unit_shift = vd->vdev_top->vdev_ashift; + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t b = offset >> unit_shift; + uint64_t s = ((psize - 1) >> unit_shift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + uint64_t c, devidx; + + if (s + nparity >= dcols) /* spans all child vdevs */ + return (B_TRUE); + + for (c = 0; c < s + nparity; c++) { + vdev_t *cvd; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. + */ + devidx = (f + c) % dcols; + cvd = vd->vdev_child[devidx]; + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1835,6 +1921,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_free(orig[c]); } + if (ret != 0 && rm->rm_declustered) + vdev_draid_debug_zio(zio, B_FALSE); return (ret); } @@ -2285,6 +2373,9 @@ vdev_raidz_io_done(zio_t *zio) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + if (rm->rm_declustered) + vdev_draid_fix_skip_sectors(zio); } static void diff --git a/module/zfs/vdev_raidz.h b/module/zfs/vdev_raidz.h new file mode 100644 index 000000000000..2c704d1ffb6b --- /dev/null +++ b/module/zfs/vdev_raidz.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include +#include +#include + +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + +extern void vdev_raidz_generate_parity(raidz_map_t *rm); +extern void vdev_raidz_child_done(zio_t *zio); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d0466709b2e4..6cfafae233f4 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1069,10 +1069,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; + ASSERTV(vdev_t *piovd = pio->io_vd); zio_t *zio; - ASSERT(vd->vdev_parent == - (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); + ASSERT((piovd != NULL && piovd->vdev_ops == &vdev_draid_spare_ops) || + vd->vdev_parent == (piovd ? piovd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* @@ -3255,10 +3256,20 @@ zio_vdev_io_start(zio_t *zio) * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. 
For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdev's + * of such blocks can have empty DTL_PARTIAL. + * + * There seemed no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); @@ -3266,6 +3277,7 @@ zio_vdev_io_start(zio_t *zio) } if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) @@ -3301,8 +3313,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE)