From aaa67a231597cdae63a0d5f184a57d016fe3d34b Mon Sep 17 00:00:00 2001 From: Isaac Huang <he.huang@intel.com> Date: Mon, 27 Feb 2017 11:27:56 -0700 Subject: [PATCH] Port of draid from coral-beta-draid branch Up to and including b6ca80d723b4b9139d3133114a17d019149eb1a4 Signed-off-by: Isaac Huang <he.huang@intel.com> --- .gitignore | 1 + cmd/Makefile.am | 2 +- cmd/draidcfg/.gitignore | 1 + cmd/draidcfg/Makefile.am | 20 + cmd/draidcfg/draid_permutation.c | 763 +++++++++++++++ cmd/draidcfg/draid_permutation.h | 41 + cmd/draidcfg/draidcfg.c | 343 +++++++ cmd/zdb/zdb.c | 3 +- cmd/zpool/zpool_main.c | 23 +- cmd/zpool/zpool_vdev.c | 91 +- configure.ac | 1 + include/libzfs.h | 6 +- include/sys/Makefile.am | 2 + include/sys/dsl_scan.h | 2 + include/sys/fs/zfs.h | 12 + include/sys/metaslab_impl.h | 1 + include/sys/nvpair.h | 1 + include/sys/spa_scan.h | 47 + include/sys/vdev.h | 1 + include/sys/vdev_draid_impl.h | 105 ++ include/sys/vdev_impl.h | 32 + include/sys/vdev_raidz_impl.h | 3 + lib/libzfs/libzfs_import.c | 72 +- lib/libzfs/libzfs_pool.c | 15 +- lib/libzpool/Makefile.am | 2 + module/nvpair/fnvpair.c | 19 +- module/zcommon/zfs_namecheck.c | 4 +- module/zfs/Makefile.in | 2 + module/zfs/dsl_scan.c | 101 +- module/zfs/metaslab.c | 74 +- module/zfs/spa.c | 92 +- module/zfs/spa_scan.c | 383 ++++++++ module/zfs/vdev.c | 61 +- module/zfs/vdev_draid.c | 1551 ++++++++++++++++++++++++++++++ module/zfs/vdev_label.c | 34 +- module/zfs/vdev_mirror.c | 45 +- module/zfs/vdev_raidz.c | 117 ++- module/zfs/vdev_raidz.h | 33 + module/zfs/zio.c | 20 +- 39 files changed, 3988 insertions(+), 138 deletions(-) create mode 100644 cmd/draidcfg/.gitignore create mode 100644 cmd/draidcfg/Makefile.am create mode 100644 cmd/draidcfg/draid_permutation.c create mode 100644 cmd/draidcfg/draid_permutation.h create mode 100644 cmd/draidcfg/draidcfg.c create mode 100644 include/sys/spa_scan.h create mode 100644 include/sys/vdev_draid_impl.h create mode 100644 module/zfs/spa_scan.c create mode 100644 
module/zfs/vdev_draid.c create mode 100644 module/zfs/vdev_raidz.h diff --git a/.gitignore b/.gitignore index 9bdb57abd947..50f26fede982 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ *.mod.c *~ *.swp +*.orig .deps .libs .dirstamp diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 04aa7c6333da..bf5a78dc8886 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -SUBDIRS += arc_summary raidz_test +SUBDIRS += arc_summary raidz_test draidcfg diff --git a/cmd/draidcfg/.gitignore b/cmd/draidcfg/.gitignore new file mode 100644 index 000000000000..ad7c307b04e3 --- /dev/null +++ b/cmd/draidcfg/.gitignore @@ -0,0 +1 @@ +/draidcfg diff --git a/cmd/draidcfg/Makefile.am b/cmd/draidcfg/Makefile.am new file mode 100644 index 000000000000..f587d271860e --- /dev/null +++ b/cmd/draidcfg/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +AM_CPPFLAGS += -DDEBUG + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +bin_PROGRAMS = draidcfg + +draidcfg_SOURCES = \ + draidcfg.c \ + draid_permutation.c \ + draid_permutation.h + +draidcfg_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ + $(top_builddir)/lib/libzfs/libzfs.la +draidcfg_LDADD += -lm diff --git a/cmd/draidcfg/draid_permutation.c b/cmd/draidcfg/draid_permutation.c new file mode 100644 index 000000000000..4753f3f31f66 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.c @@ -0,0 +1,763 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <assert.h> +#include <unistd.h> +#include <string.h> +#include <math.h> + +#include "draid_permutation.h" + + +#define MAX_GROUPSIZE 32 +#define MAX_GROUPS 128 +#define MAX_SPARES 100 +#define MAX_DEVS (MAX_GROUPSIZE * MAX_GROUPS + MAX_SPARES) +#define MAX_ROWS 16384 + +#define UNOPT 0 +#define EVAL_WORST 1 +#define EVAL_MEAN 2 +#define EVAL_RMS 3 + +static int verbose = 0; + +typedef struct +{ + int groupsz; + int ngroups; + int nspares; + int ndevs; + int nrows; + /* each row maps all drives, groups from 0, spares down from ndevs-1 */ + int **rows; + int nbroken; /* # broken drives */ + int *broken; /* which drives are broken */ +} map_t; + +typedef struct +{ + int value; + int order; +} pair_t; + +static void +permute_devs(int *in, int *out, int ndevs) +{ + pair_t tmp[MAX_DEVS]; + int i; + int j; + + if (ndevs == 2) { /* swap */ + i = in[0]; + j = in[1]; + out[0] = j; + out[1] = i; + return; + } + + for (i = 0; i < ndevs; i++) { /* assign random order */ + tmp[i].value = in[i]; + tmp[i].order = mrand48(); + } + + for (i = 1; i < ndevs; i++) /* sort */ + for (j = 0; j < i; j++) + if (tmp[i].order < tmp[j].order) { + pair_t t = tmp[i]; + tmp[i] = tmp[j]; + tmp[j] = t; + } + + for (i = 0; i < ndevs; i++) + out[i] = tmp[i].value; +} + +static void +print_map(map_t *map) +{ + int i; + int j; + + for (i = 0; i < map->nrows; i++) { + for (j = 0; j < map->ndevs; j++) 
{ + if (j == map->ndevs - map->nspares) + printf("S "); + + printf("%2d ", map->rows[i][j]); + } + printf("\n"); + } +} + +static void +check_map(map_t *map) +{ + int i; + int j; + int nrows = map->nrows; + int ndevs = map->ndevs; + int **rows = map->rows; + int devcounts[MAX_DEVS]; + int brokencounts[MAX_DEVS]; + + ASSERT(map->groupsz <= MAX_GROUPSIZE); + ASSERT(map->ngroups <= MAX_GROUPS); + ASSERT(map->nspares <= MAX_SPARES); + ASSERT(map->ndevs == map->nspares + map->ngroups * map->groupsz); + ASSERT(map->nrows <= MAX_ROWS); + ASSERT(map->nbroken <= MAX_SPARES); + + /* Ensure each dev appears once in every row */ + memset(devcounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < nrows; i++) { + int *row = rows[i]; + + for (j = 0; j < ndevs; j++) { + int dev = row[j]; + + ASSERT(0 <= dev && dev < ndevs); + ASSERT(devcounts[dev] == i); + devcounts[dev] = i+1; + } + } + + /* Ensure broken drives only appear once (counts are indexed by device id) */ + memset(brokencounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < map->nbroken; i++) { + int dev = map->broken[i]; + + ASSERT(0 <= dev && dev < map->ndevs); /* valid drive */ + ASSERT(brokencounts[dev] == 0); /* not used already */ + brokencounts[dev] = 1; /* mark the device, not the slot */ + } +} + +static map_t * +dup_map(map_t *oldmap) +{ + int groupsz = oldmap->groupsz; + int ngroups = oldmap->ngroups; + int nspares = oldmap->nspares; + int ndevs = oldmap->ndevs; + int nrows = oldmap->nrows; + map_t *map = malloc(sizeof (map_t)); + int i; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + memcpy(map->rows[i], oldmap->rows[i], sizeof (int) * ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static map_t 
* +new_map(int groupsz, int ngroups, int nspares, int nrows) +{ + map_t *map = malloc(sizeof (map_t)); + int ndevs = nspares + ngroups * groupsz; + int i; + int j; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + + if (i == 0) + for (j = 0; j < ndevs; j++) + map->rows[i][j] = j; + else + permute_devs(map->rows[i-1], map->rows[i], ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static void +free_map(map_t *map) +{ + int i; + + free(map->broken); + for (i = 0; i < map->nrows; i++) + free(map->rows[i]); + free(map->rows); + free(map); +} + +static inline int +is_broken(map_t *map, int dev) +{ + int i; + + for (i = 0; i < map->nbroken; i++) + if (dev == map->broken[i]) + return (1); + + return (0); +} + +static int +eval_resilver(map_t *map, int print) +{ + /* Evaluate how resilvering I/O will be distributed */ + int i; + int j; + int k; + int spare; + int dev; + int ndevs = map->ndevs; + int nspares = map->nspares; + int ngroups = map->ngroups; + int groupsz = map->groupsz; + int nrows = map->nrows; + int writes[MAX_DEVS]; + int reads[MAX_DEVS]; + int max_reads = 0; + int max_writes = 0; + int max_ios = 0; + + memset(reads, 0, sizeof (int) * ndevs); + memset(writes, 0, sizeof (int) * ndevs); + + /* resilver all rows */ + for (i = 0; i < nrows; i++) { + int *row = map->rows[i]; + + /* resilver all groups with broken drives */ + for (j = 0; j < ngroups; j++) { + int fix = 0; + + /* See if any disk in this group is broken */ + for (k = 0; k < groupsz && !fix; k++) + fix = is_broken(map, row[j*groupsz + k]); + + if (!fix) + continue; + + /* + * This group needs fixing + * Read all the 
non-broken drives and write all the + * broken drives to their hot spare for this row + */ + spare = ndevs - nspares; + for (k = 0; k < groupsz; k++) { + dev = row[j*groupsz + k]; + + if (!is_broken(map, dev)) { + reads[dev]++; + } else { + ASSERT(spare < ndevs); + + while (is_broken(map, row[spare])) { + spare++; + ASSERT(spare < ndevs); + } + writes[row[spare++]]++; + } + } + } + } + + /* find drives with most I/O */ + for (i = 0; i < ndevs; i++) { + if (reads[i] > max_reads) + max_reads = reads[i]; + if (writes[i] > max_writes) + max_writes = writes[i]; + + if (reads[i] + writes[i] > max_ios) + max_ios = reads[i] + writes[i]; + } + + if (print) { + printf("Reads: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)reads[i]*ngroups)/nrows); + printf("\n"); + printf("Writes: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)writes[i]*ngroups)/nrows); + printf("\n"); + } + + return (max_ios); +} + +static double +eval_decluster(map_t *map, int how, int faults, int print) +{ + int f1; + int f2; + int ios; + int worst1 = -1; + int worst2 = -1; + int n = 0; + long sum = 0; + long sumsq = 0; + long max_ios = 0; + double val; + + ASSERT(eval_resilver(map, 0) == 0); /* not broken already */ + ASSERT(faults == 1 || faults == 2); + + map->nbroken = faults; + + for (f1 = 0; f1 < map->ndevs; f1++) { + map->broken[0] = f1; + + if (faults < 2) { + ios = eval_resilver(map, 0); /* eval single failure */ + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + max_ios = ios; + } + } else { /* eval double failure */ + for (f2 = f1 + 1; f2 < map->ndevs; f2++) { + map->broken[1] = f2; /* use 2nd hot spare */ + + ios = eval_resilver(map, 0); + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + worst2 = f2; + max_ios = ios; + } + } + } + } + map->nbroken = 0; + + if (print) { + map->nbroken = faults; /* replay the worst case found above */ + map->broken[0] = worst1; + if (faults > 1) /* slot 1 only exists for double faults */ + map->broken[1] = worst2; + eval_resilver(map, 1); + + map->nbroken = 0; + } + + 
switch (how) { + case EVAL_WORST: + /* + * imbalance from worst possible drive failure + * insensitive to failures handled better + */ + val = max_ios; + break; + case EVAL_MEAN: + /* + * average over all possible drive failures + * sensitive to all possible failures + */ + val = ((double)sum)/n; + break; + case EVAL_RMS: + /* + * root mean square over all possible drive failures + * penalizes higher imbalance more + */ + val = sqrt(((double)sumsq)/n); + break; + default: + ASSERT(0); + } + return ((val/map->nrows)*map->ngroups); +} + +static int +rand_in_range(int min, int count) +{ + return (min + drand48()*count); +} + +static void +permute_map(map_t *map, int temp) +{ + static int prev_temp; + + int nrows = (temp < 1) ? 1 : (temp > 100) ? + map->nrows : rand_in_range(1, (map->nrows * temp)/100); + int row = rand_in_range(0, map->nrows - nrows); + int ncols = map->ndevs; + int col = rand_in_range(0, map->ndevs - ncols); + int i; + + if (verbose > 0 && + temp != prev_temp && + (temp < 10 || (temp % 10 == 0))) + printf("Permute t %3d (%d-%d, %d-%d)\n", + temp, col, ncols, row, nrows); + prev_temp = temp; + + for (i = row; i < row + nrows; i++) + permute_devs(&map->rows[i][col], &map->rows[i][col], ncols); +} + +static map_t * +develop_map(map_t *map) +{ + map_t *dmap = new_map(map->groupsz, map->ngroups, + map->nspares, map->nrows * map->ndevs); + int base; + int dev; + int i; + + for (base = 0; base < map->nrows; base++) + for (dev = 0; dev < map->ndevs; dev++) + for (i = 0; i < map->ndevs; i++) + dmap->rows[base*map->ndevs + dev][i] = + (map->rows[base][i] + dev) % map->ndevs; + + return (dmap); +} + +static map_t * +optimize_map(map_t *map, int eval, int faults) +{ + double temp = 100.0; + double alpha = 0.995; + double epsilon = 0.001; + double val = eval_decluster(map, eval, faults, 0); + int ups = 0; + int downs = 0; + int sames = 0; + int iter = 0; + + while (temp > epsilon) { + map_t *map2 = dup_map(map); + double val2; + double delta; + + 
permute_map(map2, (int)temp); + + val2 = eval_decluster(map2, eval, faults, 0); + delta = (val2 - val); + + if (delta < 0 || exp(-10000*delta/temp) > drand48()) { + if (delta > 0) + ups++; + else if (delta < 0) + downs++; + else + sames++; + + free_map(map); + map = map2; + val = val2; + } else { + free_map(map2); + } + + temp *= alpha; + + if ((++iter % 100) == 0) { + if (verbose > 0) + printf("%f (%d ups, %d sames, %d downs)\n", + val, ups, sames, downs); + ups = downs = sames = 0; + } + } + + if (verbose > 0) + printf("%d iters, %d ups %d sames %d downs\n", + iter, ups, sames, downs); + return (map); +} + +static void +print_map_stats(map_t *map, int optimize, int print_ios) +{ + double score = eval_decluster(map, EVAL_WORST, 1, 0); + + printf("%6s (%2d x %2d + %2d) x %5d: %2.3f\n", + (optimize == UNOPT) ? "Unopt" : + (optimize == EVAL_WORST) ? "Worst" : + (optimize == EVAL_MEAN) ? "Mean" : "Rms", + map->ngroups, map->groupsz, map->nspares, map->nrows, score); + + if (map->ndevs < 80 && score >= 1.05) + printf("Warning score %6.3f has over 5 percent imbalance!\n", + score); + else if (score >= 1.1) + printf("Warning score %6.3f has over 10 percent imbalance!\n", + score); + +#ifdef FOOO + printf("Single: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 1, 0), + eval_decluster(map, EVAL_MEAN, 1, 0)); + + printf("Double: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 2, 0), + eval_decluster(map, EVAL_MEAN, 2, 0)); +#endif + + if (print_ios) { + eval_decluster(map, EVAL_WORST, 1, 1); + eval_decluster(map, EVAL_WORST, 2, 1); + } +} + +int +draid_permutation_generate(struct vdev_draid_configuration *cfg) +{ + const int loop = 16; /* HH: make this a parameter */ + const int faults = 1; + const int eval = EVAL_WORST; + + int groupsz = cfg->dcf_data + cfg->dcf_parity; + int nspares = cfg->dcf_spare; + int ngroups = (cfg->dcf_children - nspares) / groupsz; + int nrows; + int i, fd, urand_fd; + long int best_seed; + map_t *best_map; + + fd = 
open("/dev/random", O_RDONLY | O_NONBLOCK); + if (fd == -1) { + perror("Cannot open /dev/random"); /* perror appends ": <errno text>\n" itself */ + return (-1); + } + urand_fd = open("/dev/urandom", O_RDONLY); + + /* HH: fine tune these heuristics */ + if (cfg->dcf_children - nspares > 80) + nrows = 128; /* 81 - ? */ + else if (cfg->dcf_children - nspares > 40) + nrows = 64; /* 41 - 80 */ + else + nrows = 32; /* 1 - 40 */ + + for (i = 0, best_map = NULL; i < loop; i++) { + int rc; + long int seed; + map_t *map, *omap; + + rc = read(fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) { + printf("Not enough entropy at /dev/random: read %d, " + "wanted %zu.\n", rc, sizeof (seed)); + /* urand_fd may not be valid but it does not matter */ + rc = read(urand_fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) + break; + printf("Using /dev/urandom instead.\n"); + } + + srand48(seed); + + map = new_map(groupsz, ngroups, nspares, nrows); + omap = optimize_map(dup_map(map), eval, faults); + if (eval_decluster(omap, eval, faults, 0) > + eval_decluster(map, eval, faults, 0)) { + /* + * optimize_map() may create a worse map, because the + * simulated annealing process may accept worse + * neighbors to avoid getting stuck in local optima + */ + free_map(omap); + } else { + free_map(map); + map = omap; + } + + if (best_map == NULL || + eval_decluster(map, eval, faults, 0) < + eval_decluster(best_map, eval, faults, 0)) { + if (best_map != NULL) + free_map(best_map); + best_map = map; + best_seed = seed; + } else { + free_map(map); + } + } + + close(fd); + close(urand_fd); + if (i != loop) + fprintf(stderr, "Early termination at loop %d. 
Generated " + "permutations may not be optimal!\n", i + 1); + + if (best_map != NULL) { + int j; + map_t *dmap; + uint64_t *perms; + + assert(best_map->nrows == nrows); + assert(best_map->ndevs == cfg->dcf_children); + + perms = malloc(sizeof (*perms) * nrows * best_map->ndevs); + assert(perms != NULL); + + for (i = 0; i < nrows; i++) + for (j = 0; j < best_map->ndevs; j++) + perms[i * best_map->ndevs + j] = + best_map->rows[i][j]; + + cfg->dcf_bases = nrows; + cfg->dcf_base_perms = perms; + + if (verbose > 1) + print_map(best_map); + dmap = develop_map(best_map); + free_map(best_map); + print_map_stats(dmap, eval, 0); + printf("Seed chosen: %lx\n", best_seed); + free_map(dmap); + return (0); + } else { + return (-1); + } +} + +int +debug_main(int argc, char **argv) +{ + int ngroups = 0; + int groupsz = 0; + int nspares = 0; + int nrows = 0; + int optimize = UNOPT; + int faults = 1; + int develop = 0; + map_t *map; + int c; + + while ((c = getopt(argc, argv, "g:d:s:n:vUWMR12D")) != -1) + switch (c) { + case 'D': + develop = 1; + break; + case 'g': + sscanf(optarg, "%d", &ngroups); + break; + case 'd': + sscanf(optarg, "%d", &groupsz); + break; + case 's': + sscanf(optarg, "%d", &nspares); + break; + case 'n': + sscanf(optarg, "%d", &nrows); + break; + case 'v': + verbose++; + break; + case 'U': + optimize = UNOPT; + break; + case 'W': + optimize = EVAL_WORST; + break; + case 'M': + optimize = EVAL_MEAN; + break; + case 'R': + optimize = EVAL_RMS; + break; + case '1': + faults = 1; + break; + case '2': + faults = 2; + break; + default: + fprintf(stderr, "arg???\n"); + return (1); + } + + if (ngroups <= 0 || groupsz <= 0 || nspares <= 0 || nrows <= 0) { + fprintf(stderr, "missing arg???\n"); + return (1); + } + + map = new_map(groupsz, ngroups, nspares, nrows); + if (verbose > 1) + print_map(map); + + if (verbose > 0) + print_map_stats(map, UNOPT, 1); + + if (optimize != UNOPT) { + map = optimize_map(map, optimize, faults); + + if (verbose > 1) + print_map(map); + if 
(verbose > 0) + print_map_stats(map, optimize, 1); + } + + if (develop) { + map_t *dmap = develop_map(map); + + free_map(map); + map = dmap; + } + + print_map_stats(map, optimize, verbose > 0); + return (0); +} diff --git a/cmd/draidcfg/draid_permutation.h b/cmd/draidcfg/draid_permutation.h new file mode 100644 index 000000000000..8562ccf09852 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#ifndef _DRAID_PERMUTATION_H +#define _DRAID_PERMUTATION_H + +#include <sys/vdev_draid_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int draid_permutation_generate(struct vdev_draid_configuration *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DRAID_PERMUTATION_H */ diff --git a/cmd/draidcfg/draidcfg.c b/cmd/draidcfg/draidcfg.c new file mode 100644 index 000000000000..0c3a8375d18c --- /dev/null +++ b/cmd/draidcfg/draidcfg.c @@ -0,0 +1,343 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#include <libzfs.h> +#include <libnvpair.h> +#include <stdio.h> +#include <stdlib.h> +#include <libintl.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> + +#include "draid_permutation.h" + + +static struct vdev_draid_configuration * +draidcfg_find(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + /* P D D... P D D... 
S */ + static const uint64_t bases7[1][7] = {{1, 2, 4, 3, 6, 5, 0}}; + static const uint64_t bases11[1][11] = {{ + 1, 4, 5, 9, 3, 2, 8, 10, 7, 6, 0}}; + static const uint64_t bases19[1][19] = {{ + 1, 5, 6, 11, 17, 9, 7, 16, 4, 10, 12, 3, 15, 18, 14, 13, 8, 2, 0}}; + static const uint64_t bases23[1][23] = {{ + 1, 8, 18, 6, 2, 16, 13, 12, 4, 9, 3, 10, 11, 19, 14, 20, 22, + 15, 5, 17, 21, 7, 0}}; + static const uint64_t bases31[1][31] = {{ + 1, 8, 2, 16, 4, 17, 12, 3, 24, 6, 10, 18, 20, 5, 9, 15, 27, 30, 23, + 29, 7, 25, 14, 19, 28, 26, 22, 21, 13, 11, 0}}; + static const uint64_t bases41[1][41] = {{ + 1, 25, 10, 4, 18, 40, 16, 31, 37, 23, 6, 27, 19, + 24, 26, 35, 14, 22, 17, 15, 36, 39, 32, 21, 33, + 5, 2, 9, 20, 8, 11, 29, 28, 3, 34, 30, 12, 13, 38, 7, 0}}; + + static struct vdev_draid_configuration known_cfgs[6] = { + { + .dcf_data = 2, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 7, + .dcf_bases = 1, .dcf_base_perms = &bases7[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 11, + .dcf_bases = 1, .dcf_base_perms = &bases11[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 19, + .dcf_bases = 1, .dcf_base_perms = &bases19[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 3, .dcf_spare = 1, .dcf_children = 23, + .dcf_bases = 1, .dcf_base_perms = &bases23[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 31, + .dcf_bases = 1, .dcf_base_perms = &bases31[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 2, .dcf_spare = 1, .dcf_children = 41, + .dcf_bases = 1, .dcf_base_perms = &bases41[0][0] + }, + }; + + int i; + + for (i = 0; i < sizeof (known_cfgs) / sizeof (known_cfgs[0]); i++) { + struct vdev_draid_configuration *cfg = &known_cfgs[i]; + + if (data == cfg->dcf_data && parity == cfg->dcf_parity && + spare == cfg->dcf_spare && children == cfg->dcf_children) + return (cfg); + } + + return (NULL); +} + +static struct vdev_draid_configuration * +draidcfg_create(const uint64_t data, 
const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + struct vdev_draid_configuration *cfg = calloc(1, sizeof (*cfg)); + + assert(cfg != NULL); + cfg->dcf_data = data; + cfg->dcf_parity = parity; + cfg->dcf_spare = spare; + cfg->dcf_children = children; + + cfg->dcf_bases = 0; + cfg->dcf_base_perms = NULL; + if (draid_permutation_generate(cfg) != 0) { + free(cfg); + return (NULL); + } + + assert(cfg->dcf_bases != 0); + assert(cfg->dcf_base_perms != NULL); + return (cfg); +} + +static inline void +draidcfg_free(struct vdev_draid_configuration *cfg) +{ + free((void *)cfg->dcf_base_perms); + free(cfg); +} + +static int +draidcfg_create_file(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children, const char *path) +{ + FILE *fp; + size_t len; + int ret = 0; + void *packed; + nvlist_t *nvl; + boolean_t freecfg = B_FALSE; /* set only when cfg was generated here, not a static known config */ + struct vdev_draid_configuration *cfg; + + ASSERT(children != 0); + ASSERT3U(children, <=, VDEV_DRAID_MAX_CHILDREN); + + if (children - 1 > VDEV_DRAID_U8_MAX) { + fprintf(stderr, "Configuration for over %u children " + "is not supported\n", VDEV_DRAID_U8_MAX + 1); + return (1); + } + + cfg = draidcfg_find(data, parity, spare, children); + if (cfg == NULL) { + cfg = draidcfg_create(data, parity, spare, children); + if (cfg == NULL) { + fprintf(stderr, "Cannot create " + "supported configuration\n"); + return (1); + } + freecfg = B_TRUE; + } + + fp = fopen(path, "w+"); + if (fp == NULL) { + fprintf(stderr, "Cannot open file %s for write\n", path); + if (freecfg) + draidcfg_free(cfg); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA, data); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_PARITY, parity); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_SPARE, spare); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_CHILDREN, children); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE, cfg->dcf_bases); + + if (children - 1 <= VDEV_DRAID_U8_MAX) { + 
int i, j; + uint8_t *val = calloc(children * cfg->dcf_bases, sizeof (*val)); + + for (i = 0; i < cfg->dcf_bases; i++) { + for (j = 0; j < children; j++) { + uint64_t c = + cfg->dcf_base_perms[i * children + j]; + + ASSERT3U(c, <, children); + ASSERT3U(c, <=, VDEV_DRAID_U8_MAX); + val[i * children + j] = (uint8_t)c; + } + } + + fnvlist_add_uint8_array(nvl, ZPOOL_CONFIG_DRAIDCFG_PERM, + val, children * cfg->dcf_bases); + free(val); + } else { + ASSERT3U(children, ==, 0); /* not supported yet */ + } + + assert(vdev_draid_config_validate(NULL, nvl)); + + packed = fnvlist_pack_xdr(nvl, &len); + if (fwrite(packed, 1, len, fp) != len) { + ret = 1; + fprintf(stderr, "Cannot write %lu bytes to %s\n", len, path); + } + + fnvlist_pack_free(packed, len); + fnvlist_free(nvl); + if (freecfg) + draidcfg_free(cfg); + fclose(fp); + return (ret); +} + +static void +draidcfg_print(nvlist_t *config) +{ + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b, i; + + n = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + d = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA); + p = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_PARITY); + s = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_SPARE); + b = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE); + + printf("dRAID%lu vdev of %lu child drives: %lu x (%lu data + " + "%lu parity) and %lu distributed spare\n", + p, n, (n - s) / (d + p), d, p, s); + printf("Using %lu base permutation%s\n", b, b > 1 ? "s" : ""); + + VERIFY0(nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c)); + ASSERT3U(c, ==, b * n); + + for (i = 0; i < b; i++) { + int j; + + printf(" "); + for (j = 0; j < n; j++) + printf("%*u,", n > 99 ? 
3 : 2, perm[i * n + j]); + printf("\n"); + } +} + +static inline int usage(void) +{ + printf(gettext("draidcfg [-r] [-n children] [-d data] [-p parity]" + " [-s spare] <configfile>\n")); + return (1); +} + +int +main(int argc, char **argv) +{ + boolean_t read = B_FALSE; + char *cfg = NULL; + uint64_t data = 0, parity = 0, spare = 0, children = 0; + int c; + + while ((c = getopt(argc, argv, ":rn:d:p:s:")) != -1) { /* leading ':' makes getopt return ':' for a missing argument */ + char *endptr; + uint64_t *p = NULL; /* destination for this option's numeric value */ + + switch (c) { + case 'r': + read = B_TRUE; + break; + case 'n': + p = &children; /* FALLTHROUGH */ + case 'd': + if (p == NULL) + p = &data; /* FALLTHROUGH */ + case 'p': + if (p == NULL) + p = &parity; /* FALLTHROUGH */ + case 's': + if (p == NULL) + p = &spare; + + errno = 0; + *p = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + fprintf(stderr, + gettext("Invalid -%c value: %s\n"), + c, optarg); + return (usage()); + } + break; + case ':': + fprintf(stderr, gettext("Missing argument for " + "'%c' option\n"), optopt); + return (usage()); + case '?': + fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + return (usage()); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + fprintf(stderr, + gettext("Missing configuration file argument\n")); + return (usage()); + } + + cfg = argv[0]; + + if (read) { + nvlist_t *nvl = draidcfg_read_file(cfg); + + if (nvl == NULL) { + return (1); + } else { + draidcfg_print(nvl); + nvlist_free(nvl); + return (0); + } + } + + assert(!read); + + if (data == 0 || parity == 0 || spare == 0 || children == 0) { + fprintf(stderr, + gettext("Missing data/parity/spare/children argument\n")); + return (usage()); + } + + if (parity > VDEV_RAIDZ_MAXPARITY) { + fprintf(stderr, gettext("Invalid parity %lu\n"), parity); + return (usage()); + } + + if (children % (data + parity) != spare) { + fprintf(stderr, gettext("Invalid draid configuration\n")); + return (usage()); + } + + return (draidcfg_create_file(data, parity, spare, children, cfg)); +} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 
94b359029c3e..2d6ae9d3d2c5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -799,7 +799,8 @@ dump_metaslab(metaslab_t *msp) } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift) || + vd->vdev_ops == &vdev_draid_ops); mutex_enter(&msp->ms_lock); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index da6744b76b86..26188398a2b3 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -52,6 +52,7 @@ #include <sys/fm/util.h> #include <sys/fm/protocol.h> #include <sys/zfs_ioctl.h> +#include <sys/vdev_draid_impl.h> #include <math.h> #include <libzfs.h> @@ -2238,7 +2239,7 @@ zpool_do_import(int argc, char **argv) char *endptr; /* check options */ - while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:stT:VX")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:stT:VX:")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -5510,7 +5511,8 @@ print_scan_status(pool_scan_stat_t *ps) zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == POOL_SCAN_RESILVER); + ps->pss_func == POOL_SCAN_RESILVER || + ps->pss_func == POOL_SCAN_REBUILD); /* * Scan is finished or canceled. 
*/ @@ -5519,16 +5521,20 @@ print_scan_status(pool_scan_stat_t *ps) char *fmt = NULL; if (ps->pss_func == POOL_SCAN_SCRUB) { - fmt = gettext("scrub repaired %s in %lluh%um with " + fmt = gettext("scrub repaired %s in %lluh%um%us with " "%llu errors on %s"); } else if (ps->pss_func == POOL_SCAN_RESILVER) { - fmt = gettext("resilvered %s in %lluh%um with " + fmt = gettext("resilvered %s in %lluh%um%us with " + "%llu errors on %s"); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + fmt = gettext("rebuilt %s in %lluh%um%us with " "%llu errors on %s"); } /* LINTED */ (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), + (uint_t)((end - start) % 60), (u_longlong_t)ps->pss_errors, ctime((time_t *)&end)); return; @@ -5539,6 +5545,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild canceled on %s"), + ctime(&end)); } return; } @@ -5554,6 +5563,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild in progress since %s"), + ctime(&start)); } examined = ps->pss_examined ? 
ps->pss_examined : 1; @@ -5592,6 +5604,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done\n"), processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("\t%s rebuilt, %.2f%% done\n"), + processed_buf, 100 * fraction_done); } } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index c0d3076d203d..7fa083a64685 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -83,6 +83,8 @@ #include <blkid/blkid.h> #include "zpool_util.h" #include <sys/zfs_context.h> +#include <sys/vdev_draid_impl.h> + /* * For any given vdev specification, we can have multiple errors. The @@ -618,6 +620,7 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for <zfs_vdev_paths>/xxx + * $draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) @@ -660,6 +663,11 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After is_whole_disk() check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (arg[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + ashift = 12; + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -688,17 +696,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -825,7 +835,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -1370,7 +1381,8 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0 || + strncmp(type, VDEV_TYPE_DRAID, 5) == 0) { const char *p = type + 5; char *end; long nparity; @@ -1390,8 +1402,12 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) - *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + *maxdev = VDEV_DRAID_MAX_CHILDREN; + + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0) + return (VDEV_TYPE_RAIDZ); + else + return (VDEV_TYPE_DRAID); } if (maxdev != NULL) @@ -1460,6 +1476,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; + nvlist_t *draidcfg = NULL; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { @@ -1516,6 +1533,34 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + + if (strcmp(type, VDEV_TYPE_DRAID) == 0 && + strncmp(argv[c], "cfg=", 4) == 0) { + if (draidcfg == NULL) { + draidcfg = + draidcfg_read_file(argv[c] + + 4); + if (draidcfg != NULL) + continue; + fprintf(stderr, + gettext("invalid draid " +
"configuration '%s'\n"), + argv[c]); + } else { + fprintf(stderr, + gettext("dRAID config " + "specified more than " + "once: %s\n"), argv[c]); + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + if (draidcfg != NULL) + nvlist_free(draidcfg); + return (NULL); + } + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1570,7 +1615,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); @@ -1582,6 +1628,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); + + if (draidcfg != NULL) { + ASSERT0(strcmp(type, VDEV_TYPE_DRAID)); + + if (!vdev_draid_config_add(nv, + draidcfg)) + fprintf(stderr, + gettext("ignoring invalid " + "draid config\n")); + + nvlist_free(draidcfg); + draidcfg = NULL; + } } } else { /* diff --git a/configure.ac b/configure.ac index 60416f6aeb0d..7f2880989fa1 100644 --- a/configure.ac +++ b/configure.ac @@ -113,6 +113,7 @@ AC_CONFIG_FILES([ cmd/arc_summary/Makefile cmd/zed/Makefile cmd/raidz_test/Makefile + cmd/draidcfg/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/dracut/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index 0b5fe76dc5f4..ba5e608abd87 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -404,7 +404,6 @@ typedef struct importargs { extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); /* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); @@ -823,6 +822,11 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern
int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * dRAID import support + */ +nvlist_t *draidcfg_read_file(const char *); + /* * Mappings between vdev and FRU. */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 956643801c66..f01c1fc03517 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -55,6 +55,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/space_reftree.h \ $(top_srcdir)/include/sys/spa.h \ $(top_srcdir)/include/sys/spa_impl.h \ + $(top_srcdir)/include/sys/spa_scan.h \ $(top_srcdir)/include/sys/spa_checksum.h \ $(top_srcdir)/include/sys/sysevent.h \ $(top_srcdir)/include/sys/trace.h \ @@ -85,6 +86,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_impl.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ + $(top_srcdir)/include/sys/vdev_draid_impl.h \ $(top_srcdir)/include/sys/xvattr.h \ $(top_srcdir)/include/sys/zap.h \ $(top_srcdir)/include/sys/zap_impl.h \ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 44a11ba57207..9b7571828530 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -118,6 +118,8 @@ typedef struct dsl_scan { boolean_t scn_async_destroying; boolean_t scn_async_stalled; + boolean_t scn_is_sequential; /* sequential rebuild? 
*/ + vdev_t *scn_vd; /* vdev to scan, valid only if scn_is_sequential */ /* for debugging / information */ uint64_t scn_visited_this_txg; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 962698c2f37a..1223255996ac 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -641,6 +641,15 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" + +#define ZPOOL_CONFIG_DRAIDCFG "com.intel:draid_config" +#define ZPOOL_CONFIG_DRAIDCFG_DATA "com.intel:draid_data" +#define ZPOOL_CONFIG_DRAIDCFG_PARITY "com.intel:draid_parity" +#define ZPOOL_CONFIG_DRAIDCFG_SPARE "com.intel:draid_spare" +#define ZPOOL_CONFIG_DRAIDCFG_BASE "com.intel:draid_base" +#define ZPOOL_CONFIG_DRAIDCFG_CHILDREN "com.intel:draid_children" +#define ZPOOL_CONFIG_DRAIDCFG_PERM "com.intel:draid_perm" + /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. 
This is because a device can be in multiple states, such @@ -669,6 +678,8 @@ typedef struct zpool_rewind_policy { #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -759,6 +770,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_REBUILD, /* sequential SPA scan */ POOL_SCAN_FUNCS } pool_scan_func_t; diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index f8a713a4f1ff..ba0018497e34 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -338,6 +338,7 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + boolean_t ms_rebuilding; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index d2dfad5ca2b3..9c358c0af920 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -281,6 +281,7 @@ nvlist_t *fnvlist_alloc(void); void fnvlist_free(nvlist_t *); size_t fnvlist_size(nvlist_t *); char *fnvlist_pack(nvlist_t *, size_t *); +char *fnvlist_pack_xdr(nvlist_t *, size_t *); void fnvlist_pack_free(char *, size_t); nvlist_t *fnvlist_unpack(char *, size_t); nvlist_t *fnvlist_dup(nvlist_t *); diff --git a/include/sys/spa_scan.h b/include/sys/spa_scan.h new file mode 100644 index 000000000000..df2a3ed3baa4 --- /dev/null +++ b/include/sys/spa_scan.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _SYS_SPA_SCAN_H +#define _SYS_SPA_SCAN_H + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/dsl_pool.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t spa_scan_enabled(const spa_t *); +extern void spa_scan_setup_sync(dmu_tx_t *); +extern void spa_scan_start(spa_t *, vdev_t *, uint64_t); +extern int spa_scan_rebuild_cb(dsl_pool_t *, + const blkptr_t *, const zbookmark_phys_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_SCAN_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 4f54b1707c54..ed5aad66057e 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -107,6 +107,7 @@ extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead_at(vdev_t *vd, uint64_t offset); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h new file mode 100644 index 000000000000..33a251dfcbe1 --- /dev/null +++ b/include/sys/vdev_draid_impl.h @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _VDEV_DRAID_IMPL_H +#define _VDEV_DRAID_IMPL_H + +#include <sys/types.h> +#include <sys/abd.h> +#include <sys/nvpair.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zio zio_t; +typedef struct vdev vdev_t; +typedef struct raidz_map raidz_map_t; + +struct vdev_draid_configuration { + uint64_t dcf_data; + uint64_t dcf_parity; + uint64_t dcf_spare; + uint64_t dcf_children; + uint64_t dcf_bases; + abd_t *dcf_zero_abd; + const uint64_t *dcf_base_perms; +}; + +extern boolean_t vdev_draid_ms_mirrored(const vdev_t *, uint64_t); +extern boolean_t vdev_draid_group_degraded(vdev_t *, vdev_t *, + uint64_t, uint64_t, boolean_t); +extern uint64_t vdev_draid_check_block(const vdev_t *vd, uint64_t, uint64_t); +extern uint64_t vdev_draid_get_astart(const vdev_t *, const uint64_t); +extern uint64_t vdev_draid_offset2group(const vdev_t *, uint64_t, boolean_t); +extern uint64_t vdev_draid_group2offset(const vdev_t *, uint64_t, boolean_t); +extern boolean_t vdev_draid_is_remainder_group(const vdev_t *, + uint64_t, boolean_t); +extern uint64_t vdev_draid_get_groupsz(const vdev_t *, boolean_t); +extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); +extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); +extern void vdev_draid_fix_skip_sectors(zio_t *); +extern int 
vdev_draid_hide_skip_sectors(raidz_map_t *); +extern void vdev_draid_restore_skip_sectors(raidz_map_t *, int); +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_is_dead(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern nvlist_t *vdev_draid_spare_read_config(vdev_t *); + +#define VDEV_DRAID_MAX_CHILDREN 255 +#define VDEV_DRAID_U8_MAX ((uint8_t)-1) + +#define VDEV_DRAID_SPARE_PATH_FMT "$"VDEV_TYPE_DRAID"%lu-%lu-s%lu" + +/* trace_printk is GPL only */ +#undef DRAID_USE_TRACE_PRINTK + +#ifdef _KERNEL +#define U64FMT "%llu" +#ifdef DRAID_USE_TRACE_PRINTK +#define draid_print(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +#else +#define draid_print(fmt, ...) printk(fmt, ##__VA_ARGS__) +#endif +#else +#define U64FMT "%lu" +#define draid_print(fmt, ...) printf(fmt, ##__VA_ARGS__) +#endif + +extern int draid_debug_lvl; +extern void vdev_draid_debug_zio(zio_t *, boolean_t); + +#define draid_dbg(lvl, fmt, ...) 
\ + do { \ + if (draid_debug_lvl >= (lvl)) \ + draid_print(fmt, ##__VA_ARGS__); \ + } while (0) + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDEV_DRAID_IMPL_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index d7f11a2b885d..8b15d1f3b614 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -32,6 +32,7 @@ #include <sys/nvpair.h> #include <sys/space_map.h> #include <sys/vdev.h> +#include <sys/abd.h> #include <sys/dkio.h> #include <sys/uberblock_impl.h> #include <sys/zfs_ratelimit.h> @@ -185,6 +186,7 @@ struct vdev { boolean_t vdev_ishole; /* is a hole in the namespace */ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + nvlist_t *vdev_cfg; /* additional configuration */ /* * The queue depth parameters determine how many async writes are @@ -355,12 +357,39 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; +/* + * Virtual device vector for mirroring.
+ */ +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +extern mirror_map_t *vdev_mirror_map_alloc(int, boolean_t, boolean_t); +extern const zio_vsd_ops_t vdev_mirror_vsd_ops; + /* * Common size functions */ @@ -368,6 +397,9 @@ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern boolean_t vdev_raidz_need_resilver(vdev_t *, uint64_t, size_t); +extern boolean_t vdev_draid_need_resilver(vdev_t *, uint64_t, size_t); + /* * Global variables */ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 4bd15e3d53c2..39941250e634 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -106,6 +106,7 @@ typedef struct raidz_col { size_t rc_offset; /* device offset */ size_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + abd_t *rc_abd_skip; /* Skip sector */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ unsigned int rc_tried; /* Did we attempt this I/O column? */ @@ -123,10 +124,12 @@ typedef struct raidz_map { size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_skipstart; /* Column index of padding start */ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_skip; /* dRAID skip sectors */ size_t rm_reports; /* # of referencing checksum reports */ unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_ecksuminjected; /* checksum error was injected */ raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + boolean_t rm_declustered; /* dRAID? 
*/ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index ce65840905b7..51dfc4937ddc 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -61,6 +61,7 @@ #include <sys/dktp/fdisk.h> #include <sys/efi_partition.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> #include <blkid/blkid.h> #include "libzfs.h" #include "libzfs_impl.h" @@ -862,7 +863,7 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) /* * Determine if the vdev id is a hole in the namespace. */ -boolean_t +static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; @@ -876,6 +877,64 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) return (B_FALSE); } +nvlist_t * +draidcfg_read_file(const char *path) +{ + int fd; + struct stat64 sb; + char *buf; + nvlist_t *config; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, "Cannot open '%s'\n", path); + return (NULL); + } + + if (fstat64(fd, &sb) != 0) { + (void) fprintf(stderr, "Failed to stat '%s'\n", path); + close(fd); + return (NULL); + } + + if (!S_ISREG(sb.st_mode)) { + (void) fprintf(stderr, "Not a regular file '%s'\n", path); + close(fd); + return (NULL); + } + + if ((buf = malloc(sb.st_size)) == NULL) { + (void) fprintf(stderr, "Failed to allocate %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + return (NULL); + } + + if (read(fd, buf, sb.st_size) != sb.st_size) { + (void) fprintf(stderr, "Failed to read %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + free(buf); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "Failed to unpack nvlist\n"); + free(buf); + return (NULL); + } + + free(buf); + + if (!vdev_draid_config_validate(NULL, config)) { + nvlist_free(config); + return (NULL); + } + + return (config); +} + /* * Convert our list of pools into the definitive set of 
configurations. We * start by picking the best config for each toplevel vdev. Once that's done, @@ -1982,17 +2041,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) return (ret); } -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 9abea9023e03..0708b49c487a 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -41,6 +41,7 @@ #include <sys/efi_partition.h> #include <sys/vtoc.h> #include <sys/zfs_ioctl.h> +#include <sys/vdev_draid_impl.h> #include <dlfcn.h> #include "zfs_namecheck.h" @@ -942,6 +943,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -2037,6 +2039,8 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, verify(strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(type, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, @@ -2149,6 +2153,7 @@ boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); @@ -2694,6 +2699,10 @@ zpool_vdev_attach(zpool_handle_t *zhp, if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a 
log with a spare")); + else if (new_disk[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dspare can only replace a child " + "drive in its parent draid vdev")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3519,7 +3528,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * Remove the partition from the path it this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3529,7 +3539,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * If it's a raidz device, we need to stick in the parity level. */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 || + strcmp(path, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 1e95f8064c8f..5fc5b0b9ffbd 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -86,6 +86,7 @@ KERNEL_C = \ spa_history.c \ spa_misc.c \ spa_stats.c \ + spa_scan.c \ space_map.c \ space_reftree.c \ txg.c \ @@ -109,6 +110,7 @@ KERNEL_C = \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_aarch64_neon.c \ vdev_raidz_math_aarch64_neonx2.c \ + vdev_draid.c \ vdev_root.c \ zap.c \ zap_leaf.c \ diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c index a91b9524d8a0..4efb9048f745 100644 --- a/module/nvpair/fnvpair.c +++ b/module/nvpair/fnvpair.c @@ -73,15 +73,26 @@ fnvlist_size(nvlist_t *nvl) * Returns allocated buffer of size *sizep. Caller must free the buffer with * fnvlist_pack_free(). 
*/ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) +static char * +fnvlist_pack_enc(nvlist_t *nvl, size_t *sizep, int encoding) { char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); + VERIFY3U(nvlist_pack(nvl, &packed, sizep, encoding, KM_SLEEP), ==, 0); return (packed); } +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_NATIVE)); +} + +char * +fnvlist_pack_xdr(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_XDR)); +} + /*ARGSUSED*/ void fnvlist_pack_free(char *pack, size_t size) diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index f9c20896d460..9008650d6482 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -326,7 +326,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6712b9b3c04b..9b642b535d5d 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -58,6 +58,7 @@ $(MODULE)-objs += spa_config.o $(MODULE)-objs += spa_errlog.o $(MODULE)-objs += spa_history.o $(MODULE)-objs += spa_misc.o +$(MODULE)-objs += spa_scan.o $(MODULE)-objs += spa_stats.o $(MODULE)-objs += space_map.o $(MODULE)-objs += space_reftree.o @@ -76,6 +77,7 @@ $(MODULE)-objs += vdev_queue.o $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_draid.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f5ef2268d2fd..201c809e46c9 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -24,6 
+24,7 @@ * Copyright 2016 Gary Mills */ +#include <sys/spa_scan.h> #include <sys/dsl_scan.h> #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h> @@ -78,6 +79,8 @@ unsigned long zfs_free_max_blocks = 100000; #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) +#define DSL_SCAN_IS_REBUILD(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_REBUILD) /* * Enable/disable the processing of the free_bpobj object. @@ -89,6 +92,7 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + spa_scan_rebuild_cb, /* POOL_SCAN_REBUILD */ }; int @@ -339,7 +343,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) || DSL_SCAN_IS_REBUILD(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, @@ -1526,11 +1530,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + if (scn->scn_is_sequential) + func = POOL_SCAN_REBUILD; + else + func = POOL_SCAN_RESILVER; + } zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); - dsl_scan_setup_sync(&func, tx); + if (func == POOL_SCAN_REBUILD) + spa_scan_setup_sync(tx); + else + dsl_scan_setup_sync(&func, tx); } /* @@ -1553,6 +1564,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; + if (DSL_SCAN_IS_REBUILD(scn)) { + if (scn->scn_visited_this_txg == 19890604) { + ASSERT(!scn->scn_pausing); + /* finished with scan. 
*/ + dsl_scan_done(scn, B_TRUE, tx); + scn->scn_visited_this_txg = 0; + dsl_scan_sync_state(scn, tx); + } + /* Rebuild is mostly handled in the open-context scan thread */ + return; + } + scn->scn_visited_this_txg = 0; scn->scn_pausing = B_FALSE; scn->scn_sync_start_time = gethrtime(); @@ -1754,6 +1777,8 @@ dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) } else { dp->dp_scan->scn_restart_txg = txg; } + dp->dp_scan->scn_vd = NULL; + dp->dp_scan->scn_is_sequential = B_FALSE; zfs_dbgmsg("restarting resilver txg=%llu", txg); } @@ -1836,6 +1861,44 @@ dsl_scan_scrub_done(zio_t *zio) mutex_exit(&spa->spa_scrub_lock); } +static int zfs_no_resilver_skip = 1; + +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, + size_t size, uint64_t phys_birth) +{ + vdev_t *vd; + uint64_t offset; + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. 
+ */ + return (B_TRUE); + } + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (zfs_no_resilver_skip != 0) + return (B_TRUE); + + offset = DVA_GET_OFFSET(dva); + if (vd->vdev_ops == &vdev_raidz_ops) + return (vdev_raidz_need_resilver(vd, offset, size)); + + if (vd->vdev_ops == &vdev_draid_ops) + return (vdev_draid_need_resilver(vd, offset, size)); + + return (B_TRUE); +} + static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) @@ -1875,33 +1938,19 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, zio_flags |= ZIO_FLAG_SPECULATIVE; for (d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); + const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. 
- */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, + size, phys_birth); } if (needs_io && !zfs_no_scrub_io) { @@ -1981,6 +2030,10 @@ MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg"); module_param(zfs_resilver_min_time_ms, int, 0644); MODULE_PARM_DESC(zfs_resilver_min_time_ms, "Min millisecs to resilver per txg"); +module_param(zfs_no_resilver_skip, int, 0644); +MODULE_PARM_DESC(zfs_no_resilver_skip, + "Set to disable skipping spurious resilver IO"); + module_param(zfs_no_scrub_io, int, 0644); MODULE_PARM_DESC(zfs_no_scrub_io, "Set to disable scrub I/O"); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3fd7f9049c8b..48268aa1a94a 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -30,6 +30,7 @@ #include <sys/space_map.h> #include <sys/metaslab_impl.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> #include <sys/zio.h> #include <sys/spa_impl.h> #include <sys/zfeature.h> @@ -1103,8 +1104,8 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, + uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); @@ -1112,8 +1113,27 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t next_offset; + + if (vd->vdev_ops != &vdev_draid_ops) { + *cursor = offset + size; + return (offset); + } + + next_offset = vdev_draid_check_block(vd, offset, size); + if (next_offset == offset) { + *cursor = offset + size; + return (offset); + } + + offset = P2ROUNDUP(next_offset, align); + if (offset + size <= rs->rs_end) { + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, size)); + *cursor = offset + size; + return (offset); + } } rs = AVL_NEXT(t, rs); } @@ -1126,7 +1146,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ @@ -1150,7 +1170,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_tree->rt_root; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { @@ -1202,7 +1222,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { @@ -1408,6 +1428,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, ms->ms_id 
= id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + if (vd->vdev_ops == &vdev_draid_ops) { + uint64_t astart = vdev_draid_get_astart(vd, ms->ms_start); + + ms->ms_size -= astart - ms->ms_start; + ms->ms_start = astart; + } /* * We only open space map objects that already exist. All others @@ -2695,6 +2721,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2707,7 +2734,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_remove(rt, start, size); if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); @@ -2726,18 +2753,26 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { + vdev_t *vd = mg->mg_vd; metaslab_t *msp = NULL; metaslab_t *search; uint64_t offset = -1ULL; + boolean_t hybrid_mirror = B_FALSE; uint64_t activation_weight; uint64_t target_distance; int i; + if (vd->vdev_ops == &vdev_draid_ops && + psize <= (1ULL << vd->vdev_top->vdev_ashift)) { + hybrid_mirror = B_TRUE; + } + activation_weight = METASLAB_WEIGHT_PRIMARY; for (i = 0; i < d; i++) { - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + if (DVA_GET_VDEV(&dva[i]) == vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; break; } @@ -2778,10 +2813,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } + if (vd->vdev_ops == &vdev_draid_ops && + hybrid_mirror != + 
vdev_draid_ms_mirrored(vd, msp->ms_id)) + continue; + /* * If the selected metaslab is condensing, skip it. */ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_rebuilding) continue; was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; @@ -2857,7 +2897,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we can't manipulate this metaslab until it's committed * to disk. */ - if (msp->ms_condensing) { + if (msp->ms_condensing || msp->ms_rebuilding) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING); mutex_exit(&msp->ms_lock); @@ -2921,12 +2961,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { uint64_t offset; ASSERT(mg->mg_initialized); - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + offset = metaslab_group_alloc_normal(mg, zal, psize, asize, txg, min_distance, dva, d); mutex_enter(&mg->mg_lock); @@ -3119,8 +3160,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, zal, asize, txg, distance, - dva, d); + offset = metaslab_group_alloc(mg, zal, psize, asize, txg, + distance, dva, d); if (offset != -1ULL) { /* @@ -3303,6 +3344,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) } VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d632d635e916..a14eeec06bc1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -49,6 +49,7 @@ #include 
<sys/ddt.h> #include <sys/vdev_impl.h> #include <sys/vdev_disk.h> +#include <sys/vdev_draid_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/uberblock_impl.h> @@ -68,6 +69,7 @@ #include <sys/systeminfo.h> #include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> +#include <sys/spa_scan.h> #include <sys/dsl_scan.h> #include <sys/zfeature.h> #include <sys/dsl_destroy.h> @@ -3750,6 +3752,72 @@ spa_l2cache_drop(spa_t *spa) } } +static int +spa_add_draid_spare(nvlist_t *nvroot, vdev_t *rvd) +{ + int i, j, n; + nvlist_t **oldspares, **newspares; + uint_t nspares; + vdev_t *c; + struct vdev_draid_configuration *cfg; + + for (i = 0, n = 0; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops == &vdev_draid_ops) { + cfg = c->vdev_tsd; + ASSERT(cfg != NULL); + n += cfg->dcf_spare; + } + } + + if (n == 0) + return (0); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &oldspares, &nspares) != 0) + nspares = 0; + + newspares = kmem_alloc(sizeof (*newspares) * (n + nspares), KM_SLEEP); + for (i = 0; i < nspares; i++) + newspares[i] = fnvlist_dup(oldspares[i]); + + for (i = 0, n = nspares; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops != &vdev_draid_ops) + continue; + + cfg = c->vdev_tsd; + for (j = 0; j < cfg->dcf_spare; j++) { + nvlist_t *ds = fnvlist_alloc(); + char path[64]; + + snprintf(path, sizeof (path), VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned)c->vdev_nparity, + (long unsigned)c->vdev_id, (long unsigned)j); + fnvlist_add_string(ds, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(ds, + ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(ds, + ZPOOL_CONFIG_ASHIFT, c->vdev_ashift); + + newspares[n] = ds; + n++; + } + } + + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, 
ZPOOL_CONFIG_SPARES, newspares, n); + for (i = 0; i < n; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, sizeof (*newspares) * n); + return (0); +} + /* * Pool Creation */ @@ -3854,6 +3922,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_add_draid_spare(nvroot, rvd)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) { @@ -4583,6 +4652,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) char *oldvdpath, *newvdpath; int newvd_isspare; int error; + boolean_t rebuild = B_FALSE; ASSERTV(vdev_t *rvd = spa->spa_root_vdev); ASSERT(spa_writeable(spa)); @@ -4614,6 +4684,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); + /* + * dRAID spare can only replace a child drive of its parent + * dRAID vdev + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* * Spares can't replace logs */ @@ -4731,8 +4809,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -4748,12 +4826,20 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); + if (spa_scan_enabled(spa) && + (tvd->vdev_ops == &vdev_mirror_ops || + newvd->vdev_ops == &vdev_draid_spare_ops)) + rebuild = B_TRUE; /* HH: let zpool cmd choose */ + /* * Schedule the resilver to restart in the future. 
We do this to
 	 * ensure that dmu_sync-ed blocks have been stitched into the
 	 * respective datasets.
 	 */
-	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+	if (rebuild)
+		spa_scan_start(spa, oldvd, dtl_max_txg);
+	else
+		dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
diff --git a/module/zfs/spa_scan.c b/module/zfs/spa_scan.c
new file mode 100644
index 000000000000..927911ef3456
--- /dev/null
+++ b/module/zfs/spa_scan.c
@@ -0,0 +1,429 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Completion callback for a rebuild read issued by spa_scan_rebuild_block().
+ * Frees the data buffer and the block pointer allocated for the read,
+ * accounts the examined space and any error, and wakes up a thread that
+ * waits in spa_scan_rebuild_block() for in-flight I/Os to complete.
+ */
+static void
+spa_scan_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+	uint64_t asize;
+
+	ASSERT(zio->io_bp != NULL);
+
+	/* Fetch the size before the bp passed to zio_read() is freed. */
+	asize = DVA_GET_ASIZE(&zio->io_bp->blk_dva[0]);
+
+	abd_free(zio->io_abd);
+	kmem_free(zio->io_private, sizeof (blkptr_t));
+
+	/*
+	 * Rebuild reads may complete concurrently, so the scan counters
+	 * are updated under the lock that also covers spa_scrub_inflight.
+	 */
+	mutex_enter(&spa->spa_scrub_lock);
+
+	scn->scn_phys.scn_examined += asize;
+	spa->spa_scan_pass_exam += asize;
+
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (zio->io_error && (zio->io_error != ECKSUM ||
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+		scn->scn_phys.scn_errors++;
+	}
+
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Throttle on the number of rebuild reads in flight.  Exposed as a module
+ * parameter; spa_scan_enabled() reports rebuild as disabled unless it is
+ * positive.
+ */
+static int spa_scan_max_rebuild = 4096;
+
+/*
+ * Issue one asynchronous rebuild read, as a child of pio, covering asize
+ * bytes of allocated space at offset on vd, which must be a mirror or a
+ * dRAID vdev.  A block pointer without checksum or compression is made up
+ * for the range, with its logical size derived from asize according to the
+ * vdev layout.  Waits while spa_scrub_inflight exceeds
+ * spa_scan_max_rebuild.
+ */
+static void
+spa_scan_rebuild_block(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t asize)
+{
+	/* HH: maybe bp can be on the stack */
+	blkptr_t *bp = kmem_alloc(sizeof (*bp), KM_SLEEP);
+	dva_t *dva = bp->blk_dva;
+	uint64_t psize;
+	spa_t *spa = vd->vdev_spa;
+	ASSERTV(uint64_t ashift = vd->vdev_top->vdev_ashift);
+
+	ASSERT(vd->vdev_ops == &vdev_draid_ops ||
+	    vd->vdev_ops == &vdev_mirror_ops);
+
+	if (vd->vdev_ops == &vdev_mirror_ops) {
+		psize = asize;
+		ASSERT3U(asize, ==, vdev_psize_to_asize(vd, psize));
+	} else if (vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift)) {
+		ASSERT0((asize >> ashift) % (1 + vd->vdev_nparity));
+		psize = asize / (1 + vd->vdev_nparity);
+	} else {
+		struct vdev_draid_configuration *cfg = vd->vdev_tsd;
+
+		ASSERT0((asize >> ashift) % (cfg->dcf_data + vd->vdev_nparity));
+		psize = (asize / (cfg->dcf_data + vd->vdev_nparity)) *
+		    cfg->dcf_data;
+	}
+
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight > spa_scan_max_rebuild)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	spa->spa_scrub_inflight++;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	BP_ZERO(bp);
+
+	DVA_SET_VDEV(&dva[0], vd->vdev_id);
+	DVA_SET_OFFSET(&dva[0], offset);
+	DVA_SET_GANG(&dva[0], 0);
+	DVA_SET_ASIZE(&dva[0], asize);
+
+	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+	BP_SET_LSIZE(bp, psize);
+	BP_SET_PSIZE(bp, psize);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	/* Ownership of bp passes to spa_scan_done(), which frees it. */
+	zio_nowait(zio_read(pio, spa, bp,
+	    abd_alloc(psize, B_FALSE), psize, spa_scan_done, bp,
+	    ZIO_PRIORITY_SCRUB, ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL));
+}
+
+/*
+ * Rebuild length bytes of vd starting at offset, in chunks no larger than
+ * the allocated size of a maximum-sized block.
+ */
+static void
+spa_scan_rebuild(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t length)
+{
+	uint64_t max_asize, chunksz;
+
+	if (vd->vdev_ops == &vdev_draid_ops &&
+	    vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift))
+		max_asize = SPA_MAXBLOCKSIZE * (1 + vd->vdev_nparity);
+	else
+		max_asize = vdev_psize_to_asize(vd, SPA_MAXBLOCKSIZE);
+
+	while (length > 0) {
+		chunksz = MIN(length, max_asize);
+		spa_scan_rebuild_block(pio, vd, offset, chunksz);
+
+		length -= chunksz;
+		offset += chunksz;
+	}
+}
+
+/* Arguments handed from spa_scan_start() to spa_scan_thread(). */
+typedef struct {
+	vdev_t *ssa_vd;		/* vdev passed to spa_scan_start() */
+	uint64_t ssa_dtl_max;	/* txg to wait for before scanning */
+} spa_scan_arg_t;
+
+/*
+ * Body of the rebuild thread created by spa_scan_start().  After the DTL
+ * txg has synced, each metaslab of the top-level vdev is visited in turn:
+ * it is flagged ms_rebuilding, its allocated segments are loaded from the
+ * space map, and rebuild reads are issued for them.  On dRAID, only chunks
+ * for which vdev_draid_group_degraded() returns true are read.  Progress is
+ * not recorded here, so every rebuild starts at the first metaslab.
+ */
+static void
+spa_scan_thread(void *arg)
+{
+	spa_scan_arg_t *sscan = arg;
+	vdev_t *vd = sscan->ssa_vd->vdev_top;
+	spa_t *spa = vd->vdev_spa;
+	zio_t *pio = zio_root(spa, NULL, NULL, 0);
+	range_tree_t *allocd_segs;
+	kmutex_t lock;
+	uint64_t msi;
+	int err;
+
+	/*
+	 * Wait for newvd's DTL to propagate upward when
+	 * spa_vdev_exit() calls vdev_dtl_reassess().
+	 */
+	txg_wait_synced(spa->spa_dsl_pool, sscan->ssa_dtl_max);
+
+	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
+	allocd_segs = range_tree_create(NULL, NULL, &lock);
+
+	for (msi = 0; msi < vd->vdev_ms_count; msi++) {
+		metaslab_t *msp = vd->vdev_ms[msi];
+
+		ASSERT0(range_tree_space(allocd_segs));
+
+		mutex_enter(&msp->ms_lock);
+
+		/* Poll until any condense of this metaslab has finished. */
+		while (msp->ms_condensing) {
+			mutex_exit(&msp->ms_lock);
+
+			zfs_sleep_until(gethrtime() + 100 * MICROSEC);
+
+			mutex_enter(&msp->ms_lock);
+		}
+
+		VERIFY(!msp->ms_condensing);
+		VERIFY(!msp->ms_rebuilding);
+		msp->ms_rebuilding = B_TRUE;
+
+		/*
+		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+		 * read the allocated segments from the space map object
+		 * into allocd_segs.  ms_lock is held while this is done.
+		 */
+		if (msp->ms_sm != NULL) {
+			space_map_t *sm = NULL;
+
+			/*
+			 * We have to open a new space map here, because
+			 * ms_sm's sm_length and sm_alloc may not reflect
+			 * what's in the object contents, if we are in between
+			 * metaslab_sync() and metaslab_sync_done().
+			 *
+			 * Note: space_map_open() drops and reacquires the
+			 * caller-provided lock. Therefore we can not provide
+			 * ms_lock, and a lock private to this thread is used.
+			 */
+			VERIFY0(space_map_open(&sm,
+			    spa->spa_dsl_pool->dp_meta_objset,
+			    msp->ms_sm->sm_object, msp->ms_sm->sm_start,
+			    msp->ms_sm->sm_size, msp->ms_sm->sm_shift, &lock));
+			mutex_enter(&lock);
+			space_map_update(sm);
+			VERIFY0(space_map_load(sm, allocd_segs, SM_ALLOC));
+			mutex_exit(&lock);
+			space_map_close(sm);
+		}
+		mutex_exit(&msp->ms_lock);
+
+		zfs_dbgmsg("Scanning %llu segments for metaslab %llu",
+		    avl_numnodes(&allocd_segs->rt_root), msp->ms_id);
+
+		mutex_enter(&lock);
+		while (range_tree_space(allocd_segs) != 0) {
+			boolean_t mirror;
+			uint64_t offset, length;
+			range_seg_t *rs = avl_first(&allocd_segs->rt_root);
+
+			ASSERT(rs != NULL);
+			offset = rs->rs_start;
+			length = rs->rs_end - rs->rs_start;
+
+			range_tree_remove(allocd_segs, offset, length);
+			mutex_exit(&lock);
+
+			draid_dbg(1, "MS ("U64FMT" at "U64FMT"K) segment: "
+			    U64FMT"K + "U64FMT"K\n",
+			    msp->ms_id, msp->ms_start >> 10,
+			    (offset - msp->ms_start) >> 10, length >> 10);
+
+			if (vd->vdev_ops == &vdev_mirror_ops) {
+				spa_scan_rebuild(pio, vd, offset, length);
+				mutex_enter(&lock);
+				continue;
+			}
+
+			ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+			mirror = vdev_draid_ms_mirrored(vd, msi);
+
+			while (length > 0) {
+				uint64_t group, group_left, chunksz;
+				char *action = "Skipping";
+
+				/*
+				 * HH: make sure we don't cross redundancy
+				 * group boundary
+				 */
+				group =
+				    vdev_draid_offset2group(vd, offset, mirror);
+				group_left = vdev_draid_group2offset(vd,
+				    group + 1, mirror) - offset;
+				ASSERT(!vdev_draid_is_remainder_group(vd,
+				    group, mirror));
+				ASSERT3U(group_left, <=,
+				    vdev_draid_get_groupsz(vd, mirror));
+
+				chunksz = MIN(length, group_left);
+				if (vdev_draid_group_degraded(vd,
+				    sscan->ssa_vd, offset, chunksz, mirror)) {
+					action = "Fixing";
+					spa_scan_rebuild(pio, vd,
+					    offset, chunksz);
+				}
+
+				draid_dbg(1, "\t%s: "U64FMT"K + "U64FMT
+				    "K (%s)\n",
+				    action, offset >> 10, chunksz >> 10,
+				    mirror ? "mirrored" : "dRAID");
+
+				length -= chunksz;
+				offset += chunksz;
+			}
+
+			mutex_enter(&lock);
+		}
+		mutex_exit(&lock);
+
+		mutex_enter(&msp->ms_lock);
+
+		/* HH: wait for rebuild IOs to complete for this metaslab? */
+		msp->ms_rebuilding = B_FALSE;
+
+		mutex_exit(&msp->ms_lock);
+	}
+
+	range_tree_destroy(allocd_segs);
+	mutex_destroy(&lock);
+	kmem_free(sscan, sizeof (*sscan));
+
+	err = zio_wait(pio);
+	if (err != 0) /* HH: handle error */
+		err = SET_ERROR(err);
+	/* HH: we don't use scn_visited_this_txg anyway */
+	spa->spa_dsl_pool->dp_scan->scn_visited_this_txg = 19890604;
+
+	thread_exit();
+}
+
+/*
+ * Start a sequential rebuild of the top-level vdev that contains oldvd.
+ * The work is done by a new thread running spa_scan_thread(), which first
+ * waits for txg to sync.
+ */
+void
+spa_scan_start(spa_t *spa, vdev_t *oldvd, uint64_t txg)
+{
+	dsl_scan_t *scan = spa->spa_dsl_pool->dp_scan;
+	spa_scan_arg_t *sscan_arg;
+
+	scan->scn_vd = oldvd->vdev_top;
+	scan->scn_restart_txg = txg;
+	scan->scn_is_sequential = B_TRUE;
+
+	sscan_arg = kmem_alloc(sizeof (*sscan_arg), KM_SLEEP);
+	sscan_arg->ssa_vd = oldvd;
+	sscan_arg->ssa_dtl_max = txg;
+	(void) thread_create(NULL, 0, spa_scan_thread, sscan_arg, 0, NULL,
+	    TS_RUN, defclsyspri);
+}
+
+/*
+ * Reinitialize the scan state in scn_phys for a rebuild and mark the scrub
+ * as active and started.  scn_vd and scn_is_sequential must already be set,
+ * as spa_scan_start() does.
+ */
+void
+spa_scan_setup_sync(dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+	spa_t *spa = scn->scn_dp->dp_spa;
+
+	ASSERT(scn->scn_vd != NULL);
+	ASSERT(scn->scn_is_sequential);
+	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+
+	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+	scn->scn_phys.scn_func = POOL_SCAN_REBUILD;
+	scn->scn_phys.scn_state = DSS_SCANNING;
+	scn->scn_phys.scn_min_txg = 0;
+	scn->scn_phys.scn_max_txg = tx->tx_txg;
+	scn->scn_phys.scn_ddt_class_max = 0;
+	scn->scn_phys.scn_start_time = gethrestime_sec();
+	scn->scn_phys.scn_errors = 0;
+	/* Rebuild only examines blocks on one vdev */
+	scn->scn_phys.scn_to_examine = scn->scn_vd->vdev_stat.vs_alloc;
+	scn->scn_restart_txg = 0;
+	scn->scn_done_txg = 0;
+
+	scn->scn_sync_start_time = gethrtime();
+	scn->scn_pausing = B_FALSE;
+	spa->spa_scrub_active = B_TRUE;
+	spa_scan_stat_init(spa);
+
+	spa->spa_scrub_started = B_TRUE;
+}
+
+/*
+ * Placeholder with the signature of a scan callback.  It must never be
+ * called: it asserts, and otherwise fails with ENOTSUP.
+ */
+int
+spa_scan_rebuild_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	/* Rebuild happens in open context and does not use this callback */
+	ASSERT0(1);
+	return (SET_ERROR(ENOTSUP));
+}
+
+boolean_t
+spa_scan_enabled(const spa_t *spa) +{ + if (spa_scan_max_rebuild > 0) + return (B_TRUE); + else + return (B_FALSE); +} + + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(spa_scan_max_rebuild, int, 0644); +MODULE_PARM_DESC(spa_scan_max_rebuild, "Max concurrent SPA rebuild I/Os"); +#endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e741a699870a..4db9fb86817a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -32,6 +32,7 @@ #include <sys/dmu.h> #include <sys/dmu_tx.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -60,6 +61,8 @@ int metaslabs_per_vdev = 200; static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -136,6 +139,16 @@ vdev_get_min_asize(vdev_t *vd) if (pvd->vdev_ops == &vdev_raidz_ops) return (pvd->vdev_min_asize / pvd->vdev_children); + if (pvd->vdev_ops == &vdev_draid_ops) { + struct vdev_draid_configuration *cfg = pvd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT3U(pvd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(pvd->vdev_children, ==, cfg->dcf_children); + return (pvd->vdev_min_asize / + (pvd->vdev_children - cfg->dcf_spare)); + } + return (pvd->vdev_min_asize); } @@ -347,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vd->vdev_cfg = NULL; /* * Initialize rate limit structs for events. 
We rate limit ZIO delay @@ -391,6 +405,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; + nvlist_t *draidcfg = NULL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -445,7 +460,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; - if (ops == &vdev_raidz_ops) { + if (ops == &vdev_raidz_ops || ops == &vdev_draid_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -477,13 +492,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + if (ops == &vdev_draid_ops) { + if (nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_DRAIDCFG, &draidcfg) != 0) + return (SET_ERROR(EINVAL)); + if (!vdev_draid_config_validate(NULL, draidcfg)) + return (SET_ERROR(EINVAL)); + } + vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (ops == &vdev_draid_ops) + vd->vdev_cfg = fnvlist_dup(draidcfg); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, @@ -691,6 +717,9 @@ vdev_free(vdev_t *vd) if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_cfg) + fnvlist_free(vd->vdev_cfg); + txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1058,6 +1087,9 @@ vdev_probe(vdev_t *vd, zio_t *zio) ASSERT(vd->vdev_ops->vdev_op_leaf); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (NULL); + /* * Don't probe the probe. */ @@ -1378,6 +1410,7 @@ vdev_open(vdev_t *vd) * vdev open for business. 
*/ if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); @@ -2584,6 +2617,9 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -2754,6 +2790,15 @@ vdev_is_dead(vdev_t *vd) vd->vdev_ops == &vdev_missing_ops); } +boolean_t +vdev_is_dead_at(vdev_t *vd, uint64_t zio_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) + zio_offset -= VDEV_LABEL_START_SIZE; + + return (vdev_draid_is_dead(vd, zio_offset)); +} + boolean_t vdev_readable(vdev_t *vd) { @@ -3014,7 +3059,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } @@ -3077,19 +3123,22 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { + if (type == ZIO_TYPE_READ && !vdev_is_dead_at(vd, zio->io_offset)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead_at(vd, zio->io_offset)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && + /* HH: todo proper rebuild IO error handling... 
*/
+	if (type == ZIO_TYPE_WRITE && vd->vdev_ops != &vdev_draid_spare_ops &&
+	    txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
-	    (flags & ZIO_FLAG_SCAN_THREAD) ||
+	    ((flags & ZIO_FLAG_SCAN_THREAD) &&
+	    !spa->spa_dsl_pool->dp_scan->scn_is_sequential) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
new file mode 100644
index 000000000000..02c1c83d44de
--- /dev/null
+++ b/module/zfs/vdev_draid.c
@@ -0,0 +1,1563 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2016 Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid_impl.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+
+#ifdef _KERNEL
+#include <linux/kernel.h>
+#else
+#include <libintl.h>
+#endif
+
+#include "vdev_raidz.h"
+
+
+int draid_debug_lvl = 1;
+
+/*
+ * Debug aid: print the offset, size and data buffer of zio with
+ * draid_dbg(3, ...) and, unless the I/O is mirrored, one line per column of
+ * its raidz map.  Columns are tagged P/Q/R (parity), D (data) or S (skipped).
+ */
+void
+vdev_draid_debug_zio(zio_t *zio, boolean_t mirror)
+{
+	raidz_map_t *rm;
+	int c;
+
+	draid_dbg(3, "%s zio: off "U64FMT"K sz "U64FMT"K data %p\n",
+	    mirror ? "Mirror" : "dRAID", zio->io_offset >> 10,
+	    zio->io_size >> 10, zio->io_abd);
+
+	/* Nothing more is printed for mirrored I/O. */
+	if (mirror)
+		return;
+
+	rm = zio->io_vsd;
+	for (c = 0; c < rm->rm_scols; c++) {
+		char t = 'D';
+		raidz_col_t *rc = &rm->rm_col[c];
+		vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+		if (c >= rm->rm_cols) {
+			t = 'S';
+		} else if (c < rm->rm_firstdatacol) {
+			switch (c) {
+			case 0:
+				t = 'P';
+				break;
+			case 1:
+				t = 'Q';
+				break;
+			case 2:
+				t = 'R';
+				break;
+			default:
+				ASSERT0(c);
+			}
+		}
+
+		/* Use U64FMT like the other 64-bit values printed here. */
+		draid_dbg(3, "%c: dev "U64FMT" (%s) off "U64FMT"K, sz "
+		    U64FMT"K, err %d, skipped %d, tried %d\n", t,
+		    rc->rc_devidx,
+		    cvd->vdev_path != NULL ? cvd->vdev_path : "NA",
+		    rc->rc_offset >> 10, rc->rc_size >> 10,
+		    rc->rc_error, rc->rc_skipped, rc->rc_tried);
+	}
+}
+
+/* A child vdev is divided into slices */
+static unsigned int slice_shift = 0;
+#define	DRAID_SLICESHIFT (SPA_MAXBLOCKSHIFT + slice_shift)
+/* 2 ** slice_shift * SPA_MAXBLOCKSIZE */
+#define	DRAID_SLICESIZE  (1ULL << DRAID_SLICESHIFT)
+#define	DRAID_SLICEMASK  (DRAID_SLICESIZE - 1)
+
+/*
+ * Fill p[0 .. dcf_children - 1] with permutation number nr of the child
+ * indices.  nr selects, modulo dcf_bases * dcf_children, one of the base
+ * permutations in cfg->dcf_base_perms and a rotation that is added to each
+ * of its entries modulo dcf_children.  Always returns 0.
+ */
+static int
+vdev_draid_get_permutation(uint64_t *p, uint64_t nr,
+    const struct vdev_draid_configuration *cfg)
+{
+	uint64_t i;
+	uint64_t ncols = cfg->dcf_children;
+	uint64_t off = nr % (cfg->dcf_bases * ncols);
+	uint64_t base = off / ncols;
+	uint64_t dev = off % ncols;
+	/* The selected base permutation does not change across the loop. */
+	const uint64_t *base_perm = cfg->dcf_base_perms + (base * ncols);
+
+	for (i = 0; i < ncols; i++)
+		p[i] = (base_perm[i] + dev) % ncols;
+
+	return (0);
+}
+
+noinline static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio, uint64_t unit_shift,
+    const struct vdev_draid_configuration *cfg, uint64_t **array)
+{
+	const uint64_t ndata = cfg->dcf_data;
+	const uint64_t nparity = cfg->dcf_parity;
+	const uint64_t nspare = cfg->dcf_spare;
+	const uint64_t ncols = cfg->dcf_children;
+	/* The starting DRAID (parent) vdev sector of the block. */
+	const uint64_t b = zio->io_offset >> unit_shift;
+	/* The zio's size in units of the vdev's minimum sector size.
*/ + const uint64_t psize = zio->io_size >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + uint64_t o, q, r, c, bc, acols, scols, asize, tot; + uint64_t perm, perm_off, group, group_offset, group_left, abd_off; + raidz_map_t *rm; + uint64_t *permutation; + ASSERTV(vdev_t *vd = zio->io_vd); + + ASSERT(!vdev_draid_ms_mirrored(vd, + zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + ndata), ==, nspare); + ASSERT0(b % (nparity + ndata)); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + /* HH: may not actually need the nspare columns for normal IO */ + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / ((nparity + ndata) * slice); + group_offset = perm_off % ((nparity + ndata) * slice); + ASSERT0(group_offset % (nparity + ndata)); + + group_left = (slice - group_offset / (nparity + ndata)) * ndata; + ASSERT3U(psize, <=, group_left); + + /* The starting byte offset on each child vdev. */ + o = (perm * slice + group_offset / (nparity + ndata)) << unit_shift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = psize / ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = psize - q * ndata; + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = psize + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. 
*/ + acols = bc; + } else { + acols = nparity + ndata; + } + scols = nparity + ndata; + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + rm->rm_declustered = B_TRUE; + + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0, asize = 0; c < scols; c++) { + uint64_t i = group * (nparity + ndata) + c; + + ASSERT3U(i, <, ncols - nspare); + + rm->rm_col[c].rc_devidx = permutation[i]; + rm->rm_col[c].rc_offset = o; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (ndata + nparity) << unit_shift); + rm->rm_nskip = roundup(tot, ndata + nparity) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <, ndata); + + if (rm->rm_nskip == 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) == 0) + rm->rm_abd_skip = NULL; + else + rm->rm_abd_skip = + abd_alloc_linear(rm->rm_nskip << unit_shift, B_TRUE); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + + abd_off = 0; + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, abd_off); + abd_off += rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, abd_off); + abd_off += rm->rm_col[c].rc_size; + } + + if (array == NULL) + 
kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + rm->rm_ops = vdev_raidz_math_get_ops(); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + return (rm); +} + +noinline static mirror_map_t * +vdev_draid_mirror_map_alloc(zio_t *zio, uint64_t unit_shift, + const struct vdev_draid_configuration *cfg, uint64_t **array) +{ + const uint64_t nparity = cfg->dcf_parity; + const uint64_t copies = nparity + 1; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + vdev_t *vd = zio->io_vd; + uint64_t o, c, perm, perm_off, group, group_offset; + mirror_map_t *mm; + uint64_t *permutation; + ASSERTV(const uint64_t psize = zio->io_size >> unit_shift); + + ASSERT(vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + cfg->dcf_data), ==, nspare); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / (copies * slice); + ASSERT3U(group, <, (ncols - nspare) / copies); + group_offset = perm_off % (copies * slice); + ASSERT0(group_offset % copies); + ASSERT3U(psize, <=, slice - group_offset / copies); + /* The starting byte offset on each child vdev. 
*/ + o = (perm * slice + group_offset / copies) << unit_shift; + + mm = vdev_mirror_map_alloc(copies, B_FALSE, B_FALSE); + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0; c < mm->mm_children; c++) { + int idx = group * copies + c; + mirror_child_t *mc = &mm->mm_child[c]; + + /* The remainder group is not usable for IO */ + ASSERT3U(idx, <, ((ncols - nspare) / copies) * copies); + + mc->mc_vd = vd->vdev_child[permutation[idx]]; + mc->mc_offset = o; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} + +static inline void +vdev_draid_assert_vd(const vdev_t *vd) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT(cfg->dcf_zero_abd != NULL); + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(vd->vdev_children, ==, cfg->dcf_children); +} + +uint64_t +vdev_draid_get_groupsz(const vdev_t *vd, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + return (copies << DRAID_SLICESHIFT); +} + +#define DRAID_PERM_ASIZE(vd) (((vd)->vdev_children - \ + ((struct vdev_draid_configuration *)(vd)->vdev_tsd)->dcf_spare) \ + << DRAID_SLICESHIFT) + +uint64_t +vdev_draid_offset2group(const vdev_t *vd, uint64_t offset, boolean_t mirror) +{ + uint64_t perm, perm_off, group, copies, groups_per_perm; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + perm = offset / DRAID_PERM_ASIZE(vd); + perm_off = offset % DRAID_PERM_ASIZE(vd); + group = perm_off / vdev_draid_get_groupsz(vd, mirror); + + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + return (perm * groups_per_perm + group); +} + +uint64_t +vdev_draid_group2offset(const vdev_t *vd, uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm, offset; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + offset = DRAID_PERM_ASIZE(vd) * (group / groups_per_perm); + offset += + vdev_draid_get_groupsz(vd, mirror) * (group % groups_per_perm); + return (offset); +} + +boolean_t +vdev_draid_is_remainder_group(const vdev_t *vd, + uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + if ((vd->vdev_children - cfg->dcf_spare) % copies == 0) + return (B_FALSE); + + /* Currently only mirror can have remainder group */ + ASSERT(mirror); + + /* The last group in each permutation is the remainder */ + if (group % groups_per_perm == groups_per_perm - 1) + return (B_TRUE); + else + return (B_FALSE); +} + +uint64_t +vdev_draid_get_astart(const vdev_t *vd, const uint64_t start) +{ + uint64_t astart, perm_off, copies; + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_is_remainder_group(vd, group, mirror)) + return (start); + + perm_off = start % DRAID_PERM_ASIZE(vd); + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + astart = roundup(perm_off, copies << vd->vdev_ashift); + astart += start - perm_off; + + ASSERT3U(astart, >=, start); + return (astart); +} + +uint64_t +vdev_draid_check_block(const vdev_t *vd, uint64_t start, uint64_t size) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + uint64_t end = start + size - 1; + + ASSERT3U(size, <, vdev_draid_get_groupsz(vd, mirror)); + ASSERT3U(start >> vd->vdev_ms_shift, ==, end >> vd->vdev_ms_shift); + + /* + * A block is good if it: + * - does not cross group boundary, AND + * - does not use a remainder group + */ + if (group == vdev_draid_offset2group(vd, end, mirror) && + !vdev_draid_is_remainder_group(vd, group, mirror)) { + ASSERT3U(start, ==, vdev_draid_get_astart(vd, start)); + return (start); + } + + group++; + if (vdev_draid_is_remainder_group(vd, group, mirror)) + group++; + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + return (vdev_draid_group2offset(vd, group, mirror)); +} + +boolean_t +vdev_draid_ms_mirrored(const vdev_t *vd, uint64_t ms_id) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + /* HH: dedicate 1/20 ms for hybrid mirror */ + if ((ms_id % 20) == 19) + return (B_TRUE); + else + return (B_FALSE); +} + +static vdev_t *vdev_dspare_get_child(vdev_t *vd, uint64_t offset); + +/* + * dRAID spare does not fit into the DTL model. While it has child vdevs, + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Moreover, DTLs of a child vdev before the spare + * becomes active are invalid, because the spare blocks were not in use yet. + * + * Here we are essentially doing a vdev_dtl_reassess() on the fly, by replacing + * a dRAID spare with the child vdev under the offset. Note that it is a + * recursive process because the child vdev can be another dRAID spare, and so + * on. 
+ */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) +{ + int c; + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, offset, txg, size)) + return (B_FALSE); + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_readable(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_readable(vd->vdev_child[c], offset)) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +vdev_draid_is_dead(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_is_dead(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (!vdev_draid_is_dead(vd->vdev_child[c], offset)) + return (B_FALSE); + + return (B_TRUE); +} + +static boolean_t +vdev_draid_guid_exists(vdev_t *vd, uint64_t guid, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_guid == guid) + return (B_TRUE); + + if (vd->vdev_ops->vdev_op_leaf) + return (B_FALSE); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_guid_exists(vd->vdev_child[c], guid, offset)) + return (B_TRUE); + + return (B_FALSE); +} + +static boolean_t +vdev_draid_vd_degraded(vdev_t *vd, const vdev_t *oldvd, uint64_t offset) +{ + if (oldvd == NULL) /* Resilver */ + 
return (!vdev_dtl_empty(vd, DTL_PARTIAL)); + + /* Rebuild */ + ASSERT(oldvd->vdev_ops->vdev_op_leaf); + ASSERT(oldvd->vdev_ops != &vdev_draid_spare_ops); + + return (vdev_draid_guid_exists(vd, oldvd->vdev_guid, offset)); +} + +/* + * Return B_TRUE if any child in the redundancy group that contains offset + * is degraded, as judged by vdev_draid_vd_degraded(). A throw-away zio is + * used only to build the mirror/raidz map for that group; no IO is issued. + */ +boolean_t +vdev_draid_group_degraded(vdev_t *vd, vdev_t *oldvd, + uint64_t offset, uint64_t size, boolean_t mirror) +{ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t group = vdev_draid_offset2group(vd, offset, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + boolean_t degraded = B_FALSE; + zio_t *zio; + int c, dummy_data; + uint64_t *perm; + char buf[128]; + + vdev_draid_assert_vd(vd); + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + + /* + * Zero the dummy zio: vdev_draid_map_alloc() tests zio->io_flags, + * which must not be uninitialized memory. + */ + zio = kmem_zalloc(sizeof (*zio), KM_SLEEP); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_size = MAX(SPA_MINBLOCKSIZE, 1ULL << ashift); + zio->io_abd = abd_get_from_buf(&dummy_data, zio->io_size); + + buf[0] = '\0'; + if (mirror) { + mirror_map_t *mm = + vdev_draid_mirror_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(mm->mm_children, ==, cfg->dcf_parity + 1); + + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + char *status = ""; + + if (vdev_draid_vd_degraded(mc->mc_vd, + oldvd, mc->mc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", mc->mc_vd->vdev_id, status); + } + } else { + raidz_map_t *rm = vdev_draid_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(rm->rm_scols, ==, cfg->dcf_parity + cfg->dcf_data); + + for (c = 0; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + char *status = ""; + + if (vdev_draid_vd_degraded(cvd, oldvd, rc->rc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", cvd->vdev_id, status); + } + } + + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), "spares: "); + for (c = 0; c <
cfg->dcf_spare; c++) + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT" ", perm[cfg->dcf_children - 1 - c]); + draid_dbg(4, "%s %s at "U64FMT"K of "U64FMT"K: %s\n", + degraded ? "Degraded" : "Healthy", + mirror ? "mirror" : "draid", + offset >> 10, size >> 10, buf); + + kmem_free(perm, sizeof (perm[0]) * cfg->dcf_children); + (*zio->io_vsd_ops->vsd_free)(zio); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (*zio)); + return (degraded); +} + +boolean_t +vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) +{ + int i; + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b; + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); +#endif + return (B_FALSE); + } + + if (n - 1 > VDEV_DRAID_U8_MAX) { +#ifndef _KERNEL + fprintf(stderr, "%s configuration too invalid: %lu\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); +#endif + return (B_FALSE); + } + if (vd != NULL && n != vd->vdev_children) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PARITY); +#endif + return (B_FALSE); + } + + if (vd != NULL && p != vd->vdev_nparity) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_DATA); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_SPARE); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_BASE); +#endif + return (B_FALSE); + } + + if (n == 0 ||
d == 0 || p == 0 || s == 0 || b == 0) { +#ifndef _KERNEL + fprintf(stderr, "Zero n/d/p/s/b\n"); +#endif + return (B_FALSE); + } + + if (p > VDEV_RAIDZ_MAXPARITY) { +#ifndef _KERNEL + fprintf(stderr, gettext("Invalid parity %lu\n"), p); +#endif + return (B_FALSE); + } + + if ((n - s) % (p + d) != 0) { +#ifndef _KERNEL + fprintf(stderr, "%lu mod %lu is not 0\n", n - s, p + d); +#endif + return (B_FALSE); + } + + if (nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { +#ifndef _KERNEL + fprintf(stderr, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PERM); +#endif + return (B_FALSE); + } + + if (c != b * n) { +#ifndef _KERNEL + fprintf(stderr, + "Permutation array has %u items, but %lu expected\n", + c, b * n); +#endif + return (B_FALSE); + } + + for (i = 0; i < b; i++) { + int j, k; + for (j = 0; j < n; j++) { + uint64_t val = perm[i * n + j]; + + if (val >= n) { +#ifndef _KERNEL + fprintf(stderr, + "Invalid value %lu in permutation %d\n", + val, i); +#endif + return (B_FALSE); + } + + for (k = 0; k < j; k++) { + if (val == perm[i * n + k]) { +#ifndef _KERNEL + fprintf(stderr, + "Duplicated value %lu in " + "permutation %d\n", + val, i); +#endif + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) +{ + char *type; + uint64_t parity; + nvlist_t **children = NULL; + uint_t c = 0; + + if (draidcfg == NULL) + return (B_FALSE); + + type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_DRAID) != 0) + return (B_FALSE); + + parity = fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); + if (parity != fnvlist_lookup_uint64(draidcfg, + ZPOOL_CONFIG_DRAIDCFG_PARITY)) + return (B_FALSE); + + VERIFY0(nvlist_lookup_nvlist_array(top, + ZPOOL_CONFIG_CHILDREN, &children, &c)); + if (c != + fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) + return (B_FALSE); + + /* HH: todo: check permutation array csum */ + 
fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); + return (B_TRUE); +} + +static struct vdev_draid_configuration * +vdev_draid_config_create(vdev_t *vd) +{ +/* + * HH: should probably allocate draid_zero_page page aligned, when need to deal + * with ashift larger than a page + */ +#ifdef _KERNEL +#define draid_zero_page empty_zero_page +#else +static char draid_zero_page[PAGE_SIZE]; +#endif + + int i, j; + uint_t c; + uint64_t children; + uint8_t *perms = NULL; + uint64_t *base_perms; + nvlist_t *nvl = vd->vdev_cfg; + struct vdev_draid_configuration *cfg; + + ASSERT(nvl != NULL); + + if (!vdev_draid_config_validate(vd, nvl)) + return (NULL); + + cfg = kmem_alloc(sizeof (*cfg), KM_SLEEP); + cfg->dcf_children = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + cfg->dcf_data = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA); + cfg->dcf_parity = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_PARITY); + cfg->dcf_spare = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_SPARE); + cfg->dcf_bases = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE); + + VERIFY0(nvlist_lookup_uint8_array(nvl, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perms, &c)); + + base_perms = kmem_alloc(sizeof (uint64_t) * c, KM_SLEEP); + for (i = 0, children = cfg->dcf_children; i < cfg->dcf_bases; i++) + for (j = 0; j < children; j++) + base_perms[i * children + j] = perms[i * children + j]; + cfg->dcf_base_perms = base_perms; + + ASSERT3U(1ULL << vd->vdev_top->vdev_ashift, <=, PAGE_SIZE); + cfg->dcf_zero_abd = abd_get_from_buf(draid_zero_page, PAGE_SIZE); + return (cfg); +} + +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) +{ + vdev_t *cvd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = 
VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* vd->vdev_tsd must be set before vdev_open_children(vd) */ + if (cfg == NULL) { + cfg = vdev_draid_config_create(vd); + if (cfg == NULL) + return (SET_ERROR(EINVAL)); + vd->vdev_tsd = cfg; + } else { + ASSERT(vd->vdev_reopening); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + /* HH: asize becomes tricky with hybrid mirror */ + *asize *= vd->vdev_children - cfg->dcf_spare; + *max_asize *= vd->vdev_children - cfg->dcf_spare; + /* HH: because of the draid_zero_page array */ + ASSERT3U(*ashift, <=, PAGE_SHIFT); + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_draid_close(vdev_t *vd) +{ + int c; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); + + if (vd->vdev_reopening || cfg == NULL) + return; + + abd_put(cfg->dcf_zero_abd); + kmem_free((void *)cfg->dcf_base_perms, + sizeof (uint64_t) * cfg->dcf_bases * cfg->dcf_children); + kmem_free(cfg, sizeof (*cfg)); + vd->vdev_tsd = NULL; +} + +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t nparity = vd->vdev_nparity; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + asize = ((psize - 1) >> ashift) + 1; + + if (asize == 1) { /* mirror */ + asize += nparity; + } else { /* draid */ + asize = roundup(asize, cfg->dcf_data); + asize += nparity * (asize / cfg->dcf_data); + ASSERT0(asize % (nparity + cfg->dcf_data)); + } + + return (asize << 
ashift); +} + +boolean_t +vdev_draid_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift); + + /* A block cannot cross redundancy group boundary */ + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, vdev_draid_asize(vd, psize))); + + return (vdev_draid_group_degraded(vd, NULL, offset, psize, mirror)); +} + +/* + * Start an IO operation on a dRAID vdev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, write zeros to those + * sectors; unlike raidz, these writes are mandatory. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well.
+ */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) { + (void) vdev_draid_mirror_map_alloc(zio, ashift, cfg, NULL); + + ASSERT(zio->io_vsd != NULL); + ASSERT(zio->io_size <= (1ULL << ashift) || + ((zio->io_flags & ZIO_FLAG_RESILVER) && + zio->io_spa->spa_dsl_pool->dp_scan->scn_is_sequential)); + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } + + rm = vdev_draid_map_alloc(zio, ashift, cfg, NULL); + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Unlike raidz, it's mandatory to fill skip sectors with zero. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + 1ULL << ashift, zio->io_type, zio->io_priority, + 0, NULL, NULL)); /* HH: handle skip write error */ + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * HH: sequential resilver must do IO at redundancy group boundary, i.e. 
+ * rm->rm_nskip must be 0 + */ + ASSERT((zio->io_flags & ZIO_FLAG_RESILVER) == 0 || + !zio->io_spa->spa_dsl_pool->dp_scan->scn_is_sequential || + rm->rm_nskip == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + /* + * Check skip sectors for scrub/resilver. For sequential rebuild, + * this is a no-op because rm->rm_nskip is always zero. 
+ */ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + abd_t *abd; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, + rc->rc_offset + rc->rc_size)) { + rc->rc_abd_skip = NULL; + continue; + } + + abd = abd_get_offset(rm->rm_abd_skip, i << ashift); + *((int *)abd_to_buf(abd)) = 1; + rc->rc_abd_skip = abd; + + /* Skip sector to be written in vdev_draid_io_done() */ + if (vdev_draid_missing(cvd, + rc->rc_offset + rc->rc_size, zio->io_txg, 1)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, abd, + 1ULL << ashift, ZIO_TYPE_READ, + zio->io_priority, 0, NULL, NULL)); + } + } + + zio_execute(zio); +} + +int +vdev_draid_hide_skip_sectors(raidz_map_t *rm) +{ + int c, cols; + size_t size = rm->rm_col[0].rc_size; + + ASSERT(rm->rm_declustered); + + for (c = rm->rm_cols; c < rm->rm_scols; c++) { + void *buf; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_size); + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd == NULL); + + rc->rc_size = size; + rc->rc_abd = abd_alloc_linear(size, B_TRUE); + buf = abd_to_buf(rc->rc_abd); + bzero(buf, size); + } + + cols = rm->rm_cols; + rm->rm_cols = rm->rm_scols; + return (cols); +} + +void +vdev_draid_restore_skip_sectors(raidz_map_t *rm, int cols) +{ + int c; + + ASSERT(rm->rm_declustered); + ASSERT3U(cols, >, rm->rm_firstdatacol); + ASSERT3U(cols, <=, rm->rm_scols); + + for (c = cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd != NULL); + + abd_free(rc->rc_abd); + rc->rc_size = 0; + rc->rc_abd = NULL; + } + + rm->rm_cols = cols; +} + +void +vdev_draid_fix_skip_sectors(zio_t *zio) +{ + int c, i; + char *zero; + vdev_t *vd = 
zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t size = 1ULL << vd->vdev_top->vdev_ashift; + + ASSERT(rm->rm_declustered); + vdev_draid_assert_vd(vd); + + if (rm->rm_abd_skip == NULL) + return; + + zero = abd_to_buf(cfg->dcf_zero_abd); + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + char *skip; + boolean_t good_skip; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + if (rc->rc_abd_skip == NULL) + continue; + + skip = abd_to_buf(rc->rc_abd_skip); + good_skip = (memcmp(skip, zero, size) == 0); + abd_put(rc->rc_abd_skip); + rc->rc_abd_skip = NULL; + + if (good_skip || !spa_writeable(zio->io_spa)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } +} + +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) + vdev_mirror_ops.vdev_op_io_done(zio); /* hybrid mirror */ + else + vdev_raidz_ops.vdev_op_io_done(zio); /* declustered raidz */ +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_draid_ops = { + vdev_draid_open, + vdev_draid_close, + vdev_draid_asize, + vdev_draid_io_start, + vdev_draid_io_done, + vdev_draid_state_change, + NULL, + NULL, + VDEV_TYPE_DRAID, /* name of this vdev type */ + B_FALSE /* not a leaf vdev */ +}; + +#include <sys/spa_impl.h> + +typedef struct { + vdev_t *dsp_draid; + uint64_t dsp_id; +} 
vdev_dspare_t; + +static vdev_t * +vdev_dspare_get_child(vdev_t *vd, uint64_t offset) +{ + vdev_t *draid; + uint64_t *permutation, spareidx; + vdev_dspare_t *dspare = vd->vdev_tsd; + struct vdev_draid_configuration *cfg; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT3U(offset, <, + vd->vdev_psize - VDEV_LABEL_START_SIZE - VDEV_LABEL_END_SIZE); + ASSERT(dspare != NULL); + draid = dspare->dsp_draid; + vdev_draid_assert_vd(draid); + cfg = draid->vdev_tsd; + ASSERT3U(dspare->dsp_id, <, cfg->dcf_spare); + + permutation = kmem_alloc(sizeof (permutation[0]) * draid->vdev_children, + KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, + offset >> DRAID_SLICESHIFT, cfg)); + spareidx = permutation[draid->vdev_children - 1 - dspare->dsp_id]; + ASSERT3U(spareidx, <, draid->vdev_children); + kmem_free(permutation, sizeof (permutation[0]) * draid->vdev_children); + + return (draid->vdev_child[spareidx]); +} + +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT(dspare != NULL); + ASSERT(dspare->dsp_draid != NULL); + + return (dspare->dsp_draid); +} + +nvlist_t * +vdev_draid_spare_read_config(vdev_t *vd) +{ + int i; + uint64_t guid; + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + nvlist_t *nv = fnvlist_alloc(); + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + + if (vd->vdev_isspare) + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_ACTIVE); + else + 
fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE); + + for (i = 0, guid = vd->vdev_guid; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + /* HH: ZPOOL_CONFIG_UNSPARE and ZPOOL_CONFIG_RESILVER_TXG? */ + return (nv); +} + +/* + * Open a dRAID distributed spare. On first open the parity, the index of + * the parent dRAID vdev under the root vdev, and the spare index are parsed + * from vdev_path and validated; on reopen the cached vdev_tsd is reused. + * The reported sizes are the parent's asize divided by its non-spare child + * count, plus the label areas. Returns 0 or EINVAL. + */ +static int +vdev_dspare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + /* + * sscanf() needs objects of exactly the converted type; the + * original casts imply VDEV_DRAID_SPARE_PATH_FMT converts to + * unsigned long (NOTE(review): confirm against its definition). + */ + unsigned long draid_id, nparity, spare_id; + uint64_t asize, max_asize; + vdev_t *draid; + vdev_dspare_t *dspare; + struct vdev_draid_configuration *cfg; + + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dspare = vd->vdev_tsd; + draid = dspare->dsp_draid; + cfg = draid->vdev_tsd; + goto skip_open; + } + + if (sscanf(vd->vdev_path, VDEV_DRAID_SPARE_PATH_FMT, + &nparity, &draid_id, &spare_id) != 3) + return (SET_ERROR(EINVAL)); + + if (draid_id >= vd->vdev_spa->spa_root_vdev->vdev_children) + return (SET_ERROR(EINVAL)); + + draid = vd->vdev_spa->spa_root_vdev->vdev_child[draid_id]; + if (draid->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(EINVAL)); + if (draid->vdev_nparity != nparity) + return (SET_ERROR(EINVAL)); + + cfg = draid->vdev_tsd; + ASSERT(cfg != NULL); + if (spare_id >= cfg->dcf_spare) + return (SET_ERROR(EINVAL)); + + dspare = kmem_alloc(sizeof (*dspare), KM_SLEEP); + dspare->dsp_draid = draid; + dspare->dsp_id = spare_id; + vd->vdev_tsd = dspare; + +skip_open: + vdev_draid_assert_vd(draid); + + asize = draid->vdev_asize / (draid->vdev_children - cfg->dcf_spare); + max_asize = draid->vdev_max_asize / + (draid->vdev_children - cfg->dcf_spare); + + *ashift = draid->vdev_ashift; + *psize = asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + *max_psize = max_asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + return (0); +} + +static void
+vdev_dspare_close(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + if (vd->vdev_reopening || dspare == NULL) + return; /* keep tsd on reopen; else nothing to free */ + + vd->vdev_tsd = NULL; + kmem_free(dspare, sizeof (*dspare)); +} + +static uint64_t +vdev_dspare_asize(vdev_t *vd, uint64_t psize) +{ + /* HH: this function should never get called */ + ASSERT0(psize); + return (0); +} + +static void +vdev_dspare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + pio->io_error = zio->io_error; /* propagate child's error */ +} + +static void +vdev_dspare_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + uint64_t offset = zio->io_offset; + + /* HH: if dspare gets a FLUSH, so do all children of the draid vdev */ + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + /* + * HH: at pool creation, dspare gets some writes with + * ZIO_FLAG_SPECULATIVE and ZIO_FLAG_NODATA. + * Need to understand and handle them right. + */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) { + ASSERT(zio->io_flags & ZIO_FLAG_PHYSICAL); + + /* + * HH: dspare should not get any label IO as it is pretending + * to be a leaf disk. Later we should catch and fix all places + * that still do label IO to dspare. + */ + zio->io_error = SET_ERROR(ENODATA); + zio_interrupt(zio); + return; + } + + offset -= VDEV_LABEL_START_SIZE; /* See zio_vdev_child_io() */ + cvd = vdev_dspare_get_child(vd, offset); + if (zio->io_type == ZIO_TYPE_READ && !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + /* + * Parent vdev should have avoided reading from me in the first + * place, unless this is a mirror scrub. + */ + draid_dbg(1, "Read from dead spare %s:%s:%s at "U64FMT"\n", + vd->vdev_path, + cvd->vdev_ops->vdev_op_type, + cvd->vdev_path != NULL ?
cvd->vdev_path : "NA", + offset); + return; + } + + /* dspare IO does not cross slice boundary */ + ASSERT3U(offset >> DRAID_SLICESHIFT, ==, + (offset + zio->io_size - 1) >> DRAID_SLICESHIFT); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_dspare_child_done, zio)); + zio_execute(zio); +} + +static void +vdev_dspare_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_draid_spare_ops = { + vdev_dspare_open, + vdev_dspare_close, + vdev_dspare_asize, + vdev_dspare_io_start, + vdev_dspare_io_done, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID_SPARE, + B_TRUE +}; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(draid_debug_lvl, int, 0644); +MODULE_PARM_DESC(draid_debug_lvl, "dRAID debugging verbose level"); +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 54c54237bfd5..9e65205bb29d 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/zio.h> @@ -384,8 +385,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + ASSERT(vd->vdev_ops == &vdev_raidz_ops || + vd->vdev_ops == &vdev_draid_ops); /* * Make sure someone hasn't managed to sneak a fancy new vdev @@ -405,6 +406,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } + if (vd->vdev_cfg != NULL) { + ASSERT(vd->vdev_ops == &vdev_draid_ops); + ASSERT(vdev_draid_config_validate(vd, vd->vdev_cfg)); + + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_DRAIDCFG, vd->vdev_cfg); + } + if (vd->vdev_wholedisk != -1ULL) fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 
vd->vdev_wholedisk); @@ -601,6 +609,9 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_read_config(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -869,6 +880,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + if (vd->vdev_ops == &vdev_draid_spare_ops) { + error = 0; + goto skip; + } + /* * Initialize its label. */ @@ -990,6 +1006,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) abd_free(ub_abd); abd_free(vp_abd); +skip: /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it @@ -1079,7 +1096,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1160,6 +1178,13 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) if (!vd->vdev_ops->vdev_op_leaf) return; + /* + * HH: no need to sync ub on dspare - if dspare gets a ub sync, so + * does the parent draid vdev + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; @@ -1264,6 +1289,9 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) if (!vd->vdev_ops->vdev_op_leaf) return; + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 256431e6b334..7aeaab885209 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ 
-30,32 +30,11 @@ #include <sys/zfs_context.h> #include <sys/spa.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid_impl.h> #include <sys/zio.h> #include <sys/abd.h> #include <sys/fs/zfs.h> -/* - * Virtual device vector for mirroring. - */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_replacing; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; static int vdev_mirror_shift = 21; @@ -85,7 +64,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) { mirror_map_t *mm; @@ -108,7 +87,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +const zio_vsd_ops_t vdev_mirror_vsd_ops = { vdev_mirror_map_free, zio_vsd_default_cksum_report }; @@ -331,6 +310,12 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + return (vdev_draid_readable(mc->mc_vd, mc->mc_offset)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read * prefering vdevs based on determined load. 
@@ -356,14 +341,15 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_draid_missing(mc->mc_vd, mc->mc_offset, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; @@ -420,7 +406,12 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); + if (zio->io_vsd != NULL) { /* dRAID hybrid mirror */ + ASSERT3P(zio->io_vd->vdev_ops, ==, &vdev_draid_ops); + mm = zio->io_vsd; + } else { + mm = vdev_mirror_map_init(zio); + } if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 82e56305487f..de85a3c54e8c 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> +#include <sys/vdev_draid_impl.h> /* * Virtual device vector for RAID-Z. 
@@ -150,6 +151,11 @@ vdev_raidz_map_free(raidz_map_t *rm) size += rm->rm_col[c].rc_size; } + if (rm->rm_abd_skip != NULL) { + ASSERT(rm->rm_declustered); + abd_free(rm->rm_abd_skip); + } + if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); @@ -323,7 +329,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) ASSERT3U(offset, ==, size); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { +const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_map_free_vsd, vdev_raidz_cksum_report }; @@ -398,6 +404,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; + rm->rm_abd_skip = NULL; + rm->rm_declustered = B_FALSE; asize = 0; @@ -615,6 +623,22 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } } } + + if (!rm->rm_declustered) + return; + + /* IO doesn't span all child vdevs. */ + for (; c < rm->rm_scols; c++) { + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + /* + * Treat skip sectors as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (i = 0; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + } + } } static void @@ -666,6 +690,24 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) } } } + + if (!rm->rm_declustered) + return; + + /* IO doesn't span all child vdevs. */ + for (; c < rm->rm_scols; c++) { + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + + /* + * Treat skip sectors as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (i = 0; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); + } + } } /* @@ -1491,8 +1533,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c, ret; - int code; + int i, c, code; + int cols = 0; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1527,25 +1569,32 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); + if (rm->rm_declustered) + cols = vdev_draid_hide_skip_sectors(rm); + dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); - if (ret != RAIDZ_ORIGINAL_IMPL) - return (ret); + code = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (code != RAIDZ_ORIGINAL_IMPL) + goto out; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_P]) { + code = vdev_raidz_reconstruct_p(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_q(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); break; @@ -1554,8 +1603,10 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_pq(rm, dt, 2); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); @@ -1565,6 +1616,9 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code 
> 0); +out: + if (rm->rm_declustered) + vdev_draid_restore_skip_sectors(rm, cols); return (code); } @@ -1637,7 +1691,7 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1647,6 +1701,38 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t unit_shift = vd->vdev_top->vdev_ashift; + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t b = offset >> unit_shift; + uint64_t s = ((psize - 1) >> unit_shift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + uint64_t c, devidx; + + if (s + nparity >= dcols) /* spans all child vdevs */ + return (B_TRUE); + + for (c = 0; c < s + nparity; c++) { + vdev_t *cvd; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. + */ + devidx = (f + c) % dcols; + cvd = vd->vdev_child[devidx]; + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1843,6 +1929,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) zio_buf_free(orig[c], rc->rc_size); } + if (ret != 0 && rm->rm_declustered) + vdev_draid_debug_zio(zio, B_FALSE); return (ret); } @@ -2295,6 +2383,9 @@ vdev_raidz_io_done(zio_t *zio) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + if (rm->rm_declustered) + vdev_draid_fix_skip_sectors(zio); } static void diff --git a/module/zfs/vdev_raidz.h b/module/zfs/vdev_raidz.h new file mode 100644 index 000000000000..2c704d1ffb6b --- /dev/null +++ b/module/zfs/vdev_raidz.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include <sys/zio.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> + +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + +extern void vdev_raidz_generate_parity(raidz_map_t *rm); +extern void vdev_raidz_child_done(zio_t *zio); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 8a3b3066abbe..eb37fd502682 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1047,10 +1047,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; + ASSERTV(vdev_t *piovd = pio->io_vd); zio_t *zio; - ASSERT(vd->vdev_parent == - (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); + ASSERT((piovd != NULL && piovd->vdev_ops == &vdev_draid_spare_ops) || + vd->vdev_parent == (piovd ? piovd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* @@ -3235,10 +3236,20 @@ zio_vdev_io_start(zio_t *zio) * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. 
For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdevs + * of such blocks can have empty DTL_PARTIAL. + * + * There seems to be no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); @@ -3246,6 +3257,7 @@ zio_vdev_io_start(zio_t *zio) } if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) @@ -3281,8 +3293,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE)