From 653f520a9763831e28ddef13059af2ae66dfec3b Mon Sep 17 00:00:00 2001 From: Don Brady Date: Tue, 11 Sep 2018 16:00:47 -0600 Subject: [PATCH 1/3] dRAID implementation PR-7078 rebase Signed-off-by: Don Brady --- cmd/Makefile.am | 2 +- cmd/draidcfg/.gitignore | 1 + cmd/draidcfg/Makefile.am | 20 + cmd/draidcfg/draid_permutation.c | 763 ++++++++ cmd/draidcfg/draid_permutation.h | 41 + cmd/draidcfg/draidcfg.c | 343 ++++ cmd/zdb/zdb.c | 5 +- cmd/zpool/Makefile.am | 1 + cmd/zpool/zpool_main.c | 20 +- cmd/zpool/zpool_vdev.c | 90 +- configure.ac | 1 + include/libzfs.h | 6 +- include/sys/Makefile.am | 2 + include/sys/arc.h | 1 + include/sys/dmu.h | 1 + include/sys/fs/zfs.h | 14 + include/sys/metaslab_impl.h | 1 + include/sys/nvpair.h | 1 + include/sys/spa.h | 6 - include/sys/spa_impl.h | 7 +- include/sys/sysevent/eventdefs.h | 2 + include/sys/vdev.h | 1 + include/sys/vdev_draid_impl.h | 113 ++ include/sys/vdev_impl.h | 30 + include/sys/vdev_raidz_impl.h | 7 + include/sys/vdev_scan.h | 78 + include/zfeature_common.h | 1 + lib/libzfs/Makefile.am | 1 + lib/libzfs/libzfs_import.c | 72 +- lib/libzfs/libzfs_pool.c | 19 +- lib/libzpool/Makefile.am | 2 + man/man5/zpool-features.5 | 17 + module/nvpair/fnvpair.c | 19 +- module/zcommon/zfeature_common.c | 4 + module/zcommon/zfs_namecheck.c | 4 +- module/zfs/Makefile.in | 2 + module/zfs/arc.c | 6 + module/zfs/dsl_scan.c | 125 +- module/zfs/metaslab.c | 90 +- module/zfs/spa.c | 155 +- module/zfs/vdev.c | 70 +- module/zfs/vdev_draid.c | 1688 +++++++++++++++++ module/zfs/vdev_label.c | 34 +- module/zfs/vdev_mirror.c | 62 +- module/zfs/vdev_raidz.c | 90 +- module/zfs/vdev_raidz.h | 33 + module/zfs/vdev_removal.c | 47 +- module/zfs/vdev_scan.c | 583 ++++++ module/zfs/zio.c | 36 +- .../cli_root/zpool_get/zpool_get.cfg | 1 + 50 files changed, 4534 insertions(+), 184 deletions(-) create mode 100644 cmd/draidcfg/.gitignore create mode 100644 cmd/draidcfg/Makefile.am create mode 100644 cmd/draidcfg/draid_permutation.c create mode 100644 
cmd/draidcfg/draid_permutation.h create mode 100644 cmd/draidcfg/draidcfg.c create mode 100644 include/sys/vdev_draid_impl.h create mode 100644 include/sys/vdev_scan.h create mode 100644 module/zfs/vdev_draid.c create mode 100644 module/zfs/vdev_raidz.h create mode 100644 module/zfs/vdev_scan.c diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 9dd7b8b4f07d..0d73d0ba54ff 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -SUBDIRS += arc_summary raidz_test zgenhostid +SUBDIRS += arc_summary raidz_test zgenhostid draidcfg diff --git a/cmd/draidcfg/.gitignore b/cmd/draidcfg/.gitignore new file mode 100644 index 000000000000..ad7c307b04e3 --- /dev/null +++ b/cmd/draidcfg/.gitignore @@ -0,0 +1 @@ +/draidcfg diff --git a/cmd/draidcfg/Makefile.am b/cmd/draidcfg/Makefile.am new file mode 100644 index 000000000000..f587d271860e --- /dev/null +++ b/cmd/draidcfg/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +AM_CPPFLAGS += -DDEBUG + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +bin_PROGRAMS = draidcfg + +draidcfg_SOURCES = \ + draidcfg.c \ + draid_permutation.c \ + draid_permutation.h + +draidcfg_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ + $(top_builddir)/lib/libzfs/libzfs.la +draidcfg_LDADD += -lm diff --git a/cmd/draidcfg/draid_permutation.c b/cmd/draidcfg/draid_permutation.c new file mode 100644 index 000000000000..4753f3f31f66 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.c @@ -0,0 +1,763 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +#define MAX_GROUPSIZE 32 +#define MAX_GROUPS 128 +#define MAX_SPARES 100 +#define MAX_DEVS (MAX_GROUPSIZE * MAX_GROUPS + MAX_SPARES) +#define MAX_ROWS 16384 + +#define UNOPT 0 +#define EVAL_WORST 1 +#define EVAL_MEAN 2 +#define EVAL_RMS 3 + +static int verbose = 0; + +typedef struct +{ + int groupsz; + int ngroups; + int nspares; + int ndevs; + int nrows; + /* each row maps all drives, groups from 0, spares down from ndevs-1 */ + int **rows; + int nbroken; /* # broken drives */ + int *broken; /* which drives are broken */ +} map_t; + +typedef struct +{ + int value; + int order; +} pair_t; + +static void +permute_devs(int *in, int *out, int ndevs) +{ + pair_t tmp[MAX_DEVS]; + int i; + int j; + + if (ndevs == 2) { /* swap */ + i = in[0]; + j = in[1]; + out[0] = j; + out[1] = i; + return; + } + + for (i = 0; i < ndevs; i++) { /* assign random order */ + tmp[i].value = in[i]; + tmp[i].order = mrand48(); + } + + for (i = 1; i < ndevs; i++) /* sort */ + for (j = 0; j < i; j++) + if (tmp[i].order < tmp[j].order) { + pair_t t = tmp[i]; + tmp[i] = tmp[j]; + tmp[j] = t; + } + + for (i = 0; i < ndevs; i++) + out[i] = tmp[i].value; +} + +static void +print_map(map_t *map) +{ + int i; + int j; + + for (i = 0; i < 
map->nrows; i++) { + for (j = 0; j < map->ndevs; j++) { + if (j == map->ndevs - map->nspares) + printf("S "); + + printf("%2d ", map->rows[i][j]); + } + printf("\n"); + } +} + +static void +check_map(map_t *map) +{ + int i; + int j; + int nrows = map->nrows; + int ndevs = map->ndevs; + int **rows = map->rows; + int devcounts[MAX_DEVS]; + int brokencounts[MAX_DEVS]; + + ASSERT(map->groupsz <= MAX_GROUPSIZE); + ASSERT(map->ngroups <= MAX_GROUPS); + ASSERT(map->nspares <= MAX_SPARES); + ASSERT(map->ndevs == map->nspares + map->ngroups * map->groupsz); + ASSERT(map->nrows <= MAX_ROWS); + ASSERT(map->nbroken <= MAX_SPARES); + + /* Ensure each dev appears once in every row */ + memset(devcounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < nrows; i++) { + int *row = rows[i]; + + for (j = 0; j < ndevs; j++) { + int dev = row[j]; + + ASSERT(0 <= dev && dev < ndevs); + ASSERT(devcounts[dev] == i); + devcounts[dev] = i+1; + } + } + + /* Ensure broken drives only appear once (counts are indexed by device id) */ + memset(brokencounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < map->nbroken; i++) { + int dev = map->broken[i]; + + ASSERT(0 <= dev && dev < map->ndevs); /* valid drive */ + ASSERT(brokencounts[dev] == 0); /* not used already */ + brokencounts[dev] = 1; + } +} + +static map_t * +dup_map(map_t *oldmap) +{ + int groupsz = oldmap->groupsz; + int ngroups = oldmap->ngroups; + int nspares = oldmap->nspares; + int ndevs = oldmap->ndevs; + int nrows = oldmap->nrows; + map_t *map = malloc(sizeof (map_t)); + int i; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + memcpy(map->rows[i], oldmap->rows[i], sizeof (int) * ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0;
+ + check_map(map); + return (map); +} + +static map_t * +new_map(int groupsz, int ngroups, int nspares, int nrows) +{ + map_t *map = malloc(sizeof (map_t)); + int ndevs = nspares + ngroups * groupsz; + int i; + int j; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + + if (i == 0) + for (j = 0; j < ndevs; j++) + map->rows[i][j] = j; + else + permute_devs(map->rows[i-1], map->rows[i], ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static void +free_map(map_t *map) +{ + int i; + + free(map->broken); + for (i = 0; i < map->nrows; i++) + free(map->rows[i]); + free(map->rows); + free(map); +} + +static inline int +is_broken(map_t *map, int dev) +{ + int i; + + for (i = 0; i < map->nbroken; i++) + if (dev == map->broken[i]) + return (1); + + return (0); +} + +static int +eval_resilver(map_t *map, int print) +{ + /* Evaluate how resilvering I/O will be distributed */ + int i; + int j; + int k; + int spare; + int dev; + int ndevs = map->ndevs; + int nspares = map->nspares; + int ngroups = map->ngroups; + int groupsz = map->groupsz; + int nrows = map->nrows; + int writes[MAX_DEVS]; + int reads[MAX_DEVS]; + int max_reads = 0; + int max_writes = 0; + int max_ios = 0; + + memset(reads, 0, sizeof (int) * ndevs); + memset(writes, 0, sizeof (int) * ndevs); + + /* resilver all rows */ + for (i = 0; i < nrows; i++) { + int *row = map->rows[i]; + + /* resilver all groups with broken drives */ + for (j = 0; j < ngroups; j++) { + int fix = 0; + + /* See if any disk in this group is broken */ + for (k = 0; k < groupsz && !fix; k++) + fix = is_broken(map, row[j*groupsz + k]); + + if (!fix) + continue; + + /* 
+ * This group needs fixing + * Read all the non-broken drives and write all the + * broken drives to their hot spare for this row + */ + spare = ndevs - nspares; + for (k = 0; k < groupsz; k++) { + dev = row[j*groupsz + k]; + + if (!is_broken(map, dev)) { + reads[dev]++; + } else { + ASSERT(spare < ndevs); + + while (is_broken(map, row[spare])) { + spare++; + ASSERT(spare < ndevs); + } + writes[row[spare++]]++; + } + } + } + } + + /* find drives with most I/O */ + for (i = 0; i < ndevs; i++) { + if (reads[i] > max_reads) + max_reads = reads[i]; + if (writes[i] > max_writes) + max_writes = writes[i]; + + if (reads[i] + writes[i] > max_ios) + max_ios = reads[i] + writes[i]; + } + + if (print) { + printf("Reads: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)reads[i]*ngroups)/nrows); + printf("\n"); + printf("Writes: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)writes[i]*ngroups)/nrows); + printf("\n"); + } + + return (max_ios); +} + +static double +eval_decluster(map_t *map, int how, int faults, int print) +{ + int f1; + int f2; + int ios; + int worst1 = -1; + int worst2 = -1; + int n = 0; + long sum = 0; + long sumsq = 0; + long max_ios = 0; + double val; + + ASSERT(eval_resilver(map, 0) == 0); /* not broken already */ + ASSERT(faults == 1 || faults == 2); + + map->nbroken = faults; + + for (f1 = 0; f1 < map->ndevs; f1++) { + map->broken[0] = f1; + + if (faults < 2) { + ios = eval_resilver(map, 0); /* eval single failure */ + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + max_ios = ios; + } + } else { /* eval double failure */ + for (f2 = f1 + 1; f2 < map->ndevs; f2++) { + map->broken[1] = f2; /* use 2nd hot spare */ + + ios = eval_resilver(map, 0); + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + worst2 = f2; + max_ios = ios; + } + } + } + } + map->nbroken = 0; + + if (print) { + map->nbroken = faults; + map->broken[0] = worst1; + /* slot 1 holds the second fault; only double-fault runs have one */ + if (faults > 1) + map->broken[1] = worst2; + +
eval_resilver(map, 1); + + map->nbroken = 0; + } + + switch (how) { + case EVAL_WORST: + /* + * imbalance from worst possible drive failure + * insensitive to failures handled better + */ + val = max_ios; + break; + case EVAL_MEAN: + /* + * average over all possible drive failures + * sensitive to all possible failures + */ + val = ((double)sum)/n; + break; + case EVAL_RMS: + /* + * root mean square over all possible drive failures + * penalizes higher imbalance more + */ + val = sqrt(((double)sumsq)/n); + break; + default: + ASSERT(0); + } + return ((val/map->nrows)*map->ngroups); +} + +static int +rand_in_range(int min, int count) +{ + return (min + drand48()*count); +} + +static void +permute_map(map_t *map, int temp) +{ + static int prev_temp; + + int nrows = (temp < 1) ? 1 : (temp > 100) ? + map->nrows : rand_in_range(1, (map->nrows * temp)/100); + int row = rand_in_range(0, map->nrows - nrows); + int ncols = map->ndevs; + int col = rand_in_range(0, map->ndevs - ncols); + int i; + + if (verbose > 0 && + temp != prev_temp && + (temp < 10 || (temp % 10 == 0))) + printf("Permute t %3d (%d-%d, %d-%d)\n", + temp, col, ncols, row, nrows); + prev_temp = temp; + + for (i = row; i < row + nrows; i++) + permute_devs(&map->rows[i][col], &map->rows[i][col], ncols); +} + +static map_t * +develop_map(map_t *map) +{ + map_t *dmap = new_map(map->groupsz, map->ngroups, + map->nspares, map->nrows * map->ndevs); + int base; + int dev; + int i; + + for (base = 0; base < map->nrows; base++) + for (dev = 0; dev < map->ndevs; dev++) + for (i = 0; i < map->ndevs; i++) + dmap->rows[base*map->ndevs + dev][i] = + (map->rows[base][i] + dev) % map->ndevs; + + return (dmap); +} + +static map_t * +optimize_map(map_t *map, int eval, int faults) +{ + double temp = 100.0; + double alpha = 0.995; + double epsilon = 0.001; + double val = eval_decluster(map, eval, faults, 0); + int ups = 0; + int downs = 0; + int sames = 0; + int iter = 0; + + while (temp > epsilon) { + map_t *map2 = 
dup_map(map); + double val2; + double delta; + + permute_map(map2, (int)temp); + + val2 = eval_decluster(map2, eval, faults, 0); + delta = (val2 - val); + + if (delta < 0 || exp(-10000*delta/temp) > drand48()) { + if (delta > 0) + ups++; + else if (delta < 0) + downs++; + else + sames++; + + free_map(map); + map = map2; + val = val2; + } else { + free_map(map2); + } + + temp *= alpha; + + if ((++iter % 100) == 0) { + if (verbose > 0) + printf("%f (%d ups, %d sames, %d downs)\n", + val, ups, sames, downs); + ups = downs = sames = 0; + } + } + + if (verbose > 0) + printf("%d iters, %d ups %d sames %d downs\n", + iter, ups, sames, downs); + return (map); +} + +static void +print_map_stats(map_t *map, int optimize, int print_ios) +{ + double score = eval_decluster(map, EVAL_WORST, 1, 0); + + printf("%6s (%2d x %2d + %2d) x %5d: %2.3f\n", + (optimize == UNOPT) ? "Unopt" : + (optimize == EVAL_WORST) ? "Worst" : + (optimize == EVAL_MEAN) ? "Mean" : "Rms", + map->ngroups, map->groupsz, map->nspares, map->nrows, score); + + if (map->ndevs < 80 && score >= 1.05) + printf("Warning score %6.3f has over 5 percent imbalance!\n", + score); + else if (score >= 1.1) + printf("Warning score %6.3f has over 10 percent imbalance!\n", + score); + +#ifdef FOOO + printf("Single: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 1, 0), + eval_decluster(map, EVAL_MEAN, 1, 0)); + + printf("Double: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 2, 0), + eval_decluster(map, EVAL_MEAN, 2, 0)); +#endif + + if (print_ios) { + eval_decluster(map, EVAL_WORST, 1, 1); + eval_decluster(map, EVAL_WORST, 2, 1); + } +} + +int +draid_permutation_generate(struct vdev_draid_configuration *cfg) +{ + const int loop = 16; /* HH: make this a parameter */ + const int faults = 1; + const int eval = EVAL_WORST; + + int groupsz = cfg->dcf_data + cfg->dcf_parity; + int nspares = cfg->dcf_spare; + int ngroups = (cfg->dcf_children - nspares) / groupsz; + int nrows; + int i, fd, urand_fd; + 
long int best_seed = 0; + map_t *best_map; + + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + if (fd == -1) { + perror("Cannot open /dev/random"); /* perror appends ": <reason>\n" itself */ + return (-1); + } + urand_fd = open("/dev/urandom", O_RDONLY); + + /* HH: fine tune these heuristics */ + if (cfg->dcf_children - nspares > 80) + nrows = 128; /* 81 - ? */ + else if (cfg->dcf_children - nspares > 40) + nrows = 64; /* 41 - 80 */ + else + nrows = 32; /* 1 - 40 */ + + for (i = 0, best_map = NULL; i < loop; i++) { + int rc; + long int seed; + map_t *map, *omap; + + rc = read(fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) { + printf("Not enough entropy at /dev/random: read %d, " + "wanted %zu.\n", rc, sizeof (seed)); + /* urand_fd may not be valid but it does not matter */ + rc = read(urand_fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) + break; + printf("Using /dev/urandom instead.\n"); + } + + srand48(seed); + + map = new_map(groupsz, ngroups, nspares, nrows); + omap = optimize_map(dup_map(map), eval, faults); + if (eval_decluster(omap, eval, faults, 0) > + eval_decluster(map, eval, faults, 0)) { + /* + * optimize_map() may create a worse map, because the + * simulated annealing process may accept worse + * neighbors to avoid getting stuck in local optima + */ + free_map(omap); + } else { + free_map(map); + map = omap; + } + + if (best_map == NULL || + eval_decluster(map, eval, faults, 0) < + eval_decluster(best_map, eval, faults, 0)) { + if (best_map != NULL) + free_map(best_map); + best_map = map; + best_seed = seed; + } else { + free_map(map); + } + } + + close(fd); + if (urand_fd != -1) + close(urand_fd); + if (i != loop) + fprintf(stderr, "Early termination at loop %d. 
Generated " + "permutations may not be optimal!\n", i + 1); + + if (best_map != NULL) { + int j; + map_t *dmap; + uint64_t *perms; + + assert(best_map->nrows == nrows); + assert(best_map->ndevs == cfg->dcf_children); + + perms = malloc(sizeof (*perms) * nrows * best_map->ndevs); + assert(perms != NULL); + + for (i = 0; i < nrows; i++) + for (j = 0; j < best_map->ndevs; j++) + perms[i * best_map->ndevs + j] = + best_map->rows[i][j]; + + cfg->dcf_bases = nrows; + cfg->dcf_base_perms = perms; + + if (verbose > 1) + print_map(best_map); + dmap = develop_map(best_map); + free_map(best_map); + print_map_stats(dmap, eval, 0); + printf("Seed chosen: %lx\n", best_seed); + free_map(dmap); + return (0); + } else { + return (-1); + } +} + +int +debug_main(int argc, char **argv) +{ + int ngroups = 0; + int groupsz = 0; + int nspares = 0; + int nrows = 0; + int optimize = UNOPT; + int faults = 1; + int develop = 0; + map_t *map; + int c; + + while ((c = getopt(argc, argv, "g:d:s:n:vUWMR12D")) != -1) + switch (c) { + case 'D': + develop = 1; + break; + case 'g': + sscanf(optarg, "%d", &ngroups); + break; + case 'd': + sscanf(optarg, "%d", &groupsz); + break; + case 's': + sscanf(optarg, "%d", &nspares); + break; + case 'n': + sscanf(optarg, "%d", &nrows); + break; + case 'v': + verbose++; + break; + case 'U': + optimize = UNOPT; + break; + case 'W': + optimize = EVAL_WORST; + break; + case 'M': + optimize = EVAL_MEAN; + break; + case 'R': + optimize = EVAL_RMS; + break; + case '1': + faults = 1; + break; + case '2': + faults = 2; + break; + default: + fprintf(stderr, "arg???\n"); + return (1); + } + + if (ngroups <= 0 || groupsz <= 0 || nspares <= 0 || nrows <= 0) { + fprintf(stderr, "missing arg???\n"); + return (1); + } + + map = new_map(groupsz, ngroups, nspares, nrows); + if (verbose > 1) + print_map(map); + + if (verbose > 0) + print_map_stats(map, UNOPT, 1); + + if (optimize != UNOPT) { + map = optimize_map(map, optimize, faults); + + if (verbose > 1) + print_map(map); + if 
(verbose > 0) + print_map_stats(map, optimize, 1); + } + + if (develop) { + map_t *dmap = develop_map(map); + + free_map(map); + map = dmap; + } + + print_map_stats(map, optimize, verbose > 0); + return (0); +} diff --git a/cmd/draidcfg/draid_permutation.h b/cmd/draidcfg/draid_permutation.h new file mode 100644 index 000000000000..8562ccf09852 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#ifndef _DRAID_PERMUTATION_H +#define _DRAID_PERMUTATION_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int draid_permutation_generate(struct vdev_draid_configuration *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DRAID_PERMUTATION_H */ diff --git a/cmd/draidcfg/draidcfg.c b/cmd/draidcfg/draidcfg.c new file mode 100644 index 000000000000..90e40a61a2e1 --- /dev/null +++ b/cmd/draidcfg/draidcfg.c @@ -0,0 +1,343 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +static struct vdev_draid_configuration * +draidcfg_find(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + /* P D D... P D D... S */ + static const uint64_t bases7[1][7] = {{1, 2, 4, 3, 6, 5, 0}}; + static const uint64_t bases11[1][11] = {{ + 1, 4, 5, 9, 3, 2, 8, 10, 7, 6, 0}}; + static const uint64_t bases19[1][19] = {{ + 1, 5, 6, 11, 17, 9, 7, 16, 4, 10, 12, 3, 15, 18, 14, 13, 8, 2, 0}}; + static const uint64_t bases23[1][23] = {{ + 1, 8, 18, 6, 2, 16, 13, 12, 4, 9, 3, 10, 11, 19, 14, 20, 22, + 15, 5, 17, 21, 7, 0}}; + static const uint64_t bases31[1][31] = {{ + 1, 8, 2, 16, 4, 17, 12, 3, 24, 6, 10, 18, 20, 5, 9, 15, 27, 30, 23, + 29, 7, 25, 14, 19, 28, 26, 22, 21, 13, 11, 0}}; + static const uint64_t bases41[1][41] = {{ + 1, 25, 10, 4, 18, 40, 16, 31, 37, 23, 6, 27, 19, + 24, 26, 35, 14, 22, 17, 15, 36, 39, 32, 21, 33, + 5, 2, 9, 20, 8, 11, 29, 28, 3, 34, 30, 12, 13, 38, 7, 0}}; + + static struct vdev_draid_configuration known_cfgs[6] = { + { + .dcf_data = 2, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 7, + .dcf_bases = 1, .dcf_base_perms = &bases7[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 11, + .dcf_bases = 
1, .dcf_base_perms = &bases11[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 19, + .dcf_bases = 1, .dcf_base_perms = &bases19[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 3, .dcf_spare = 1, .dcf_children = 23, + .dcf_bases = 1, .dcf_base_perms = &bases23[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 31, + .dcf_bases = 1, .dcf_base_perms = &bases31[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 2, .dcf_spare = 1, .dcf_children = 41, + .dcf_bases = 1, .dcf_base_perms = &bases41[0][0] + }, + }; + + int i; + + for (i = 0; i < sizeof (known_cfgs) / sizeof (known_cfgs[0]); i++) { + struct vdev_draid_configuration *cfg = &known_cfgs[i]; + + if (data == cfg->dcf_data && parity == cfg->dcf_parity && + spare == cfg->dcf_spare && children == cfg->dcf_children) + return (cfg); + } + + return (NULL); +} + +static struct vdev_draid_configuration * +draidcfg_create(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + struct vdev_draid_configuration *cfg = calloc(1, sizeof (*cfg)); + + assert(cfg != NULL); + cfg->dcf_data = data; + cfg->dcf_parity = parity; + cfg->dcf_spare = spare; + cfg->dcf_children = children; + + cfg->dcf_bases = 0; + cfg->dcf_base_perms = NULL; + if (draid_permutation_generate(cfg) != 0) { + free(cfg); + return (NULL); + } + + assert(cfg->dcf_bases != 0); + assert(cfg->dcf_base_perms != NULL); + return (cfg); +} + +static inline void +draidcfg_free(struct vdev_draid_configuration *cfg) +{ + free((void *)cfg->dcf_base_perms); + free(cfg); +} + +static int +draidcfg_create_file(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children, const char *path) +{ + FILE *fp; + size_t len; + int ret = 0; + void *packed; + nvlist_t *nvl; + boolean_t freecfg = B_FALSE; + struct vdev_draid_configuration *cfg; + + ASSERT(children != 0); + ASSERT3U(children, <=, VDEV_DRAID_MAX_CHILDREN); + + if (children - 1 > 
VDEV_DRAID_U8_MAX) { + fprintf(stderr, "Configuration for over %u children " + "is not supported\n", VDEV_DRAID_U8_MAX + 1); + return (1); + } + + cfg = draidcfg_find(data, parity, spare, children); + if (cfg == NULL) { + cfg = draidcfg_create(data, parity, spare, children); + if (cfg == NULL) { + fprintf(stderr, "Cannot create " + "supported configuration\n"); + return (1); + } + freecfg = B_TRUE; + } + + fp = fopen(path, "w+"); + if (fp == NULL) { + fprintf(stderr, "Cannot open file %s for write\n", path); + if (freecfg) + draidcfg_free(cfg); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA, data); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_PARITY, parity); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_SPARE, spare); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_CHILDREN, children); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE, cfg->dcf_bases); + + if (children - 1 <= VDEV_DRAID_U8_MAX) { + int i, j; + uint8_t *val = calloc(children * cfg->dcf_bases, sizeof (*val)); + + assert(val != NULL); /* same OOM policy as the other allocations here */ + for (i = 0; i < cfg->dcf_bases; i++) { + for (j = 0; j < children; j++) { + uint64_t c = + cfg->dcf_base_perms[i * children + j]; + + ASSERT3U(c, <, children); + ASSERT3U(c, <=, VDEV_DRAID_U8_MAX); + val[i * children + j] = (uint8_t)c; + } + } + + fnvlist_add_uint8_array(nvl, ZPOOL_CONFIG_DRAIDCFG_PERM, + val, children * cfg->dcf_bases); + free(val); + } else { + ASSERT3U(children, ==, 0); /* not supported yet */ + } + + assert(vdev_draid_config_validate(NULL, nvl)); + + packed = fnvlist_pack_xdr(nvl, &len); + if (fwrite(packed, 1, len, fp) != len) { + ret = 1; + fprintf(stderr, "Cannot write %zu bytes to %s\n", len, path); + } + + fnvlist_pack_free(packed, len); + fnvlist_free(nvl); + if (freecfg) + draidcfg_free(cfg); + fclose(fp); + return (ret); +} + +static void +draidcfg_print(nvlist_t *config) +{ + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b, i; + + n = fnvlist_lookup_uint64(config, 
ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + d = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA); + p = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_PARITY); + s = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_SPARE); + b = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE); + + printf("dRAID%lu vdev of %lu child drives: %lu x (%lu data + " + "%lu parity) and %lu distributed spare\n", + p, n, (n - s) / (d + p), d, p, s); + printf("Using %lu base permutation%s\n", b, b > 1 ? "s" : ""); + + VERIFY0(nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c)); + ASSERT3U(c, ==, b * n); + + for (i = 0; i < b; i++) { + int j; + + printf(" "); + for (j = 0; j < n; j++) + printf("%*u,", n > 99 ? 3 : 2, perm[i * n + j]); + printf("\n"); + } +} + +static inline int usage(void) +{ + printf(gettext("draidcfg [-r] [-n children] [-d data] [-p parity]" + " [-s spare] \n")); + return (1); +} + +int +main(int argc, char **argv) +{ + boolean_t read = B_FALSE; + char *cfg = NULL; + uint64_t data = 0, parity = 0, spare = 0, children = 0; + int c; + + while ((c = getopt(argc, argv, "rn:d:p:s:")) != -1) { + char *endptr; + uint64_t *p = NULL; + + switch (c) { + case 'r': + read = B_TRUE; + break; + case 'n': + p = &children; + case 'd': + if (p == NULL) + p = &data; + case 'p': + if (p == NULL) + p = &parity; + case 's': + if (p == NULL) + p = &spare; + + errno = 0; + *p = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + fprintf(stderr, + gettext("Invalid -%c value: %s\n"), + c, optarg); + return (usage()); + } + break; + case ':': + fprintf(stderr, gettext("Missing argument for " + "'%c' option\n"), optopt); + return (usage()); + case '?': + fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + return (usage()); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + fprintf(stderr, + gettext("Missing configuration file argument\n")); + return (usage()); + } + + cfg = argv[0]; + + if (read) { + nvlist_t *nvl 
= draidcfg_read_file(cfg); + + if (nvl == NULL) { + return (1); + } else { + draidcfg_print(nvl); + nvlist_free(nvl); + return (0); + } + } + + assert(!read); + + if (data == 0 || parity == 0 || spare == 0 || children == 0) { + fprintf(stderr, + gettext("Missing data/parity/spare/children argument\n")); + return (usage()); + } + + if (parity > VDEV_RAIDZ_MAXPARITY) { + fprintf(stderr, gettext("Invalid parity %lu\n"), parity); + return (usage()); + } + + /* the drives left over after filling whole (data + parity) groups must equal the spare count */ + if (children % (data + parity) != spare) { + fprintf(stderr, gettext("Invalid draid configuration\n")); + return (usage()); + } + + return (draidcfg_create_file(data, parity, spare, children, cfg)); +} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 21113da2f03c..db73a439e474 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -923,7 +923,10 @@ dump_metaslab(metaslab_t *msp) } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index c03da941dbb3..1a5075fcddd4 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -16,6 +16,7 @@ zpool_SOURCES = \ zpool_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs/libzfs.la zpool_LDADD += -lm $(LIBBLKID) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 5af626558aa8..7e48c67c0b4c 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -6350,7 +6351,8 @@ print_scan_status(pool_scan_stat_t *ps) zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == 
POOL_SCAN_RESILVER); + ps->pss_func == POOL_SCAN_RESILVER || + ps->pss_func == POOL_SCAN_REBUILD); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { @@ -6374,6 +6376,13 @@ print_scan_status(pool_scan_stat_t *ps) (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, (u_longlong_t)ps->pss_errors, ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuilt %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { @@ -6383,6 +6392,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild canceled on %s"), + ctime(&end)); } return; } @@ -6403,6 +6415,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild in progress since %s"), + ctime(&start)); } scanned = ps->pss_examined; @@ -6452,6 +6467,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("\t%s rebuilt, %.2f%% done"), + processed_buf, 100 * fraction_done); } if (pause == 0) { diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 37f1ca1d9beb..deefcaa53a87 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -84,6 +84,7 @@ #include #include "zpool_util.h" #include +#include /* * For 
any given vdev specification, we can have multiple errors. The @@ -592,6 +593,7 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * %draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) @@ -634,6 +636,11 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (arg[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + ashift = 12; + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -662,17 +669,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -836,7 +845,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -1427,7 +1437,8 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0 || + strncmp(type, VDEV_TYPE_DRAID, 5) == 0) { const char *p = type + 5; char *end; long nparity; @@ -1447,8 +1458,12 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) - *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + *maxdev = VDEV_DRAID_MAX_CHILDREN; + + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0) + return (VDEV_TYPE_RAIDZ); + else + return (VDEV_TYPE_DRAID); } if (maxdev != NULL) @@ -1524,6 +1539,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; + nvlist_t *draidcfg = NULL; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { @@ -1601,6 +1617,34 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + + if (strcmp(type, VDEV_TYPE_DRAID) == 0 && + strncmp(argv[c], "cfg=", 4) == 0) { + if (draidcfg == NULL) { + draidcfg = + draidcfg_read_file(argv[c] + + 4); + if (draidcfg != NULL) + continue; + fprintf(stderr, + gettext("invalid draid " + 
"configuration '%s'\n"), + argv[c]); + } else { + fprintf(stderr, + gettext("dRAID config " + "specified more than " + "once: %s\n"), argv[c]); + } + + /* children not bumped yet: every slot is live */ + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + nvlist_free(draidcfg); + return (NULL); + } + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1670,7 +1714,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); @@ -1682,6 +1727,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); + + if (draidcfg != NULL) { + ASSERT0(strcmp(type, VDEV_TYPE_DRAID)); + + if (!vdev_draid_config_add(nv, + draidcfg)) + fprintf(stderr, + gettext("ignoring invalid " + "draid config\n")); + + nvlist_free(draidcfg); + draidcfg = NULL; + } } } else { /* diff --git a/configure.ac b/configure.ac index 301258e7f756..5fe18cadaaae 100644 --- a/configure.ac +++ b/configure.ac @@ -119,6 +119,7 @@ AC_CONFIG_FILES([ cmd/arc_summary/Makefile cmd/zed/Makefile cmd/raidz_test/Makefile + cmd/draidcfg/Makefile cmd/zgenhostid/Makefile contrib/Makefile contrib/bash_completion.d/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index a8e3c9c404bc..a93b142b3127 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -429,7 +429,6 @@ extern int zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp, importargs_t *args); /* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); @@ -893,6 +892,11 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern int 
zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * dRAID import support + */ +nvlist_t *draidcfg_read_file(const char *); + /* * Support for Linux libudev derived persistent device strings */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 8bf376998bf6..b5d455678b61 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -93,11 +93,13 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_disk.h \ $(top_srcdir)/include/sys/vdev_file.h \ $(top_srcdir)/include/sys/vdev.h \ + $(top_srcdir)/include/sys/vdev_scan.h \ $(top_srcdir)/include/sys/vdev_impl.h \ $(top_srcdir)/include/sys/vdev_indirect_births.h \ $(top_srcdir)/include/sys/vdev_indirect_mapping.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ + $(top_srcdir)/include/sys/vdev_draid_impl.h \ $(top_srcdir)/include/sys/vdev_removal.h \ $(top_srcdir)/include/sys/xvattr.h \ $(top_srcdir)/include/sys/zap.h \ diff --git a/include/sys/arc.h b/include/sys/arc.h index dc2fd03647f3..b74ae5bd2da6 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -292,6 +292,7 @@ void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_target_bytes(void); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index bc7046fdced8..c35dd6c155b7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -382,6 +382,7 @@ typedef struct dmu_buf { #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_REBUILDING "com.intel:rebuilding" /* * Allocate an object from this objset. 
The range of object numbers diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 6bbf8434619c..0ba297347308 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -711,6 +711,15 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ + +#define ZPOOL_CONFIG_DRAIDCFG "com.intel:draid_config" +#define ZPOOL_CONFIG_DRAIDCFG_DATA "com.intel:draid_data" +#define ZPOOL_CONFIG_DRAIDCFG_PARITY "com.intel:draid_parity" +#define ZPOOL_CONFIG_DRAIDCFG_SPARE "com.intel:draid_spare" +#define ZPOOL_CONFIG_DRAIDCFG_BASE "com.intel:draid_base" +#define ZPOOL_CONFIG_DRAIDCFG_CHILDREN "com.intel:draid_children" +#define ZPOOL_CONFIG_DRAIDCFG_PERM "com.intel:draid_perm" + #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ @@ -745,6 +754,8 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -866,6 +877,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_REBUILD, /* sequential SPA scan */ POOL_SCAN_FUNCS } pool_scan_func_t; @@ -1317,6 +1329,8 @@ typedef enum { * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END + * ESC_ZFS_REBUILD_START + * ESC_ZFS_REBUILD_FINISH * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index aa1c82a0258e..143213eeadce 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -366,6 +366,7 @@ struct metaslab { */ 
boolean_t ms_loaded; boolean_t ms_loading; + boolean_t ms_rebuilding; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index e8567933d2bc..3d5a82aff3f0 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -277,6 +277,7 @@ nvlist_t *fnvlist_alloc(void); void fnvlist_free(nvlist_t *); size_t fnvlist_size(nvlist_t *); char *fnvlist_pack(nvlist_t *, size_t *); +char *fnvlist_pack_xdr(nvlist_t *, size_t *); void fnvlist_pack_free(char *, size_t); nvlist_t *fnvlist_unpack(char *, size_t); nvlist_t *fnvlist_dup(nvlist_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index 443d835a1bd0..98f38a0b795b 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -773,12 +773,6 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 -/* - * Controls the behavior of spa_vdev_remove(). - */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 - /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9dbdcfcf5284..be3f7433fa20 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include #include @@ -295,10 +297,11 @@ struct spa { spa_condensing_indirect_phys_t spa_condensing_indirect_phys; spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ - uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ - zthr_t *spa_checkpoint_discard_zthr; + zthr_t *spa_checkpoint_discard_zthr; + + spa_vdev_scan_t *spa_vdev_scan; char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index aa13bd5052c7..4678160f8ee5 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -95,6 +95,8 @@ extern "C" { */ #define ESC_ZFS_RESILVER_START "resilver_start" #define ESC_ZFS_RESILVER_FINISH "resilver_finish" +#define ESC_ZFS_REBUILD_START "rebuild_start" +#define ESC_ZFS_REBUILD_FINISH "rebuild_finish" #define ESC_ZFS_VDEV_REMOVE "vdev_remove" #define ESC_ZFS_VDEV_REMOVE_AUX "vdev_remove_aux" #define ESC_ZFS_VDEV_REMOVE_DEV "vdev_remove_dev" diff --git a/include/sys/vdev.h b/include/sys/vdev.h index b37b60bdd14d..968e2f0991aa 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -121,6 +121,7 @@ extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead_at(vdev_t *vd, uint64_t offset); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h new file mode 100644 index 000000000000..1bc9ebeab0d0 --- /dev/null +++ b/include/sys/vdev_draid_impl.h @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _VDEV_DRAID_IMPL_H +#define _VDEV_DRAID_IMPL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +struct vdev_draid_configuration { + uint64_t dcf_data; + uint64_t dcf_parity; + uint64_t dcf_spare; + uint64_t dcf_children; + uint64_t dcf_bases; + abd_t *dcf_zero_abd; + const uint64_t *dcf_base_perms; +}; + +extern boolean_t vdev_draid_ms_mirrored(const vdev_t *, uint64_t); +extern boolean_t vdev_draid_group_degraded(vdev_t *, vdev_t *, + uint64_t, uint64_t, boolean_t); +extern uint64_t vdev_draid_check_block(const vdev_t *vd, uint64_t, uint64_t); +extern uint64_t vdev_draid_get_astart(const vdev_t *, const uint64_t); +extern uint64_t vdev_draid_offset2group(const vdev_t *, uint64_t, boolean_t); +extern uint64_t vdev_draid_group2offset(const vdev_t *, uint64_t, boolean_t); +extern boolean_t vdev_draid_is_remainder_group(const vdev_t *, + uint64_t, boolean_t); +extern uint64_t vdev_draid_get_groupsz(const vdev_t *, boolean_t); +extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); +extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); +extern void vdev_draid_fix_skip_sectors(zio_t *); +extern int vdev_draid_hide_skip_sectors(raidz_map_t *); +extern void vdev_draid_restore_skip_sectors(raidz_map_t *, int); +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_is_dead(vdev_t *, uint64_t); +extern boolean_t 
vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern nvlist_t *vdev_draid_spare_read_config(vdev_t *); +extern uint64_t vdev_draid_asize2psize(vdev_t *, uint64_t, uint64_t); +extern uint64_t vdev_draid_max_rebuildable_asize(vdev_t *, uint64_t); + +#define VDEV_DRAID_MAX_CHILDREN 255 +#define VDEV_DRAID_U8_MAX ((uint8_t)-1) + +/* + * Double '%' characters in the front because it's used as format string in + * scanf()/printf() family of functions + */ +#define VDEV_DRAID_SPARE_PATH_FMT "%%"VDEV_TYPE_DRAID"%lu-%lu-s%lu" + +#ifdef _KERNEL +#define U64FMT "%llu" +#ifdef ZFS_IS_GPL_COMPATIBLE +#define draid_print(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +#else +#define draid_print(fmt, ...) printk(fmt, ##__VA_ARGS__) +#endif +#define draid_console(fmt, ...) printk(KERN_EMERG fmt, ##__VA_ARGS__) +#else /* _KERNEL */ +#include +#define U64FMT "%"PRIu64 +#define draid_print(fmt, ...) printf(fmt, ##__VA_ARGS__) +#define draid_console(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#endif + +extern int draid_debug_lvl; +extern void vdev_draid_debug_zio(zio_t *, boolean_t); + +#define draid_dbg(lvl, fmt, ...) \ + do { \ + if ((lvl) == 0) \ + draid_console(fmt, ##__VA_ARGS__); \ + else if (draid_debug_lvl >= (lvl)) \ + draid_print(fmt, ##__VA_ARGS__); \ + } while (0) /* caller supplies the ';' */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDEV_DRAID_IMPL_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index e055161e8374..1a7e60263545 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -250,6 +250,9 @@ struct vdev { /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + uint64_t vdev_last_io; /* lbolt of last non-scan I/O */ + nvlist_t *vdev_cfg; /* additional dRAID configuration */ + /* * Values stored in the config for an indirect or removing vdev. 
*/ @@ -466,6 +469,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -473,6 +478,31 @@ extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; +/* + * Virtual device vector for mirroring. + */ +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +extern mirror_map_t *vdev_mirror_map_alloc(int, boolean_t, boolean_t); +extern const zio_vsd_ops_t vdev_mirror_vsd_ops; + /* * Common size functions */ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 0799ed19dfc8..32ae63471a77 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -106,6 +107,7 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + abd_t *rc_abd_skip; /* Skip sector */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? 
*/ @@ -123,13 +125,18 @@ typedef struct raidz_map { uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_skip; /* dRAID skip sectors */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + vdev_t *rm_vdev; /* RAIDz/dRAID vdev */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; +#define vdev_raidz_map_declustered(rm) ((rm)->rm_vdev != NULL && \ + (rm)->rm_vdev->vdev_ops == &vdev_draid_ops) + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; diff --git a/include/sys/vdev_scan.h b/include/sys/vdev_scan.h new file mode 100644 index 000000000000..151fd7ca4615 --- /dev/null +++ b/include/sys/vdev_scan.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. 
+ */ + +#ifndef _SYS_VDEV_SCAN_H +#define _SYS_VDEV_SCAN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_rebuilding_phys { + uint64_t sr_vdev; + uint64_t sr_oldvd; + int64_t sr_ms; +} spa_rebuilding_phys_t; + +typedef struct spa_vdev_scan { + dsl_pool_t *svs_dp; + vdev_t *svs_vd; + kthread_t *svs_thread; + kmutex_t svs_lock; + kcondvar_t svs_cv; + boolean_t svs_thread_exit; + uint64_t svs_dtl_max; + int svs_msi; + int svs_msi_synced; + int *svs_ms_done; + + kmutex_t svs_io_lock; + kcondvar_t svs_io_cv; + uint64_t svs_io_asize; + + spa_rebuilding_phys_t svs_phys; +} spa_vdev_scan_t; + +extern void spa_vdev_scan_setup_sync(dmu_tx_t *); +extern void spa_vdev_scan_start(spa_t *, vdev_t *, int, uint64_t); +extern int spa_vdev_scan_restart(vdev_t *); +extern int spa_vdev_scan_rebuild_cb(dsl_pool_t *, + const blkptr_t *, const zbookmark_phys_t *); +extern void spa_vdev_scan_suspend(spa_t *); +extern void spa_vdev_scan_destroy(spa_t *); +extern void spa_vdev_scan_sync_state(spa_vdev_scan_t *, dmu_tx_t *); + +#define DSL_SCAN_IS_REBUILD(scn) ((scn)->scn_phys.scn_func == POOL_SCAN_REBUILD) + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_SCAN_H */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 84bc7f816734..63f0eba20e67 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -65,6 +65,7 @@ typedef enum spa_feature { SPA_FEATURE_POOL_CHECKPOINT, SPA_FEATURE_SPACEMAP_V2, SPA_FEATURE_ALLOCATION_CLASSES, + SPA_FEATURE_DRAID, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index da40c96ce77b..0709c5c5298e 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -60,6 +60,7 @@ libzfs_la_LIBADD = \ $(top_builddir)/lib/libshare/libshare.la \ $(top_builddir)/lib/libtpool/libtpool.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la 
libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBSSL) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 634c076b89cf..b0d574e55501 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include "libzfs.h" #include "libzfs_impl.h" @@ -912,7 +913,7 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) /* * Determine if the vdev id is a hole in the namespace. */ -boolean_t +static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; @@ -926,6 +927,64 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) return (B_FALSE); } +nvlist_t * +draidcfg_read_file(const char *path) +{ + int fd; + struct stat64 sb; + char *buf; + nvlist_t *config; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, "Cannot open '%s'\n", path); + return (NULL); + } + + if (fstat64(fd, &sb) != 0) { + (void) fprintf(stderr, "Failed to stat '%s'\n", path); + close(fd); + return (NULL); + } + + if (!S_ISREG(sb.st_mode)) { + (void) fprintf(stderr, "Not a regular file '%s'\n", path); + close(fd); + return (NULL); + } + + if ((buf = malloc(sb.st_size)) == NULL) { + (void) fprintf(stderr, "Failed to allocate %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + return (NULL); + } + + if (read(fd, buf, sb.st_size) != sb.st_size) { + (void) fprintf(stderr, "Failed to read %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + free(buf); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "Failed to unpack nvlist\n"); + free(buf); + return (NULL); + } + + free(buf); + + if (!vdev_draid_config_validate(NULL, config)) { + nvlist_free(config); + return (NULL); + } + + return (config); +} + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. 
Once that's done, @@ -2146,17 +2205,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) return (ret); } -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index fca1a4178a42..5fb041b8ce07 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include "zfs_namecheck.h" @@ -972,6 +973,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -2363,6 +2365,7 @@ zpool_vdev_is_interior(const char *name) strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || + strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); @@ -2953,6 +2956,10 @@ zpool_vdev_attach(zpool_handle_t *zhp, if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); + else if (new_disk[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dspare can only replace a child " + "drive in its parent draid vdev")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3346,6 +3353,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) snprintf(msg, sizeof (msg), 
dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + if (path[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spare cannot be removed")); + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + } + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) @@ -3895,7 +3908,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * Remove the partition from the path it this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3905,7 +3919,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * If it's a raidz device, we need to stick in the parity level. */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 || + strcmp(path, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index e10f20dd98dc..1a2c2c9b39b8 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -109,6 +109,7 @@ KERNEL_C = \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ @@ -129,6 +130,7 @@ KERNEL_C = \ vdev_raidz_math_ssse3.c \ vdev_removal.c \ vdev_root.c \ + vdev_scan.c \ zap.c \ zap_leaf.c \ zap_micro.c \ diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index 5cc4db45e0ed..7516c65eca11 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -305,6 +305,23 @@ required in order to support crash dumps under Linux. Existing pools where this feature is \fB\fBactive\fR can be imported. 
.RE +.sp +.ne 2 +.na +\fB\fBdraid\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.intel:draid +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables use of the \fBdraid\fR vdev driver. + +.RE + .sp .ne 2 .na diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c index dc8257e48594..b0645637b7ac 100644 --- a/module/nvpair/fnvpair.c +++ b/module/nvpair/fnvpair.c @@ -73,15 +73,26 @@ fnvlist_size(nvlist_t *nvl) * Returns allocated buffer of size *sizep. Caller must free the buffer with * fnvlist_pack_free(). */ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) +static char * +fnvlist_pack_enc(nvlist_t *nvl, size_t *sizep, int encoding) { char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); + VERIFY3U(nvlist_pack(nvl, &packed, sizep, encoding, KM_SLEEP), ==, 0); return (packed); } +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_NATIVE)); +} + +char * +fnvlist_pack_xdr(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_XDR)); +} + /*ARGSUSED*/ void fnvlist_pack_free(char *pack, size_t size) diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index e630481cbff3..15b6063ccb50 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -253,6 +253,10 @@ zpool_feature_init(void) "Crash dumps to multiple vdev pools.", 0, NULL); + zfeature_register(SPA_FEATURE_DRAID, + "com.intel:draid", "draid", "draid vdev driver.", + ZFEATURE_FLAG_MOS, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 58b23b0e00b0..af0373267a65 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -381,7 +381,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) 
return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index a243f51d86bd..0ee803553a46 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -85,6 +85,7 @@ $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o $(MODULE)-objs += vdev_disk.o +$(MODULE)-objs += vdev_draid.o $(MODULE)-objs += vdev_file.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o @@ -98,6 +99,7 @@ $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o $(MODULE)-objs += vdev_removal.o $(MODULE)-objs += vdev_root.o +$(MODULE)-objs += vdev_scan.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o $(MODULE)-objs += zap_micro.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 07612468dcb5..60114a02069c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7683,6 +7683,12 @@ arc_target_bytes(void) return (arc_c); } +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + void arc_init(void) { diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b84c2aa45fd7..b43986936c7e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -199,8 +200,9 @@ int zfs_free_bpobj_enabled = 1; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, - dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ - dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + spa_vdev_scan_rebuild_cb, /* POOL_SCAN_REBUILD */ }; /* In core node for the scn->scn_queue. 
Represents a dataset to be scanned */ @@ -330,8 +332,11 @@ dsl_scan_is_running(const dsl_scan_t *scn) boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { - return (dsl_scan_is_running(dp->dp_scan) && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + dsl_scan_t *scn = dp->dp_scan; + + return (dsl_scan_is_running(scn) && + (scn->scn_phys.scn_func == POOL_SCAN_RESILVER || + DSL_SCAN_IS_REBUILD(scn))); } static inline void @@ -480,6 +485,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); } + + if (DSL_SCAN_IS_REBUILD(scn) && + scn->scn_phys.scn_state == DSS_SCANNING) { + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + scn->scn_phys.scn_state = DSS_CANCELED; + } } /* reload the queue into the in-core state */ @@ -631,6 +642,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + ASSERT(*funcp != POOL_SCAN_REBUILD); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; @@ -754,18 +766,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; - int i; + boolean_t rebuild = DSL_SCAN_IS_REBUILD(scn); - /* Remove any remnants of an old-style scrub. */ - for (i = 0; old_names[i]; i++) { - (void) zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); - } + if (!rebuild) { + int i; - if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = 0; + /* Remove any remnants of an old-style scrub. 
*/ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } } scan_ds_queue_clear(scn); @@ -802,7 +818,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) || rebuild) { spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; @@ -821,9 +837,16 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE); - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + const char *name; + + if (rebuild) + name = ESC_ZFS_REBUILD_FINISH; + else if (scn->scn_phys.scn_min_txg) + name = ESC_ZFS_RESILVER_FINISH; + else + name = ESC_ZFS_SCRUB_FINISH; + + spa_event_notify(spa, NULL, NULL, name); } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE); @@ -853,6 +876,8 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); + if (DSL_SCAN_IS_REBUILD(scn)) + return (SET_ERROR(ENOTSUP)); return (0); } @@ -943,6 +968,9 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { + if (dp->dp_spa->spa_vdev_scan != NULL) + return; + if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); @@ -2322,7 +2350,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, zbookmark_phys_t zb = { 0 }; int p; - if (!dsl_scan_is_running(scn)) + if (!dsl_scan_is_running(scn) || DSL_SCAN_IS_REBUILD(scn)) return; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { @@ -3010,10 
+3038,7 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) - return (B_FALSE); - - return (B_TRUE); + return (vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)); } static int @@ -3181,11 +3206,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + if (spa->spa_vdev_scan != NULL) + func = POOL_SCAN_REBUILD; + else + func = POOL_SCAN_RESILVER; + } zfs_dbgmsg("restarting scan func=%u txg=%llu", func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); + if (func == POOL_SCAN_REBUILD) + spa_vdev_scan_setup_sync(tx); + else + dsl_scan_setup_sync(&func, tx); } /* @@ -3237,6 +3269,47 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; + if (DSL_SCAN_IS_REBUILD(scn)) { + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + int msi; + boolean_t done; + + ASSERT(svs != NULL); + + mutex_enter(&svs->svs_lock); + done = (svs->svs_thread == NULL) ? 
B_TRUE : B_FALSE; + msi = svs->svs_msi_synced; + mutex_exit(&svs->svs_lock); + + if (done) { + boolean_t complete = !svs->svs_thread_exit; + + if (complete) { + ASSERT3U(msi + 1, ==, + svs->svs_vd->vdev_top->vdev_ms_count); + svs->svs_phys.sr_ms = -1; + svs->svs_phys.sr_vdev = 0; + svs->svs_phys.sr_oldvd = 0; + } + dsl_scan_done(scn, complete, tx); + /* + * HH: remove calls to dsl_scan_sync_state() here and + * below, when states shared with DSL scan are removed + */ + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_vdev_scan_sync_state(svs, tx); + + spa_vdev_scan_destroy(spa); + svs = NULL; + } else if (msi == -1 || msi > svs->svs_phys.sr_ms) { + svs->svs_phys.sr_ms = msi; + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_vdev_scan_sync_state(svs, tx); + } + /* Rebuild is mostly handled in the open-context scan thread */ + return; + } + /* * Wait a few txgs after importing to begin scanning so that * we can get the pool imported quickly. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index a117dc4460b8..cb82481086f1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -219,7 +220,6 @@ boolean_t metaslab_trace_enabled = B_TRUE; uint64_t metaslab_trace_max_entries = 5000; #endif -static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); @@ -1167,8 +1167,8 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, + uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); @@ -1176,8 +1176,27 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t next_offset; + + if (vd->vdev_ops != &vdev_draid_ops) { + *cursor = offset + size; + return (offset); + } + + next_offset = vdev_draid_check_block(vd, offset, size); + if (next_offset == offset) { + *cursor = offset + size; + return (offset); + } + + offset = P2ROUNDUP(next_offset, align); + if (offset + size <= rs->rs_end) { + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, size)); + *cursor = offset + size; + return (offset); + } } rs = AVL_NEXT(t, rs); } @@ -1190,7 +1209,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ @@ -1214,7 +1233,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_allocatable->rt_root; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { @@ -1267,7 +1286,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { @@ -1502,11 +1521,19 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; + if (vd->vdev_ops == &vdev_draid_ops) { + uint64_t astart = vdev_draid_get_astart(vd, ms->ms_start); + + ASSERT3U(astart - ms->ms_start, <, ms->ms_size); + ms->ms_size -= astart - ms->ms_start; + ms->ms_start = astart; + } /* * We only open space map objects that already exist. All others @@ -1730,6 +1757,30 @@ metaslab_set_fragmentation(metaslab_t *msp) msp->ms_fragmentation = fragmentation; } +/* + * dRAID metaslabs start at a certain alignment, which causes their sizes to + * vary by a few sectors. The block allocator may get confused and pick a + * distant metaslab because the closer ones are slightly smaller. The small + * variance doesn't matter when the metaslab has already been allocated from. + * + * This function returns adjusted size to calculate metaslab weight, and + * should not be used for other purposes. + */ +static uint64_t +metaslab_weight_size(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t size; + + if (vd->vdev_ops != &vdev_draid_ops || + space_map_allocated(msp->ms_sm) != 0) + return (msp->ms_size); + + size = 1ULL << vd->vdev_ms_shift; + ASSERT3U(size, >=, msp->ms_size); + return (size); +} + /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, @@ -1748,7 +1799,7 @@ metaslab_space_weight(metaslab_t *msp) /* * The baseline weight is the metaslab's free space. 
*/ - space = msp->ms_size - space_map_allocated(msp->ms_sm); + space = metaslab_weight_size(msp) - space_map_allocated(msp->ms_sm); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { @@ -1885,7 +1936,7 @@ metaslab_segment_weight(metaslab_t *msp) * The metaslab is completely free. */ if (space_map_allocated(msp->ms_sm) == 0) { - int idx = highbit64(msp->ms_size) - 1; + int idx = highbit64(metaslab_weight_size(msp)) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { @@ -2401,10 +2452,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * could call into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * - * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. Note - * that the ms_lock is insufficient for this, because it is dropped - * by space_map_write(). + * The spa_vdev_remove_thread() or spa_scan_thread() can be reading + * metaslab state concurrently, and it is locked out by the + * ms_sync_lock. Note that the ms_lock is insufficient for this, + * because it is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2967,6 +3018,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2979,7 +3031,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_remove(rt, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); @@ -3029,7 +3081,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, /* * If the selected metaslab is condensing, skip it.
*/ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_rebuilding) continue; *was_active = msp->ms_allocator != -1; @@ -3192,7 +3244,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we can't manipulate this metaslab until it's committed * to disk. */ - if (msp->ms_condensing) { + if (msp->ms_condensing || msp->ms_rebuilding) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); metaslab_passivate(msp, msp->ms_weight & @@ -3248,7 +3300,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we may end up in an infinite loop retrying the same * metaslab. */ - ASSERT(!metaslab_should_allocate(msp, asize)); + ASSERT(!metaslab_should_allocate(msp, asize) || + mg->mg_vd->vdev_ops == &vdev_draid_ops); mutex_exit(&msp->ms_lock); } @@ -3547,6 +3600,7 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a1851bca25ab..76eff8beb74f 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include #include #include @@ -1436,6 +1438,7 @@ spa_unload(spa_t *spa) * Stop async tasks. */ spa_async_suspend(spa); + spa_vdev_scan_suspend(spa); /* * Stop syncing. @@ -1490,6 +1493,8 @@ spa_unload(spa_t *spa) spa_condense_fini(spa); + spa_vdev_scan_destroy(spa); + bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -4217,7 +4222,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Check all DTLs to see if anything needs resilvering. 
*/ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa_vdev_scan_restart(spa->spa_root_vdev) != 0) spa_async_request(spa, SPA_ASYNC_RESILVER); /* @@ -4979,6 +4985,72 @@ spa_create_check_encryption_params(dsl_crypto_params_t *dcp, return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } +static int +spa_add_draid_spare(nvlist_t *nvroot, vdev_t *rvd) +{ + int i, j, n; + nvlist_t **oldspares, **newspares; + uint_t nspares; + vdev_t *c; + struct vdev_draid_configuration *cfg; + + for (i = 0, n = 0; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops == &vdev_draid_ops) { + cfg = c->vdev_tsd; + ASSERT(cfg != NULL); + n += cfg->dcf_spare; + } + } + + if (n == 0) + return (0); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &oldspares, &nspares) != 0) + nspares = 0; + + newspares = kmem_alloc(sizeof (*newspares) * (n + nspares), KM_SLEEP); + for (i = 0; i < nspares; i++) + newspares[i] = fnvlist_dup(oldspares[i]); + + for (i = 0, n = nspares; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops != &vdev_draid_ops) + continue; + + cfg = c->vdev_tsd; + for (j = 0; j < cfg->dcf_spare; j++) { + nvlist_t *ds = fnvlist_alloc(); + char path[64]; + + snprintf(path, sizeof (path), VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned)c->vdev_nparity, + (long unsigned)c->vdev_id, (long unsigned)j); + fnvlist_add_string(ds, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(ds, + ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(ds, + ZPOOL_CONFIG_ASHIFT, c->vdev_ashift); + + newspares[n] = ds; + n++; + } + } + + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, newspares, n); + for (i = 
0; i < n; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, sizeof (*newspares) * n); + return (0); +} + /* * Pool Creation */ @@ -5002,6 +5074,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, char *feat_name; char *poolname; nvlist_t *nvl; + int draid = 0; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) @@ -5106,17 +5179,21 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_add_draid_spare(nvroot, rvd)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ - for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { + for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_metaslab_set_size(vd); vdev_expand(vd, txg); + + if (vd->vdev_ops == &vdev_draid_ops) + draid++; } } @@ -5247,6 +5324,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < draid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -5736,6 +5816,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + int c, draid = 0; ASSERT(spa_writeable(spa)); @@ -5773,18 +5854,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or draid toplevels. 
*/ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + if (tvd->vdev_ops == &vdev_raidz_ops || + tvd->vdev_ops == &vdev_draid_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* @@ -5804,7 +5886,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. @@ -5820,6 +5902,20 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); + + if (tvd->vdev_ops == &vdev_draid_ops) + draid++; + } + + if (draid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + + dmu_tx_commit(tx); } if (nspares != 0) { @@ -5859,6 +5955,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (0); } +static int spa_rebuild_mirror = 0; /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. 
If the path specifies @@ -5882,11 +5979,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) char *oldvdpath, *newvdpath; int newvd_isspare; int error; + boolean_t rebuild = B_FALSE; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); + if (spa->spa_vdev_scan != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -5922,6 +6023,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); + /* + * dRAID spare can only replace a child drive of its parent + * dRAID vdev + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* * Spares can't replace logs */ @@ -6039,8 +6148,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -6056,12 +6165,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); + if (newvd->vdev_ops == &vdev_draid_spare_ops || + (tvd->vdev_ops == &vdev_mirror_ops && spa_rebuild_mirror != 0)) + rebuild = B_TRUE; /* HH: let zpool cmd choose */ + /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. 
*/ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) + spa_vdev_scan_start(spa, oldvd, 0, dtl_max_txg); + else + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6214,6 +6330,17 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; + /* + * If we are detaching a draid spare that is being rebuilt, we need to + * abort the rebuild thread. + */ + if (replace_done == 0 && + pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_ops == &vdev_draid_spare_ops && + spa->spa_vdev_scan != NULL && + spa->spa_vdev_scan->svs_vd->vdev_parent == pvd) + spa->spa_vdev_scan->svs_thread_exit = B_TRUE; + /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, @@ -6853,9 +6980,13 @@ spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) + if (func >= POOL_SCAN_FUNCS || + func == POOL_SCAN_NONE || func == POOL_SCAN_REBUILD) return (SET_ERROR(ENOTSUP)); + if (spa->spa_vdev_scan != NULL) + return (SET_ERROR(EBUSY)); + /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. 
@@ -8266,6 +8397,10 @@ module_param(spa_load_verify_data, int, 0644); MODULE_PARM_DESC(spa_load_verify_data, "Set to traverse data on pool import"); +module_param(spa_rebuild_mirror, int, 0644); +MODULE_PARM_DESC(spa_rebuild_mirror, + "Set to enable rebuild on mirror vdev"); + module_param(spa_load_print_vdev_tree, int, 0644); MODULE_PARM_DESC(spa_load_print_vdev_tree, "Print vdev tree to zfs_dbgmsg during pool import"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index dfe444368022..b6a76ccf850a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include #include #include @@ -179,6 +181,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -275,6 +279,16 @@ vdev_get_min_asize(vdev_t *vd) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); + if (pvd->vdev_ops == &vdev_draid_ops) { + struct vdev_draid_configuration *cfg = pvd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT3U(pvd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(pvd->vdev_children, ==, cfg->dcf_children); + return (pvd->vdev_min_asize / + (pvd->vdev_children - cfg->dcf_spare)); + } + return (pvd->vdev_min_asize); } @@ -496,6 +510,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vd->vdev_cfg = NULL; + vd->vdev_last_io = 0; vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); @@ -548,6 +564,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vdev_indirect_config_t *vic; char *tmp = NULL; int rc; + nvlist_t *draidcfg = NULL; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); @@ 
-604,7 +621,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; - if (ops == &vdev_raidz_ops) { + if (ops == &vdev_raidz_ops || ops == &vdev_draid_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -655,6 +672,21 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (ops == &vdev_draid_ops) { + if (nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_DRAIDCFG, &draidcfg) != 0) + return (SET_ERROR(EINVAL)); + if (!vdev_draid_config_validate(NULL, draidcfg)) + return (SET_ERROR(EINVAL)); + if (alloctype == VDEV_ALLOC_ADD && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + cmn_err(CE_WARN, "pool '%s' adding a dRAID " + "VDEV requires feature@draid", spa_name(spa)); + return (SET_ERROR(EINVAL)); + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; @@ -662,6 +694,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_nparity = nparity; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; + if (ops == &vdev_draid_ops) + vd->vdev_cfg = fnvlist_dup(draidcfg); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -912,6 +946,9 @@ vdev_free(vdev_t *vd) if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_cfg) + fnvlist_free(vd->vdev_cfg); + txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -967,6 +1004,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) int t; ASSERT(tvd == tvd->vdev_top); + ASSERT(svd->vdev_ops != &vdev_draid_ops); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; @@ -1412,6 +1450,9 @@ vdev_probe(vdev_t *vd, zio_t *zio) ASSERT(vd->vdev_ops->vdev_op_leaf); + if 
(vd->vdev_ops == &vdev_draid_spare_ops) + return (NULL); + /* * Don't probe the probe. */ @@ -1771,6 +1812,7 @@ vdev_open(vdev_t *vd) * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); @@ -3454,6 +3496,9 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3641,6 +3686,18 @@ vdev_is_dead(vdev_t *vd) vd->vdev_ops == &vdev_missing_ops); } +boolean_t +vdev_is_dead_at(vdev_t *vd, uint64_t zio_offset) +{ + if (vd->vdev_top == NULL || vd->vdev_top->vdev_ops != &vdev_draid_ops) + return (vdev_is_dead(vd)); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + zio_offset -= VDEV_LABEL_START_SIZE; + + return (vdev_draid_is_dead(vd, zio_offset)); +} + boolean_t vdev_readable(vdev_t *vd) { @@ -3918,7 +3975,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } @@ -3981,20 +4039,22 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { + if (type == ZIO_TYPE_READ && !vdev_is_dead_at(vd, zio->io_offset)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead_at(vd, zio->io_offset)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); + /* HH: todo proper rebuild IO error handling... 
*/ if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && + vd->vdev_ops != &vdev_draid_spare_ops && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCAN_THREAD) || + ((flags & ZIO_FLAG_SCAN_THREAD) && spa->spa_vdev_scan == NULL) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 000000000000..8476384d2cf0 --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,1688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "vdev_raidz.h" + + +int draid_debug_lvl = 1; + +static void +vdev_draid_debug_map(int lvl, raidz_map_t *rm) +{ + int c; + + for (c = 0; rm != NULL && c < rm->rm_scols; c++) { + char t = 'D'; + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = rm->rm_vdev->vdev_child[rc->rc_devidx]; + + if (c >= rm->rm_cols) { + t = 'S'; + } else if (c < rm->rm_firstdatacol) { + switch (c) { + case 0: + t = 'P'; + break; + case 1: + t = 'Q'; + break; + case 2: + t = 'R'; + break; + default: + ASSERT0(c); + } + } + + draid_dbg(lvl, + "%c: dev "U64FMT" (%s) off "U64FMT"K, sz "U64FMT"K, " + "err %d, skipped %d, tried %d\n", t, rc->rc_devidx, + cvd->vdev_path != NULL ? cvd->vdev_path : "NA", + rc->rc_offset >> 10, rc->rc_size >> 10, + rc->rc_error, rc->rc_skipped, rc->rc_tried); + } +} + +void +vdev_draid_debug_zio(zio_t *zio, boolean_t mirror) +{ + ASSERT0(mirror); + + draid_dbg(3, "%s zio: off "U64FMT"K sz "U64FMT"K data %p\n", + mirror ? 
"Mirror" : "dRAID", zio->io_offset >> 10, + zio->io_size >> 10, zio->io_abd); + + if (!mirror) + vdev_draid_debug_map(3, zio->io_vsd); +} + +/* A child vdev is divided into slices */ +static unsigned int slice_shift = 0; +#define DRAID_SLICESHIFT (SPA_MAXBLOCKSHIFT + slice_shift) +/* 2 ** slice_shift * SPA_MAXBLOCKSIZE */ +#define DRAID_SLICESIZE (1ULL << DRAID_SLICESHIFT) +#define DRAID_SLICEMASK (DRAID_SLICESIZE - 1) + +static int +vdev_draid_get_permutation(uint64_t *p, uint64_t nr, + const struct vdev_draid_configuration *cfg) +{ + uint64_t i; + uint64_t ncols = cfg->dcf_children; + uint64_t off = nr % (cfg->dcf_bases * ncols); + uint64_t base = off / ncols; + uint64_t dev = off % ncols; + + for (i = 0; i < ncols; i++) { + const uint64_t *base_perm = cfg->dcf_base_perms + + (base * ncols); + + p[i] = (base_perm[i] + dev) % ncols; + } + + return (0); +} + +noinline static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio, uint64_t **array) +{ + vdev_t *vd = zio->io_vd; + const struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t unit_shift = vd->vdev_top->vdev_ashift; + const uint64_t ndata = cfg->dcf_data; + const uint64_t nparity = cfg->dcf_parity; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. 
*/ + const uint64_t psize = zio->io_size >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + uint64_t o, q, r, c, bc, acols, scols, asize, tot; + uint64_t perm, perm_off, group, group_offset, group_left, abd_off; + raidz_map_t *rm; + uint64_t *permutation; + + ASSERT(!vdev_draid_ms_mirrored(vd, + zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + ndata), ==, nspare); + ASSERT0(b % (nparity + ndata)); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + /* HH: may not actually need the nspare columns for normal IO */ + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); +/* Locate the block: permutation index, group within it, sector offset within the group (all in sectors). */ + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / ((nparity + ndata) * slice); + group_offset = perm_off % ((nparity + ndata) * slice); + ASSERT0(group_offset % (nparity + ndata)); +/* group_left: data sectors remaining in this group from the block's start; the block must fit. */ + group_left = (slice - group_offset / (nparity + ndata)) * ndata; + ASSERT3U(psize, <=, group_left); + + /* The starting byte offset on each child vdev. */ + o = (perm * slice + group_offset / (nparity + ndata)) << unit_shift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = psize / ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = psize - q * ndata; + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = psize + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. 
*/ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. 
*/ + acols = bc; + } else { + acols = nparity + ndata; + } + scols = nparity + ndata; + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + rm->rm_vdev = vd; + + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); +/* Every column of the group starts at the same child offset o; the permutation maps the group's columns to child vdev indices. */ + for (c = 0, asize = 0; c < scols; c++) { + uint64_t i = group * (nparity + ndata) + c; + + ASSERT3U(i, <, ncols - nspare); + + rm->rm_col[c].rc_devidx = permutation[i]; + rm->rm_col[c].rc_offset = o; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (ndata + nparity) << unit_shift); + rm->rm_nskip = roundup(tot, ndata + nparity) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <, ndata); +/* A buffer for the skip sectors is only allocated for scrub/resilver I/O that actually has skip sectors. */ + if (rm->rm_nskip == 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) == 0) + rm->rm_abd_skip = NULL; + else + rm->rm_abd_skip = + abd_alloc_linear(rm->rm_nskip << unit_shift, B_TRUE); +/* Parity columns get freshly allocated buffers; the data columns below are carved out of the zio's own buffer at increasing offsets. 
*/ + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + + abd_off = 0; + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, abd_off, + rm->rm_col[c].rc_size); + abd_off += 
rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rm->rm_col[c].rc_size); + abd_off += rm->rm_col[c].rc_size; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + rm->rm_ops = vdev_raidz_math_get_ops(); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + return (rm); +} +/* + * Build a mirror map with dcf_parity + 1 children for a zio that falls in + * a mirrored metaslab and attach it to the zio as io_vsd. unit_shift is + * the sector shift used to convert the zio's offset and size. As with + * vdev_draid_map_alloc(), the permutation is returned through array when + * array is non-NULL (caller frees) and freed here otherwise. + */ +noinline static mirror_map_t * +vdev_draid_mirror_map_alloc(zio_t *zio, uint64_t unit_shift, + const struct vdev_draid_configuration *cfg, uint64_t **array) +{ + const uint64_t nparity = cfg->dcf_parity; + const uint64_t copies = nparity + 1; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + vdev_t *vd = zio->io_vd; + uint64_t o, c, perm, perm_off, group, group_offset; + mirror_map_t *mm; + uint64_t *permutation; + ASSERTV(const uint64_t psize = zio->io_size >> unit_shift); + + ASSERT(vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + cfg->dcf_data), ==, nspare); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / (copies * slice); + ASSERT3U(group, <, (ncols - nspare) / copies); + group_offset = perm_off % (copies * slice); + ASSERT0(group_offset % copies); + ASSERT3U(psize, <=, slice - group_offset / copies); + /* The starting byte offset on each child vdev. 
*/ + o = (perm * slice + group_offset / copies) << unit_shift; + + mm = vdev_mirror_map_alloc(copies, B_FALSE, B_FALSE); + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); +/* Each copy lives at the same child offset o on consecutive entries of the permutation. */ + for (c = 0; c < mm->mm_children; c++) { + int idx = group * copies + c; + mirror_child_t *mc = &mm->mm_child[c]; + + /* The remainder group is not usable for IO */ + ASSERT3U(idx, <, ((ncols - nspare) / copies) * copies); + + mc->mc_vd = vd->vdev_child[permutation[idx]]; + mc->mc_offset = o; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} +/* + * Debug-only sanity checks: vd is a dRAID vdev whose configuration has been + * loaded, agrees with the vdev's parity and child counts, and already has + * its zero buffer allocated. + */ +static inline void +vdev_draid_assert_vd(const vdev_t *vd) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT(cfg != NULL); + ASSERT3U(vd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(vd->vdev_children, ==, cfg->dcf_children); + ASSERT(cfg->dcf_zero_abd != NULL); +} +/* + * Return the size in bytes of one redundancy group: the group width + * (parity + data columns, or parity + 1 copies when mirrored) times the + * slice size. mirror is currently asserted to be B_FALSE. + */ +uint64_t +vdev_draid_get_groupsz(const vdev_t *vd, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies; + + ASSERT0(mirror); + + vdev_draid_assert_vd(vd); + + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + return (copies << DRAID_SLICESHIFT); +} +/* Bytes of the dRAID address space covered by one permutation: one slice on every non-spare child. */ +#define DRAID_PERM_ASIZE(vd) (((vd)->vdev_children - \ + ((struct vdev_draid_configuration *)(vd)->vdev_tsd)->dcf_spare) \ + << DRAID_SLICESHIFT) +/* + * Map a byte offset on the dRAID vdev to the index of the redundancy group + * that contains it. Groups are numbered consecutively across permutations; + * groups_per_perm is rounded up so that a trailing partial (remainder) + * group gets an index of its own. + */ +uint64_t +vdev_draid_offset2group(const vdev_t *vd, uint64_t offset, boolean_t mirror) +{ + uint64_t perm, perm_off, group, copies, groups_per_perm; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + + perm = offset / DRAID_PERM_ASIZE(vd); + perm_off = offset % DRAID_PERM_ASIZE(vd); + group = perm_off / vdev_draid_get_groupsz(vd, mirror); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + return (perm * groups_per_perm + group); +} +/* + * Inverse of vdev_draid_offset2group(): return the byte offset at which + * the given redundancy group starts. + */ +uint64_t +vdev_draid_group2offset(const vdev_t *vd, uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm, offset; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + offset = DRAID_PERM_ASIZE(vd) * (group / groups_per_perm); + offset += + vdev_draid_get_groupsz(vd, mirror) * (group % groups_per_perm); + return (offset); +} +/* + * Return B_TRUE if group is the trailing partial ("remainder") group of its + * permutation. Such a group only exists when the non-spare children do not + * divide evenly into groups of the given width. + */ +boolean_t +vdev_draid_is_remainder_group(const vdev_t *vd, + uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + if ((vd->vdev_children - cfg->dcf_spare) % copies == 0) + return (B_FALSE); + + /* Currently only mirror can have remainder group */ + ASSERT(mirror); + + /* The last group in each permutation is the remainder */ + if (group % groups_per_perm == groups_per_perm - 1) + return (B_TRUE); + else + return (B_FALSE); +} +/* + * Return the first offset at or after start that is aligned, relative to + * the beginning of its permutation, to one full stripe (group width << + * ashift). Offsets inside a remainder group are returned unchanged. + */ +uint64_t +vdev_draid_get_astart(const vdev_t *vd, const uint64_t start) +{ + uint64_t astart, perm_off, copies; + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + + if (vdev_draid_is_remainder_group(vd, group, mirror)) + return (start); + + perm_off = start % DRAID_PERM_ASIZE(vd); + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + astart = roundup(perm_off, copies << vd->vdev_ashift); + astart += start - perm_off; + + ASSERT3U(astart, >=, start); + return (astart); +} +/* + * Check whether a block of size bytes may be placed at start. Returns + * start itself when the block stays within a single, non-remainder group; + * otherwise returns the starting offset of the next usable group. The + * block must be smaller than a group and must not cross a metaslab + * boundary (both asserted). + */ +uint64_t +vdev_draid_check_block(const vdev_t *vd, uint64_t start, uint64_t size) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + uint64_t end = start + size - 1; + + ASSERT0(mirror); + ASSERT3U(size, <, vdev_draid_get_groupsz(vd, mirror)); + ASSERT3U(start >> vd->vdev_ms_shift, ==, end >> vd->vdev_ms_shift); + + /* + * A block is good if it: + * - does not cross group boundary, AND + * - does not use a remainder group + */ + if (group == vdev_draid_offset2group(vd, end, mirror) && + !vdev_draid_is_remainder_group(vd, group, mirror)) { + ASSERT3U(start, ==, vdev_draid_get_astart(vd, start)); + return (start); + } + + group++; + if (vdev_draid_is_remainder_group(vd, group, mirror)) + group++; + 
ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + return (vdev_draid_group2offset(vd, group, mirror)); +} +/* + * Report whether metaslab ms_id uses the hybrid-mirror layout. Always + * B_FALSE for now: the selection logic is compiled out with #if 0. + */ +boolean_t +vdev_draid_ms_mirrored(const vdev_t *vd, uint64_t ms_id) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); +#if 0 + /* HH: dedicate 1/20 ms for hybrid mirror */ + if ((ms_id % 20) == 19) + return (B_TRUE); + else +#endif + return (B_FALSE); +} + +static vdev_t *vdev_dspare_get_child(vdev_t *vd, uint64_t offset); + +/* + * dRAID spare does not fit into the DTL model. While it has child vdevs, + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Moreover, DTLs of a child vdev before the spare + * becomes active are invalid, because the spare blocks were not in use yet. + * + * Here we are essentially doing a vdev_dtl_reassess() on the fly, by replacing + * a dRAID spare with the child vdev under the offset. Note that it is a + * recursive process because the child vdev can be another dRAID spare, and so + * on. + * + * Returns B_TRUE if vd's own DTL_MISSING covers the txg range, or, after + * resolving a dRAID spare, if the resolved vdev's does. For a regular spare + * vdev the data is missing only when every readable child is missing it. + */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) +{ + int c; + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, offset, txg, size)) + return (B_FALSE); + } + + return (B_TRUE); +} +/* + * Offset-aware vdev_readable(): a dRAID spare is first resolved to the + * child backing offset, and a regular spare vdev counts as readable when + * any of its children is. + */ +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_readable(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_readable(vd->vdev_child[c], offset)) + 
return (B_TRUE); + + return (B_FALSE); +} +/* + * Offset-aware vdev_is_dead(): a dRAID spare is first resolved to the child + * backing offset, and a regular spare vdev is dead only when all of its + * children are. + */ +boolean_t +vdev_draid_is_dead(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_is_dead(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (!vdev_draid_is_dead(vd->vdev_child[c], offset)) + return (B_FALSE); + + return (B_TRUE); +} +/* + * Return B_TRUE if a vdev with the given guid exists in the subtree rooted + * at vd, following dRAID spares to the child that backs offset. + */ +static boolean_t +vdev_draid_guid_exists(vdev_t *vd, uint64_t guid, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_guid == guid) + return (B_TRUE); + + if (vd->vdev_ops->vdev_op_leaf) + return (B_FALSE); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_guid_exists(vd->vdev_child[c], guid, offset)) + return (B_TRUE); + + return (B_FALSE); +} +/* + * Decide whether child vd counts as degraded. For a resilver (oldvd is + * NULL) that is a non-empty DTL_PARTIAL; for a rebuild it means oldvd is + * found under vd at offset. + */ +static boolean_t +vdev_draid_vd_degraded(vdev_t *vd, const vdev_t *oldvd, uint64_t offset) +{ + if (oldvd == NULL) /* Resilver */ + return (!vdev_dtl_empty(vd, DTL_PARTIAL)); + + /* Rebuild */ + ASSERT(oldvd->vdev_ops->vdev_op_leaf); + ASSERT(oldvd->vdev_ops != &vdev_draid_spare_ops); + + return (vdev_draid_guid_exists(vd, oldvd->vdev_guid, offset)); +} +/* + * Report whether the redundancy group containing offset is degraded. A + * throw-away zio is mapped at offset to discover the group's child vdevs, + * and each child is tested with vdev_draid_vd_degraded(). size is only + * used in the debug message. offset must not lie in a remainder group. + */ +boolean_t +vdev_draid_group_degraded(vdev_t *vd, vdev_t *oldvd, + uint64_t offset, uint64_t size, boolean_t mirror) +{ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t group = vdev_draid_offset2group(vd, offset, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + boolean_t degraded = B_FALSE; + zio_t *zio; + int c, dummy_data; + uint64_t *perm; + char buf[128]; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + + zio = kmem_alloc(sizeof (*zio), KM_SLEEP); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_size = MAX(SPA_MINBLOCKSIZE, 1ULL << ashift); + /* + * The zio is a bare scratch object from kmem_alloc(), and + * vdev_draid_map_alloc() tests io_flags to decide whether to + * allocate a skip buffer, so give the field a defined value. + */ + zio->io_flags = 0; + zio->io_abd = abd_get_from_buf(&dummy_data, zio->io_size); + + buf[0] = '\0'; + if (mirror) { + mirror_map_t *mm = + 
vdev_draid_mirror_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(mm->mm_children, ==, cfg->dcf_parity + 1); + + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + char *status = ""; + + if (vdev_draid_vd_degraded(mc->mc_vd, + oldvd, mc->mc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", mc->mc_vd->vdev_id, status); + } + } else { + raidz_map_t *rm = vdev_draid_map_alloc(zio, &perm); + + ASSERT3U(rm->rm_scols, ==, cfg->dcf_parity + cfg->dcf_data); + + for (c = 0; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + char *status = ""; + + if (vdev_draid_vd_degraded(cvd, oldvd, rc->rc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", cvd->vdev_id, status); + } + } + + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), "spares: "); + for (c = 0; c < cfg->dcf_spare; c++) + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT" ", perm[cfg->dcf_children - 1 - c]); + draid_dbg(4, "%s %s at "U64FMT"K of "U64FMT"K: %s\n", + degraded ? "Degraded" : "Healthy", + mirror ? 
"mirror" : "draid", + offset >> 10, size >> 10, buf); + + kmem_free(perm, sizeof (perm[0]) * cfg->dcf_children); + (*zio->io_vsd_ops->vsd_free)(zio); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (*zio)); + return (degraded); +} +/* + * Validate a dRAID configuration nvlist. The children (n), parity (p), + * data (d), spare (s) and base (b) counts and the permutation array must + * all be present and non-zero, n - s must be a multiple of p + d, and the + * array must hold exactly b permutations of the values 0 .. n - 1. When + * vd is non-NULL, n and p must also match the vdev. Returns B_TRUE if the + * configuration is acceptable. + */ +boolean_t +vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) +{ + int i; + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b; + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + return (B_FALSE); + } + + if (n - 1 > VDEV_DRAID_U8_MAX) { + draid_dbg(0, "%s configuration too large: "U64FMT"\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); + return (B_FALSE); + } + if (vd != NULL && n != vd->vdev_children) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PARITY); + return (B_FALSE); + } + + if (vd != NULL && p != vd->vdev_nparity) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_DATA); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_SPARE); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_BASE); + return (B_FALSE); + } + + if (n == 0 || d == 0 || p == 0 || s == 0 || b == 0) { + draid_dbg(0, "Zero n/d/p/s/b\n"); + return (B_FALSE); + } + + if (p > VDEV_RAIDZ_MAXPARITY) { + draid_dbg(0, "Invalid parity "U64FMT"\n", p); + return (B_FALSE); + } + + /* + * s and d come straight from the configuration. Bound them before + * the unsigned arithmetic below so that n - s cannot wrap and p + d + * can neither wrap nor become zero. Any valid layout has + * p + d <= n - s with p >= 1, hence d < n - s. + */ + if (s >= n || d >= n - s) { + draid_dbg(0, "Invalid spare "U64FMT" or data "U64FMT"\n", + s, d); + return (B_FALSE); + } + + if ((n - s) % (p + d) != 0) { + draid_dbg(0, U64FMT" mod "U64FMT" is not 0\n", n - s, p + d); + return (B_FALSE); + } + + if (nvlist_lookup_uint8_array(config, + 
ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PERM); + return (B_FALSE); + } + + /* + * Compare by division rather than as c != b * n: an oversized b could + * wrap the product into matching c, and the loop below would then + * read past the end of the array. n is known to be non-zero here. + */ + if (c % n != 0 || c / n != b) { + draid_dbg(0, + "Permutation array has %u items, but "U64FMT" expected\n", + c, b * n); + return (B_FALSE); + } + + for (i = 0; i < b; i++) { + int j, k; + for (j = 0; j < n; j++) { + uint64_t val = perm[i * n + j]; + + if (val >= n) { + draid_dbg(0, + "Invalid value "U64FMT" in " + "permutation %d\n", val, i); + return (B_FALSE); + } + + for (k = 0; k < j; k++) { + if (val == perm[i * n + k]) { + draid_dbg(0, + "Duplicated value "U64FMT" in " + "permutation %d\n", + val, i); + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} +/* + * Attach draidcfg to the top-level vdev config nvlist top under + * ZPOOL_CONFIG_DRAIDCFG. Returns B_FALSE, leaving top unmodified, if + * draidcfg is NULL, top is not of type dRAID, or the parity or children + * counts of the two nvlists disagree. + */ +boolean_t +vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) +{ + char *type; + uint64_t parity; + nvlist_t **children = NULL; + uint_t c = 0; + + if (draidcfg == NULL) + return (B_FALSE); + + type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_DRAID) != 0) + return (B_FALSE); + + parity = fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); + if (parity != fnvlist_lookup_uint64(draidcfg, + ZPOOL_CONFIG_DRAIDCFG_PARITY)) + return (B_FALSE); + + VERIFY0(nvlist_lookup_nvlist_array(top, + ZPOOL_CONFIG_CHILDREN, &children, &c)); + if (c != + fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) + return (B_FALSE); + + /* HH: todo: check permutation array csum */ + fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); + return (B_TRUE); +} + +/* Unfortunately this requires GPL-only symbols */ +#ifdef ZFS_IS_GPL_COMPATIBLE +#define __DRAID_HARDENING +#else +#undef __DRAID_HARDENING +#endif +/* + * Switch the protection of the pages in [start, start + sz) to read-only + * or read-write. This only has an effect in kernel builds with + * __DRAID_HARDENING; a buffer that is not page aligned is logged and left + * untouched. + */ +static void +vdev_draid_setup_page(const void *start, size_t sz, boolean_t readonly) +{ +#ifdef __DRAID_HARDENING + ASSERT(sz != 0); + + if (!IS_P2ALIGNED(sz, PAGESIZE) || !IS_P2ALIGNED(start, PAGESIZE)) { + draid_dbg(1, "Buffer not page aligned %p %lu\n", start, sz); + return; + } + +#ifdef _KERNEL + if (readonly) + 
set_memory_ro((unsigned long)start, sz >> PAGE_SHIFT); + else + set_memory_rw((unsigned long)start, sz >> PAGE_SHIFT); +#endif +#endif +} +/* Convenience wrapper: make [start, start + sz) read-only via vdev_draid_setup_page(). */ +static inline void +vdev_draid_set_mem_ro(const void *start, size_t sz) +{ + vdev_draid_setup_page(start, sz, B_TRUE); +} +/* Convenience wrapper: make [start, start + sz) writable again via vdev_draid_setup_page(). */ +static inline void +vdev_draid_set_mem_rw(const void *start, size_t sz) +{ + vdev_draid_setup_page(start, sz, B_FALSE); +} +/* + * Expand the uint8_t permutation array from the configuration into a newly + * allocated uint64_t array of dcf_bases * dcf_children entries and mark it + * read-only. With __DRAID_HARDENING the allocation size is rounded up to + * whole pages; vdev_draid_close() recomputes the same size to free it. + */ +static uint64_t * +vdev_draid_create_base_perms(const uint8_t *perms, + const struct vdev_draid_configuration *cfg) +{ + int i, j; + uint64_t children = cfg->dcf_children, *base_perms; + size_t sz = sizeof (uint64_t) * cfg->dcf_bases * children; + +#ifdef __DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + base_perms = kmem_alloc(sz, KM_SLEEP); + for (i = 0; i < cfg->dcf_bases; i++) + for (j = 0; j < children; j++) + base_perms[i * children + j] = perms[i * children + j]; + + vdev_draid_set_mem_ro(base_perms, sz); + return (base_perms); +} +/* + * Validate vd->vdev_cfg and build the in-core dRAID configuration from it. + * Returns NULL if validation fails. dcf_zero_abd is left NULL here; + * vdev_draid_open() allocates it. + */ +static struct vdev_draid_configuration * +vdev_draid_config_create(vdev_t *vd) +{ + uint_t c; + uint8_t *perms = NULL; + nvlist_t *nvl = vd->vdev_cfg; + struct vdev_draid_configuration *cfg; + + ASSERT(nvl != NULL); + + if (!vdev_draid_config_validate(vd, nvl)) + return (NULL); + + cfg = kmem_alloc(sizeof (*cfg), KM_SLEEP); + cfg->dcf_children = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + cfg->dcf_data = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA); + cfg->dcf_parity = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_PARITY); + cfg->dcf_spare = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_SPARE); + cfg->dcf_bases = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE); + + VERIFY0(nvlist_lookup_uint8_array(nvl, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perms, &c)); + + cfg->dcf_base_perms = vdev_draid_create_base_perms(perms, cfg); + cfg->dcf_zero_abd = NULL; + return (cfg); +} +/* + * Open a dRAID vdev. On first open the configuration is created from + * vd->vdev_cfg, then all children are opened. The reported asize and + * max_asize are the smallest child values multiplied by the number of + * non-spare children, and ashift is the largest child ashift. Returns + * EINVAL for a bad parity or children count or an invalid configuration, + * or the last child error when more than nparity children failed to open. + */ +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) +{ + vdev_t *cvd; + struct 
vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* vd->vdev_tsd must be set before vdev_open_children(vd) */ + if (cfg == NULL) { + cfg = vdev_draid_config_create(vd); + if (cfg == NULL) + return (SET_ERROR(EINVAL)); + vd->vdev_tsd = cfg; + } else { + ASSERT(vd->vdev_reopening); + } + + vdev_open_children(vd); +/* Take the minimum over the children that opened. The "- 1 ... + 1" form lets an initial value of 0 wrap to the maximum, so 0 acts as "not set yet". */ + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } +/* One sector of zeros (page-rounded when hardening is on), shared by all skip-sector writes and made read-only. */ + if (cfg->dcf_zero_abd == NULL) { + abd_t *zabd; + size_t sz = 1ULL << MAX(*ashift, vd->vdev_ashift); + +#ifdef __DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + zabd = abd_alloc_linear(sz, B_TRUE); + abd_zero(zabd, sz); + vdev_draid_set_mem_ro(abd_to_buf(zabd), sz); + cfg->dcf_zero_abd = zabd; + } + + /* HH: asize becomes tricky with hybrid mirror */ + *asize *= vd->vdev_children - cfg->dcf_spare; + *max_asize *= vd->vdev_children - cfg->dcf_spare; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} +/* + * Close all children and, unless the vdev is only being reopened, release + * the dRAID configuration: the zero buffer and the base permutations are + * made writable again before being freed, and vdev_tsd is cleared. + */ +static void +vdev_draid_close(vdev_t *vd) +{ + int c; + size_t sz; + abd_t *zabd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); + + if (vd->vdev_reopening || cfg == NULL) + return; + + zabd = cfg->dcf_zero_abd; + ASSERT(zabd != NULL); + vdev_draid_set_mem_rw(abd_to_buf(zabd), zabd->abd_size); + abd_free(zabd); + + sz = sizeof (uint64_t) * cfg->dcf_bases * cfg->dcf_children; +#ifdef 
__DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + vdev_draid_set_mem_rw(cfg->dcf_base_perms, sz); + kmem_free((void *)cfg->dcf_base_perms, sz); + + kmem_free(cfg, sizeof (*cfg)); + vd->vdev_tsd = NULL; +} +/* + * Convert a physical size into the allocated size in bytes. psize is + * rounded up to whole sectors; for dRAID it is then padded to a multiple of + * dcf_data sectors and nparity parity sectors are added per row, while for + * mirror it is multiplied by nparity + 1. mirror is currently asserted to + * be B_FALSE. + */ +uint64_t +vdev_draid_asize_by_type(const vdev_t *vd, uint64_t psize, boolean_t mirror) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t nparity = vd->vdev_nparity; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT0(mirror); + vdev_draid_assert_vd(vd); + + asize = ((psize - 1) >> ashift) + 1; + + if (mirror) { + asize *= 1 + nparity; + } else { /* draid */ + ASSERT3U(cfg->dcf_data, !=, 0); + asize = roundup(asize, cfg->dcf_data); + asize += nparity * (asize / cfg->dcf_data); + ASSERT0(asize % (nparity + cfg->dcf_data)); + } + + ASSERT(asize != 0); + return (asize << ashift); +} +/* + * Allocated size for psize, as installed in vdev_draid_ops. The dRAID + * (non-mirror) layout is always used; the size-based mirror selection is + * compiled out with #if 0. + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ +#if 0 + uint64_t sector = ((psize - 1) >> vd->vdev_top->vdev_ashift) + 1; + + return (vdev_draid_asize_by_type(vd, psize, sector == 1)); +#else + return (vdev_draid_asize_by_type(vd, psize, B_FALSE)); +#endif +} +/* + * Return the data portion, in bytes, of an allocation of asize bytes at + * offset: asize divided by the group width and multiplied by the number of + * data columns (or divided by the number of copies when mirrored). asize + * and offset must be sector aligned. A result above SPA_MAXBLOCKSIZE is + * logged and trips an assertion. + */ +uint64_t +vdev_draid_asize2psize(vdev_t *vd, uint64_t asize, uint64_t offset) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t msid = offset >> vd->vdev_ms_shift; + boolean_t mirror = vdev_draid_ms_mirrored(vd, msid); + uint64_t psize; + + ASSERT0(mirror); + ASSERT0(P2PHASE(asize, 1ULL << ashift)); + ASSERT0(P2PHASE(offset, 1ULL << ashift)); + + if (mirror) { + ASSERT0((asize >> ashift) % (1 + vd->vdev_nparity)); + psize = asize / (1 + vd->vdev_nparity); + } else { + ASSERT0((asize >> ashift) % (cfg->dcf_data + vd->vdev_nparity)); + psize = (asize / (cfg->dcf_data + vd->vdev_nparity)) + * cfg->dcf_data; + } + + if (psize > SPA_MAXBLOCKSIZE) { + draid_dbg(0, "Psize "U64FMT" too big at offset "U64FMT" from " + "asize "U64FMT", ashift "U64FMT", %s MS "U64FMT"\n", + psize, offset, asize, 
ashift, + mirror ? "mirrored" : "draid", msid); + } + ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE); + + return (psize); +} +/* + * Return the allocated size of the largest block that can be placed at + * offset: SPA_MAXBLOCKSIZE for a mirrored metaslab, or SPA_MAXBLOCKSIZE + * trimmed down to a whole number of data-column rows for dRAID. + */ +uint64_t +vdev_draid_max_rebuildable_asize(vdev_t *vd, uint64_t offset) +{ + uint64_t maxpsize = SPA_MAXBLOCKSIZE; + uint64_t ashift = vd->vdev_top->vdev_ashift; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + if (vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift)) + return (vdev_draid_asize_by_type(vd, maxpsize, B_TRUE)); + + /* + * When SPA_MAXBLOCKSIZE>>ashift does not divide evenly by the number + * of data drives, the remainder must be discarded. Otherwise the skip + * sectors will cause vdev_draid_asize2psize() to get a psize larger + * than SPA_MAXBLOCKSIZE + */ + maxpsize >>= ashift; + maxpsize /= cfg->dcf_data; + maxpsize *= cfg->dcf_data; + maxpsize <<= ashift; + return (vdev_draid_asize_by_type(vd, maxpsize, B_FALSE)); +} +/* + * A block needs resilvering when the redundancy group it lives in is + * degraded, as reported by vdev_draid_group_degraded() with no old vdev. + */ +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift); + + ASSERT0(mirror); + + /* A block cannot cross redundancy group boundary */ + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, vdev_draid_asize(vd, psize))); + + return (vdev_draid_group_degraded(vd, NULL, offset, psize, mirror)); +} +/* Completion callback for skip-sector child I/Os; currently does nothing. */ +static void +vdev_draid_skip_io_done(zio_t *zio) +{ + /* + * HH: handle skip IO error + * raidz_col_t *rc = zio->io_private; + */ +} + +/* + * Start an IO operation on a dRAID VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. 
If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); +/* Mirrored metaslab: build a mirror map and hand the zio to the mirror implementation. */ + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) { + (void) vdev_draid_mirror_map_alloc(zio, ashift, cfg, NULL); + + ASSERT(zio->io_vsd != NULL); + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } +/* Declustered raidz path: the map is attached to the zio as io_vsd by vdev_draid_map_alloc(). */ + rm = vdev_draid_map_alloc(zio, NULL); + ASSERT3U(rm->rm_asize, ==, + vdev_draid_asize_by_type(vd, zio->io_size, B_FALSE)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Unlike raidz, it's mandatory to fill skip sectors with zero. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + 1ULL << ashift, zio->io_type, zio->io_priority, + 0, vdev_draid_skip_io_done, rc)); + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * Sequential rebuild must do IO at redundancy group boundary, i.e. 
+ * rm->rm_nskip must be 0 + */ + ASSERT((zio->io_flags & ZIO_FLAG_RESILVER) == 0 || + !DSL_SCAN_IS_REBUILD(zio->io_spa->spa_dsl_pool->dp_scan) || + rm->rm_nskip == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + /* + * Check skip sectors for scrub/resilver. For sequential rebuild, + * this is a no-op because rm->rm_nskip is always zero. 
+ */ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + abd_t *abd; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, + rc->rc_offset + rc->rc_size)) { + rc->rc_abd_skip = NULL; + continue; + } + + abd = abd_get_offset_size(rm->rm_abd_skip, + i << ashift, 1ULL << ashift); +/* Seed the buffer with a non-zero word so that a sector which is never read back compares as non-zero in vdev_draid_fix_skip_sectors(). */ *((int *)abd_to_buf(abd)) = 1; + rc->rc_abd_skip = abd; + + /* Skip sector to be written in vdev_draid_io_done() */ + if (vdev_draid_missing(cvd, + rc->rc_offset + rc->rc_size, zio->io_txg, 1)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, abd, + 1ULL << ashift, ZIO_TYPE_READ, + zio->io_priority, 0, vdev_draid_skip_io_done, rc)); + } + } + + zio_execute(zio); +} +/* + * Temporarily expose the skipped columns (rm_cols .. rm_scols - 1) as + * zero-filled columns of the same size as column 0, backed by the shared + * zero buffer, and widen rm_cols to rm_scols. Returns the previous rm_cols + * so that vdev_draid_restore_skip_sectors() can undo the change. + */ +int +vdev_draid_hide_skip_sectors(raidz_map_t *rm) +{ + int c, cols; + size_t size = rm->rm_col[0].rc_size; + vdev_t *vd = rm->rm_vdev; + struct vdev_draid_configuration *cfg; + + ASSERT(vdev_raidz_map_declustered(rm)); + + cfg = vd->vdev_tsd; + + for (c = rm->rm_cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_size); + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd == NULL); + ASSERT3U(cfg->dcf_zero_abd->abd_size, >=, size); + + rc->rc_size = size; + rc->rc_abd = cfg->dcf_zero_abd; + } + + cols = rm->rm_cols; + rm->rm_cols = rm->rm_scols; + return (cols); +} +/* + * Undo vdev_draid_hide_skip_sectors(): the columns from cols upward become + * empty again (size 0, no buffer) and rm_cols is reset to cols, the value + * that vdev_draid_hide_skip_sectors() returned. + */ +void +vdev_draid_restore_skip_sectors(raidz_map_t *rm, int cols) +{ + int c; + + ASSERT3U(cols, >, rm->rm_firstdatacol); + ASSERT3U(cols, <=, rm->rm_scols); + ASSERT(vdev_raidz_map_declustered(rm)); + + for (c = cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd != NULL); + + rc->rc_size = 0; + rc->rc_abd = NULL; + } + + rm->rm_cols = 
cols; +} +/* + * Verify the skip sectors that vdev_draid_io_start() set up for a + * scrub/resilver read. Each per-column skip buffer is compared against the + * shared zero buffer and released; if it is not all zeros and the pool is + * writeable, a repair write of zeros is issued for that sector. Does + * nothing when the map has no skip buffer. + */ +void +vdev_draid_fix_skip_sectors(zio_t *zio) +{ + int c, i; + char *zero; + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t size = 1ULL << vd->vdev_top->vdev_ashift; + + vdev_draid_assert_vd(vd); + ASSERT3P(rm->rm_vdev, ==, vd); + + if (rm->rm_abd_skip == NULL) + return; + + zero = abd_to_buf(cfg->dcf_zero_abd); + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + char *skip; + boolean_t good_skip; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + if (rc->rc_abd_skip == NULL) + continue; + + skip = abd_to_buf(rc->rc_abd_skip); + good_skip = (memcmp(skip, zero, size) == 0); + abd_put(rc->rc_abd_skip); + rc->rc_abd_skip = NULL; + + if (good_skip || !spa_writeable(zio->io_spa)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } +} +/* Hand I/O completion to the mirror or raidz implementation, depending on the layout of the metaslab the zio falls in. */ +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) + vdev_mirror_ops.vdev_op_io_done(zio); /* hybrid mirror */ + else + vdev_raidz_ops.vdev_op_io_done(zio); /* declustered raidz */ +} +/* + * Derive the vdev state from its children: more faulted children than + * parity means it cannot be opened, any faulted or degraded child makes it + * degraded, and otherwise it is healthy. + */ +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} +/* Operations vector for the dRAID vdev type. */ +vdev_ops_t vdev_draid_ops = { + vdev_draid_open, + vdev_draid_close, + vdev_draid_asize, + vdev_draid_io_start, + vdev_draid_io_done, + vdev_draid_state_change, + vdev_draid_need_resilver, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID, /* name of this vdev 
type */ + B_FALSE /* not a leaf vdev */ +}; + +#include + +typedef struct { + vdev_t *dsp_draid; + uint64_t dsp_id; +} vdev_dspare_t; + +static vdev_t * +vdev_dspare_get_child(vdev_t *vd, uint64_t offset) +{ + vdev_t *draid; + uint64_t *permutation, spareidx; + vdev_dspare_t *dspare = vd->vdev_tsd; + struct vdev_draid_configuration *cfg; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT3U(offset, <, + vd->vdev_psize - VDEV_LABEL_START_SIZE - VDEV_LABEL_END_SIZE); + ASSERT(dspare != NULL); + draid = dspare->dsp_draid; + vdev_draid_assert_vd(draid); + cfg = draid->vdev_tsd; + ASSERT3U(dspare->dsp_id, <, cfg->dcf_spare); + + permutation = kmem_alloc(sizeof (permutation[0]) * draid->vdev_children, + KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, + offset >> DRAID_SLICESHIFT, cfg)); + spareidx = permutation[draid->vdev_children - 1 - dspare->dsp_id]; + ASSERT3U(spareidx, <, draid->vdev_children); + kmem_free(permutation, sizeof (permutation[0]) * draid->vdev_children); + + return (draid->vdev_child[spareidx]); +} + +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT(dspare != NULL); + ASSERT(dspare->dsp_draid != NULL); + + return (dspare->dsp_draid); +} + +nvlist_t * +vdev_draid_spare_read_config(vdev_t *vd) +{ + int i; + uint64_t guid; + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + nvlist_t *nv = fnvlist_alloc(); + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, 
vd->vdev_top->vdev_guid); + + if (vd->vdev_isspare) + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_ACTIVE); + else + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE); + + for (i = 0, guid = vd->vdev_guid; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + /* HH: ZPOOL_CONFIG_UNSPARE and ZPOOL_CONFIG_RESILVER_TXG? */ + return (nv); +} + +static int +vdev_dspare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + unsigned long draid_id, nparity, spare_id; /* sscanf() targets */ + uint64_t asize, max_asize; + vdev_t *draid; + vdev_dspare_t *dspare; + struct vdev_draid_configuration *cfg; + + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dspare = vd->vdev_tsd; + draid = dspare->dsp_draid; + cfg = draid->vdev_tsd; + goto skip_open; + } + + if (sscanf(vd->vdev_path, VDEV_DRAID_SPARE_PATH_FMT, + &nparity, &draid_id, + &spare_id) != 3) + return (SET_ERROR(EINVAL)); + + if (draid_id >= vd->vdev_spa->spa_root_vdev->vdev_children) + return (SET_ERROR(EINVAL)); + + draid = vd->vdev_spa->spa_root_vdev->vdev_child[draid_id]; + if (draid->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(EINVAL)); + if (draid->vdev_nparity != nparity) + return (SET_ERROR(EINVAL)); + + cfg = draid->vdev_tsd; + ASSERT(cfg != NULL); + if (nparity != cfg->dcf_parity || spare_id >= cfg->dcf_spare) + return (SET_ERROR(EINVAL)); + + dspare = kmem_alloc(sizeof (*dspare), KM_SLEEP); + dspare->dsp_draid = draid; + dspare->dsp_id = spare_id; + vd->vdev_tsd = dspare; + +skip_open: + asize = draid->vdev_asize / (draid->vdev_children - cfg->dcf_spare); + max_asize = draid->vdev_max_asize / + (draid->vdev_children - cfg->dcf_spare); + + *ashift = draid->vdev_ashift; + *psize = asize + (VDEV_LABEL_START_SIZE
+ VDEV_LABEL_END_SIZE); + *max_psize = max_asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + return (0); +} + +static void +vdev_dspare_close(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + if (vd->vdev_reopening || dspare == NULL) + return; + + vd->vdev_tsd = NULL; + kmem_free(dspare, sizeof (*dspare)); +} + +static uint64_t +vdev_dspare_asize(vdev_t *vd, uint64_t psize) +{ + /* HH: this function should never get called */ + ASSERT0(psize); + return (0); +} + +static void +vdev_dspare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + pio->io_error = zio->io_error; +} + +static void +vdev_dspare_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + uint64_t offset = zio->io_offset; + + /* HH: if dspare gets a FLUSH, so do all children of the draid vdev */ + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + /* + * HH: at pool creation, dspare gets some writes with + * ZIO_FLAG_SPECULATIVE and ZIO_FLAG_NODATA. + * Need to understand and handle them right. + */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) { + ASSERT(zio->io_flags & ZIO_FLAG_PHYSICAL); + + /* + * HH: dspare should not get any label IO as it is pretending + * to be a leaf disk. Later we should catch and fix all places + * that still do label IO to dspare. + */ + zio->io_error = SET_ERROR(ENODATA); + zio_interrupt(zio); + return; + } + + offset -= VDEV_LABEL_START_SIZE; /* See zio_vdev_child_io() */ + cvd = vdev_dspare_get_child(vd, offset); + if (zio->io_type == ZIO_TYPE_READ && !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + /* + * Parent vdev should have avoided reading from me in the first + * place, unless this is a mirror scrub.
+ */ + draid_dbg(1, "Read from dead spare %s:%s:%s at "U64FMT"\n", + vd->vdev_path, + cvd->vdev_ops->vdev_op_type, + cvd->vdev_path != NULL ? cvd->vdev_path : "NA", + offset); + return; + } + + /* dspare IO does not cross slice boundary */ + ASSERT3U(offset >> DRAID_SLICESHIFT, ==, + (offset + zio->io_size - 1) >> DRAID_SLICESHIFT); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_dspare_child_done, zio)); + zio_execute(zio); +} + +static void +vdev_dspare_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_draid_spare_ops = { + vdev_dspare_open, + vdev_dspare_close, + vdev_dspare_asize, + vdev_dspare_io_start, + vdev_dspare_io_done, + NULL, + NULL, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID_SPARE, + B_TRUE +}; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(draid_debug_lvl, int, 0644); +MODULE_PARM_DESC(draid_debug_lvl, "dRAID debugging verbose level"); +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 439ab7438d90..44514deeba44 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -143,6 +143,7 @@ #include #include #include +#include #include #include #include @@ -420,8 +421,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + ASSERT(vd->vdev_ops == &vdev_raidz_ops || + vd->vdev_ops == &vdev_draid_ops); /* * Make sure someone hasn't managed to sneak a fancy new vdev @@ -441,6 +442,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } + if (vd->vdev_cfg != NULL) { + ASSERT(vd->vdev_ops == &vdev_draid_ops); + ASSERT(vdev_draid_config_validate(vd, vd->vdev_cfg)); + + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_DRAIDCFG, vd->vdev_cfg); + } + if (vd->vdev_wholedisk != -1ULL) fnvlist_add_uint64(nv, 
ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); @@ -736,6 +744,9 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_read_config(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -1009,6 +1020,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + if (vd->vdev_ops == &vdev_draid_spare_ops) { + error = 0; + goto skip; + } + /* * Initialize its label. */ @@ -1130,6 +1146,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) abd_free(ub_abd); abd_free(vp_abd); +skip: /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it @@ -1217,7 +1234,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1364,6 +1382,13 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vd->vdev_ops->vdev_op_leaf) return; + /* + * No need to sync ub on dspare - if dspare gets a ub sync, so + * do the parent draid vdev and all its children. 
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; @@ -1480,6 +1505,9 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, if (!vd->vdev_ops->vdev_op_leaf) return; + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 65357d841805..5fdc5a387b52 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -93,29 +94,6 @@ vdev_mirror_stat_fini(void) } } -/* - * Virtual device vector for mirroring. - */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_replacing; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - static int vdev_mirror_shift = 21; /* @@ -144,7 +122,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) { mirror_map_t *mm; @@ -167,7 +145,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, .vsd_cksum_report = zio_vsd_default_cksum_report }; @@ -427,6 +405,28 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t 
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read * prefering vdevs based on determined load. @@ -452,14 +452,15 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; @@ -510,7 +511,12 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); + if (zio->io_vsd != NULL) { /* dRAID hybrid mirror */ + ASSERT3P(zio->io_vd->vdev_ops, ==, &vdev_draid_ops); + mm = zio->io_vsd; + } else { + mm = vdev_mirror_map_init(zio); + } if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a21baf9c264b..06636c61f2a8 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include #include #include +#include /* * Virtual device vector for RAID-Z. 
@@ -145,6 +146,11 @@ vdev_raidz_map_free(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) abd_put(rm->rm_col[c].rc_abd); + if (rm->rm_abd_skip != NULL) { + ASSERT(vdev_raidz_map_declustered(rm)); + abd_free(rm->rm_abd_skip); + } + if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); @@ -317,7 +323,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) ASSERT3U(offset, ==, size); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { +const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, .vsd_cksum_report = vdev_raidz_cksum_report }; @@ -392,6 +398,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; + rm->rm_abd_skip = NULL; + rm->rm_vdev = NULL; asize = 0; @@ -669,23 +677,30 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) void vdev_raidz_generate_parity(raidz_map_t *rm) { - /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) - return; + int cols = 0; - switch (rm->rm_firstdatacol) { - case 1: - vdev_raidz_generate_parity_p(rm); - break; - case 2: - vdev_raidz_generate_parity_pq(rm); - break; - case 3: - vdev_raidz_generate_parity_pqr(rm); - break; - default: - cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + if (vdev_raidz_map_declustered(rm) && rm->rm_firstdatacol > 1) + cols = vdev_draid_hide_skip_sectors(rm); + + /* Generate using the new math implementation */ + if (vdev_raidz_math_generate(rm) == RAIDZ_ORIGINAL_IMPL) { + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } } + + if (cols != 0) + vdev_draid_restore_skip_sectors(rm, cols); } /* ARGSUSED */ @@ -1471,8 +1486,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { 
int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c, ret; - int code; + int i, c, code; + int cols = 0; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1507,25 +1522,32 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); + if (vdev_raidz_map_declustered(rm)) + cols = vdev_draid_hide_skip_sectors(rm); + dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); - if (ret != RAIDZ_ORIGINAL_IMPL) - return (ret); + code = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (code != RAIDZ_ORIGINAL_IMPL) + goto out; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_P]) { + code = vdev_raidz_reconstruct_p(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_q(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); break; @@ -1534,8 +1556,10 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_pq(rm, dt, 2); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); @@ -1545,6 +1569,9 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); +out: + if (cols != 0) + vdev_draid_restore_skip_sectors(rm, cols); return (code); } @@ -1617,7 +1644,7 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void 
+void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1820,6 +1847,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_free(orig[c]); } + if (ret != 0 && vdev_raidz_map_declustered(rm)) + vdev_draid_debug_zio(zio, B_FALSE); return (ret); } @@ -2271,6 +2300,9 @@ vdev_raidz_io_done(zio_t *zio) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + if (vdev_raidz_map_declustered(rm)) + vdev_draid_fix_skip_sectors(zio); } static void @@ -2288,7 +2320,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe + * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t diff --git a/module/zfs/vdev_raidz.h b/module/zfs/vdev_raidz.h new file mode 100644 index 000000000000..ab3c5b81dc64 --- /dev/null +++ b/module/zfs/vdev_raidz.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 Intel Corporation. 
+ */ + +#include +#include +#include + +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + +extern void vdev_raidz_generate_parity(raidz_map_t *rm); +extern void vdev_raidz_child_done(zio_t *zio); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 9db6fe37b4db..ec3a53395f1c 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -234,6 +234,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) ASSERTV(uint64_t txg = dmu_tx_get_txg(tx)); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT3P(vd->vdev_ops, !=, &vdev_draid_ops); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); @@ -1074,6 +1075,7 @@ vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT3P(vd->vdev_ops, !=, &vdev_draid_ops); if (vd->vdev_leaf_zap != 0) { char zkey[32]; @@ -1919,7 +1921,7 @@ spa_vdev_remove_top_check(vdev_t *vd) /* * All vdevs in normal class must have the same ashift - * and not be raidz. + * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; @@ -1931,7 +1933,8 @@ spa_vdev_remove_top_check(vdev_t *vd) num_indirect++; if (!vdev_is_concrete(cvd)) continue; - if (cvd->vdev_ops == &vdev_raidz_ops) + if (cvd->vdev_ops == &vdev_raidz_ops || + cvd->vdev_ops == &vdev_draid_ops) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only @@ -2072,20 +2075,32 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. 
*/ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - - char *nvstr = fnvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), - VDEV_TYPE_SPARE, nvstr); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + char *type; + boolean_t draid_spare = B_FALSE; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, + B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + char *nvstr = fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", + NULL, "%s vdev (%s) %s", spa_name(spa), + VDEV_TYPE_SPARE, nvstr); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } diff --git a/module/zfs/vdev_scan.c b/module/zfs/vdev_scan.c new file mode 100644 index 000000000000..ba60649e3c8d --- /dev/null +++ b/module/zfs/vdev_scan.c @@ -0,0 +1,583 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +spa_vdev_scan_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + spa_vdev_scan_t *svs = zio->io_private; + uint64_t asize; + + ASSERT(svs != NULL); + ASSERT(svs->svs_thread != NULL); + ASSERT(zio->io_bp != NULL); + + abd_free(zio->io_abd); + asize = DVA_GET_ASIZE(&zio->io_bp->blk_dva[0]); + + scn->scn_phys.scn_examined += asize; + spa->spa_scan_pass_exam += asize; + spa->spa_scan_pass_issued += asize; + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + + mutex_enter(&svs->svs_io_lock); + ASSERT3U(svs->svs_io_asize, >=, asize); + svs->svs_io_asize -= asize; + cv_broadcast(&svs->svs_io_cv); + mutex_exit(&svs->svs_io_lock); +} + +static int spa_vdev_scan_delay = 64; /* number of ticks to delay rebuild */ +static int spa_vdev_scan_idle = 512; /* idle window in clock ticks */ + +static void +spa_vdev_scan_rebuild_block(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, uint64_t offset, uint64_t asize) +{ + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + int scan_delay = spa_vdev_scan_delay; + uint64_t psize; + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops); + + /* Calculate psize from asize */ + if (vd->vdev_ops == &vdev_mirror_ops) { + psize = asize; + } else { + int c, faulted; + + /* + * Initialize faulted to 
1, to count the spare vdev we're + * rebuilding, which is not in faulted state. + */ + for (c = 0, faulted = 1; c < vd->vdev_children; c++) { + vdev_t *child = vd->vdev_child[c]; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && spa_writeable(spa))) + faulted++; + } + + if (faulted >= vd->vdev_nparity) + scan_delay = 0; /* critical, go full speed */ + + psize = vdev_draid_asize2psize(vd, asize, offset); + } + /* + * HH: add this assertion after dmirror implemented + * ASSERT3U(asize, ==, vdev_psize_to_asize(vd, psize, offset)); + */ + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], 0); + DVA_SET_ASIZE(&dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + mutex_enter(&svs->svs_io_lock); + while (svs->svs_io_asize >= + MIN(arc_max_bytes(), 4 * SPA_MAXBLOCKSIZE * vd->vdev_children)) + cv_wait(&svs->svs_io_cv, &svs->svs_io_lock); + svs->svs_io_asize += asize; + mutex_exit(&svs->svs_io_lock); + + if (scan_delay != 0) { + /* + * If we're seeing recent (spa_vdev_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. 
+ */ + if (ddi_get_lbolt64() - vd->vdev_last_io <= spa_vdev_scan_idle) + delay(scan_delay); + } + + zio_nowait(zio_read(pio, spa, bp, + abd_alloc(psize, B_FALSE), psize, spa_vdev_scan_done, svs, + ZIO_PRIORITY_SCRUB, ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); +} + +static void +spa_vdev_scan_rebuild(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, uint64_t offset, uint64_t length) +{ + uint64_t max_asize; + + if (vd->vdev_ops == &vdev_draid_ops) + max_asize = vdev_draid_max_rebuildable_asize(vd, offset); + else + max_asize = vdev_psize_to_asize(vd, SPA_MAXBLOCKSIZE); + + while (length > 0 && !svs->svs_thread_exit) { + uint64_t chunksz = MIN(length, max_asize); + + spa_vdev_scan_rebuild_block(svs, pio, vd, offset, chunksz); + + length -= chunksz; + offset += chunksz; + } +} + +static void +spa_vdev_scan_draid_rebuild(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, vdev_t *oldvd, uint64_t offset, uint64_t length) +{ + uint64_t msi = offset >> vd->vdev_ms_shift; + boolean_t mirror; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(msi, ==, (offset + length - 1) >> vd->vdev_ms_shift); + + mirror = vdev_draid_ms_mirrored(vd, msi); + + while (length > 0 && !svs->svs_thread_exit) { + uint64_t group, group_left, chunksz; + char *action; + + /* + * Make sure we don't cross redundancy group boundary + */ + group = vdev_draid_offset2group(vd, offset, mirror); + group_left = vdev_draid_group2offset(vd, + group + 1, mirror) - offset; + + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + ASSERT3U(group_left, <=, vdev_draid_get_groupsz(vd, mirror)); + + chunksz = MIN(length, group_left); + if (vdev_draid_group_degraded(vd, oldvd, + offset, chunksz, mirror)) { + action = "Fixing"; + spa_vdev_scan_rebuild(svs, pio, vd, offset, chunksz); + } else { + spa_t *spa = vd->vdev_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + action = "Skipping"; + + scn->scn_phys.scn_examined += chunksz; + spa->spa_scan_pass_exam += 
chunksz; + } + + draid_dbg(3, "\t%s: "U64FMT"K + "U64FMT"K (%s)\n", + action, offset >> 10, chunksz >> 10, + mirror ? "mirrored" : "dRAID"); + + length -= chunksz; + offset += chunksz; + } +} + +static void +spa_vdev_scan_ms_done(zio_t *zio) +{ + metaslab_t *msp = zio->io_private; + spa_vdev_scan_t *svs = zio->io_spa->spa_vdev_scan; + int *ms_done, msi; + + ASSERT(msp != NULL); + ASSERT(svs != NULL); + + mutex_enter(&msp->ms_lock); + msp->ms_rebuilding = B_FALSE; + mutex_exit(&msp->ms_lock); + + ms_done = svs->svs_ms_done; + ASSERT(ms_done != NULL); + ASSERT0(ms_done[msp->ms_id]); + + mutex_enter(&svs->svs_lock); + + if (svs->svs_thread_exit) { + /* + * Cannot mark this MS as "done", because the rebuild thread + * may have been interrupted in the middle of working on + * this MS. + */ + mutex_exit(&svs->svs_lock); + draid_dbg(1, "Aborted rebuilding metaslab "U64FMT"\n", + msp->ms_id); + return; + } + + ms_done[msp->ms_id] = 1; + + for (msi = svs->svs_msi_synced + 1; + msi < svs->svs_vd->vdev_top->vdev_ms_count; msi++) { + if (ms_done[msi] == 0) + break; + } + svs->svs_msi_synced = msi - 1; /* last of the contiguous done run */ + + mutex_exit(&svs->svs_lock); + + draid_dbg(1, "Completed rebuilding metaslab "U64FMT"\n", msp->ms_id); + draid_dbg(1, "All metaslabs [0, %d) fully rebuilt.\n", msi); +} + +static void +spa_vdev_scan_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + zio_t *rio = zio_root(spa, NULL, NULL, 0); + range_tree_t *allocd_segs; + uint64_t msi; + int *ms_done, err; + + ASSERT(svs != NULL); + ASSERT3P(svs->svs_vd, ==, vd); + ASSERT3P(svs->svs_ms_done, ==, NULL); + + vd = vd->vdev_top; + ASSERT3U(svs->svs_msi, >=, 0); + ASSERT3U(svs->svs_msi, <, vd->vdev_ms_count); + + /* + * Wait for newvd's DTL to propagate upward when + * spa_vdev_attach()->spa_vdev_exit() calls vdev_dtl_reassess().
+ */ + txg_wait_synced(spa->spa_dsl_pool, svs->svs_dtl_max); + + allocd_segs = range_tree_create(NULL, NULL); + + ms_done = kmem_alloc(sizeof (*ms_done) * vd->vdev_ms_count, KM_SLEEP); + for (msi = 0; msi < vd->vdev_ms_count; msi++) { + if (msi < svs->svs_msi) + ms_done[msi] = 1; + else + ms_done[msi] = 0; + } + + mutex_enter(&svs->svs_lock); + svs->svs_ms_done = ms_done; + svs->svs_msi_synced = svs->svs_msi - 1; + mutex_exit(&svs->svs_lock); + + for (msi = svs->svs_msi; + msi < vd->vdev_ms_count && !svs->svs_thread_exit; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + zio_t *pio = zio_null(rio, spa, NULL, + spa_vdev_scan_ms_done, msp, rio->io_flags); + + ASSERT0(range_tree_space(allocd_segs)); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + while (msp->ms_condensing) { + mutex_exit(&msp->ms_lock); + + zfs_sleep_until(gethrtime() + 100 * MICROSEC); + + mutex_enter(&msp->ms_lock); + } + + VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); + msp->ms_rebuilding = B_TRUE; + + /* + * If the metaslab has ever been allocated from (ms_sm!=NULL), + * read the allocated segments from the space map object + * into allocd_segs. Since we do this while holding + * ms_lock and ms_sync_lock, concurrent frees (which + * would have modified the space map) will wait for us + * to finish loading the spacemap. NOTE(review): wording looks + * adapted from vdev removal; confirm it holds for rebuild. + */ + if (msp->ms_sm != NULL) { + space_map_t *sm = NULL; + + /* + * We have to open a new space map here, because + * ms_sm's sm_length and sm_alloc may not reflect + * what's in the object contents, if we are in between + * metaslab_sync() and metaslab_sync_done().
+ */ + VERIFY0(space_map_open(&sm, + spa->spa_dsl_pool->dp_meta_objset, + msp->ms_sm->sm_object, msp->ms_sm->sm_start, + msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); + space_map_update(sm); + VERIFY0(space_map_load(sm, allocd_segs, SM_ALLOC)); + space_map_close(sm); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + draid_dbg(1, "Scanning %lu segments for MS "U64FMT"\n", + avl_numnodes(&allocd_segs->rt_root), msp->ms_id); + + while (!svs->svs_thread_exit && + range_tree_space(allocd_segs) != 0) { + uint64_t offset, length; + range_seg_t *rs = avl_first(&allocd_segs->rt_root); + + ASSERT(rs != NULL); + offset = rs->rs_start; + length = rs->rs_end - rs->rs_start; + + range_tree_remove(allocd_segs, offset, length); + + draid_dbg(2, "MS ("U64FMT" at "U64FMT"K) segment: " + U64FMT"K + "U64FMT"K\n", + msp->ms_id, msp->ms_start >> 10, + (offset - msp->ms_start) >> 10, length >> 10); + + if (vd->vdev_ops == &vdev_mirror_ops) + spa_vdev_scan_rebuild(svs, pio, + vd, offset, length); + else + spa_vdev_scan_draid_rebuild(svs, pio, vd, + svs->svs_vd, offset, length); + } + + zio_nowait(pio); + } + + err = zio_wait(rio); + if (err != 0) /* HH: handle error */ + err = SET_ERROR(err); + + mutex_enter(&svs->svs_lock); + if (svs->svs_thread_exit) { + range_tree_vacate(allocd_segs, NULL, NULL); + } + + svs->svs_thread = NULL; + svs->svs_ms_done = NULL; + cv_broadcast(&svs->svs_cv); + mutex_exit(&svs->svs_lock); + + ASSERT0(range_tree_space(allocd_segs)); + range_tree_destroy(allocd_segs); + kmem_free(ms_done, sizeof (*ms_done) * vd->vdev_ms_count); + thread_exit(); +} + +void +spa_vdev_scan_start(spa_t *spa, vdev_t *oldvd, int msi, uint64_t txg) +{ + dsl_scan_t *scan = spa->spa_dsl_pool->dp_scan; + spa_vdev_scan_t *svs = kmem_zalloc(sizeof (*svs), KM_SLEEP); + + ASSERT3U(msi, <, oldvd->vdev_top->vdev_ms_count); + + svs->svs_msi = msi; + svs->svs_vd = oldvd; + svs->svs_dtl_max = txg; + svs->svs_thread = NULL; + svs->svs_ms_done = NULL; + svs->svs_dp = 
spa->spa_dsl_pool; + mutex_init(&svs->svs_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svs->svs_cv, NULL, CV_DEFAULT, NULL); + svs->svs_io_asize = 0; + mutex_init(&svs->svs_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svs->svs_io_cv, NULL, CV_DEFAULT, NULL); + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + spa->spa_vdev_scan = svs; + svs->svs_thread = thread_create(NULL, 0, spa_vdev_scan_thread, oldvd, + 0, NULL, TS_RUN, defclsyspri); + + scan->scn_restart_txg = txg; +} + +int +spa_vdev_scan_restart(vdev_t *rvd) +{ + spa_t *spa = rvd->vdev_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + spa_rebuilding_phys_t svs_phys; + int err; + vdev_t *tvd, *oldvd, *pvd, *dspare; + + ASSERT(scn != NULL); + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + + err = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REBUILDING, sizeof (uint64_t), + sizeof (spa_rebuilding_phys_t) / sizeof (uint64_t), &svs_phys); + + if (err != 0 || !DSL_SCAN_IS_REBUILD(scn) || + scn->scn_phys.scn_state == DSS_FINISHED || + svs_phys.sr_vdev == 0 || svs_phys.sr_oldvd == 0 || + svs_phys.sr_ms < -1) + return (SET_ERROR(ENOENT)); + + tvd = vdev_lookup_by_guid(rvd, svs_phys.sr_vdev); + oldvd = vdev_lookup_by_guid(rvd, svs_phys.sr_oldvd); + if (tvd == NULL || oldvd == NULL || oldvd->vdev_top != tvd) + return (SET_ERROR(ENOENT)); + + if (tvd->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(ENOTSUP)); + + if (svs_phys.sr_ms >= tvd->vdev_ms_count - 1) + return (SET_ERROR(ENOENT)); + + pvd = oldvd->vdev_parent; + if (pvd->vdev_ops != &vdev_spare_ops || pvd->vdev_children != 2) + return (SET_ERROR(ENOENT)); + + dspare = pvd->vdev_child[1]; + if (dspare->vdev_ops != &vdev_draid_spare_ops || + !vdev_resilver_needed(dspare, NULL, NULL)) + return (SET_ERROR(ENOENT)); + + draid_dbg(1, "Restarting rebuild at metaslab "U64FMT"\n", + svs_phys.sr_ms + 1); + spa_vdev_scan_start(spa, oldvd, svs_phys.sr_ms + 1, + spa_last_synced_txg(spa) + 1 + TXG_CONCURRENT_STATES); + return (0); +} + +void 
+spa_vdev_scan_setup_sync(dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + spa_t *spa = scn->scn_dp->dp_spa; + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + vdev_t *oldvd; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(svs != NULL); + + oldvd = svs->svs_vd; + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = POOL_SCAN_REBUILD; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = 0; + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + /* Rebuild only examines blocks on one vdev */ + scn->scn_phys.scn_to_examine = oldvd->vdev_top->vdev_stat.vs_alloc; + svs->svs_phys.sr_ms = -1; + svs->svs_phys.sr_vdev = oldvd->vdev_top->vdev_guid; + svs->svs_phys.sr_oldvd = oldvd->vdev_guid; + + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + scn->scn_sync_start_time = gethrtime(); + + spa->spa_scrub_active = B_TRUE; + spa_scan_stat_init(spa); + spa->spa_scrub_started = B_TRUE; + spa_event_notify(spa, NULL, NULL, ESC_ZFS_REBUILD_START); +} + +int +spa_vdev_scan_rebuild_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + /* Rebuild happens in open context and does not use this callback */ + ASSERT0(1); + return (-ENOTSUP); +} + +void +spa_vdev_scan_destroy(spa_t *spa) +{ + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + + if (svs == NULL) + return; + + ASSERT3P(svs->svs_thread, ==, NULL); + ASSERT3P(svs->svs_ms_done, ==, NULL); + ASSERT3U(svs->svs_io_asize, ==, 0); + + spa->spa_vdev_scan = NULL; + mutex_destroy(&svs->svs_lock); + cv_destroy(&svs->svs_cv); + mutex_destroy(&svs->svs_io_lock); + cv_destroy(&svs->svs_io_cv); + kmem_free(svs, sizeof (*svs)); +} + +void +spa_vdev_scan_suspend(spa_t *spa) +{ + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + + if (svs == NULL) + return; + + mutex_enter(&svs->svs_lock); + svs->svs_thread_exit = B_TRUE; + while (svs->svs_thread != NULL) + 
cv_wait(&svs->svs_cv, &svs->svs_lock); + mutex_exit(&svs->svs_lock); +} + +void +spa_vdev_scan_sync_state(spa_vdev_scan_t *svs, dmu_tx_t *tx) +{ + VERIFY0(zap_update(svs->svs_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REBUILDING, sizeof (uint64_t), + sizeof (spa_rebuilding_phys_t) / sizeof (uint64_t), + &svs->svs_phys, tx)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(spa_vdev_scan_delay, int, 0644); +MODULE_PARM_DESC(spa_vdev_scan_delay, "Number of ticks to delay SPA rebuild"); + +module_param(spa_vdev_scan_idle, int, 0644); +MODULE_PARM_DESC(spa_vdev_scan_idle, + "Idle window in clock ticks for SPA rebuild"); +#endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e8c2ca89aff9..9fd043f6602d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3607,6 +3607,27 @@ zio_vdev_io_start(zio_t *zio) } } + /* + * We keep track of time-sensitive I/Os so that the scan thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of vdev_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = vd->vdev_last_io; + uint64_t new = ddi_get_lbolt64(); + + if (old != new) + (void) atomic_cas_64(&vd->vdev_last_io, old, new); + } + align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && @@ -3670,11 +3691,21 @@ zio_vdev_io_start(zio_t *zio) * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. 
+ * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdev's + * of such blocks can have empty DTL_PARTIAL. + * + * There seemed no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); @@ -3682,6 +3713,7 @@ zio_vdev_io_start(zio_t *zio) } if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) @@ -3718,8 +3750,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 8907533c9e4b..3e57e11eae0b 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -87,5 +87,6 @@ if is_linux; then "feature@encryption" "feature@project_quota" "feature@allocation_classes" + "feature@draid" ) fi From d010e5d8e9bccc05e318e4c2ffb33106bb8caaa8 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 11 Oct 2018 17:07:02 -0600 Subject: [PATCH 2/3] Make ztest and zloop dRAID aware Included new command 
options to control ztest testing with dRAID: -K draid|raidz|random -- kind of RAID to test -D -- dRAID data drives per redundancy group -G -- dRAID redundancy group count -S -- dRAID distributed spare drives -R -- RAID parity (raidz or dRAID) The above values are used to generate a dRAID config from the draidcfg command. This also allows for a more comprehensive set of configurations when used with zloop. By default, dRAID will be provisioned 50% of the time in ztest runs. Added dRAID specific section to zloop to exercise dRAID across a wide range of configuration parameters. For example: ztest -VVVV -K draid -D 7 -G 6 -S 1 -m 0 -r 1 -R 1 -v 0 -a 12 -s 384m Updated the zloop ztest time range to be 30 - 120 seconds. Signed-off-by: Don Brady --- cmd/ztest/ztest.c | 349 ++++++++++++++++++++++++++++++++++++---------- man/man1/ztest.1 | 31 +++- scripts/zloop.sh | 86 +++++++++++- 3 files changed, 381 insertions(+), 85 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 40b107175edc..d9f543387845 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -107,6 +107,7 @@ #include #include #include +#include #include #include #include @@ -166,8 +167,12 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; + int zo_raid_children; + int zo_raid_parity; + char zo_raid_type[8]; + int zo_draid_data; + int zo_draid_groups; + int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; @@ -189,9 +194,13 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, + .zo_raid_children = 4, + .zo_raid_parity = 1, + .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_draid_data = 4, /* data drives per redundancy group */ + .zo_draid_groups = 3, /* redundancy group count */ + .zo_draid_spares = 1, /* distributed spares */ 
.zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ @@ -229,7 +238,7 @@ static ztest_shared_ds_t *ztest_shared_ds; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ - (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -502,8 +511,10 @@ enum ztest_object { ZTEST_OBJECTS }; +#define DRAID_CONFIG "draid.config" static void usage(boolean_t) __NORETURN; +static void make_draid_config(ztest_shared_opts_t *zo); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. @@ -663,7 +674,11 @@ usage(boolean_t requested) "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" + "\t[-R raid_parity (default: %d)]\n" + "\t[-K raid_kind (default: random)] raidz|draid|random\n" + "\t[-D draid_data_drives (default: %d)] per redundancy group\n" + "\t[-G draid_groups (default: %d)] redundancy group count\n" + "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" @@ -681,7 +696,6 @@ usage(boolean_t requested) "\t[-C vdev class state (default: random)] special=on|off|random\n" "\t[-o variable=value] ... 
set global variable to an unsigned\n" "\t 32-bit integer value\n" - "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" "\t[-h] (print help)\n" "", zo->zo_pool, @@ -689,8 +703,11 @@ usage(boolean_t requested) nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ + zo->zo_raid_children, /* -r */ + zo->zo_raid_parity, /* -R */ + zo->zo_draid_data, /* -D */ + zo->zo_draid_groups, /* -G */ + zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ @@ -704,6 +721,21 @@ usage(boolean_t requested) exit(requested ? 0 : 1); } +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) @@ -753,11 +785,12 @@ process_options(int argc, char **argv) int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; + char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + "v:s:a:m:r:R:K:D:G:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:")) != EOF) { value = 0; switch (opt) { case 'v': @@ -766,6 +799,9 @@ process_options(int argc, char **argv) case 'm': case 'r': case 'R': + case 'D': + case 'G': + case 'S': case 'd': case 't': case 'g': @@ -790,10 +826,22 @@ process_options(int argc, char **argv) zo->zo_mirrors = value; break; case 'r': - zo->zo_raidz = MAX(1, value); + zo->zo_raid_children = MAX(1, value); break; case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raid_parity = MIN(MAX(value, 1), 3); + break; + case 'K': + (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); + break; + case 'D': + zo->zo_draid_data = MAX(1, value); + break; + case 
'G': + zo->zo_draid_groups = MAX(1, value); + break; + case 'S': + zo->zo_draid_spares = MAX(1, value); + break; case 'd': zo->zo_datasets = MAX(1, value); @@ -855,9 +903,6 @@ process_options(int argc, char **argv) if (set_global_var(optarg) != 0) usage(B_FALSE); break; - case 'G': - ztest_dump_debug_buffer = B_TRUE; - break; case 'h': usage(B_TRUE); break; @@ -868,7 +913,49 @@ process_options(int argc, char **argv) } } - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + /* When raid choice is 'random' add a draid pool 50% of the time */ + if (strcmp(raid_kind, "random") == 0) { + (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? + "draid" : "raidz", sizeof (raid_kind)); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("choosing RAID type '%s'\n", raid_kind); + } + + if (strcmp(raid_kind, "draid") == 0) { + uint64_t min_devsize; + + /* Compute dRAID total disks from inputs */ + ztest_opts.zo_raid_children = (zo->zo_draid_groups * + (zo->zo_draid_data + zo->zo_raid_parity)) + + zo->zo_draid_spares; + + /* With fewer than 16 disks use 256M; otherwise 128M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (256ULL << 20) : (128ULL << 20); + + /* No top-level mirrors with dRAID for now */ + zo->zo_mirrors = 0; + + /* Use more appropriate defaults for dRAID */ + if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) + zo->zo_vdevs = 1; + if (zo->zo_ashift < 12) + zo->zo_ashift = 12; + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, + sizeof (zo->zo_raid_type)); + + make_draid_config(zo); + + } else /* using raidz */ { + ASSERT0(strcmp(raid_kind, "raidz")); + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : @@ -939,22 +1026,6 @@ ztest_kill(ztest_shared_t *zs) (void) kill(getpid(), SIGKILL); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - /* ARGSUSED */ static void ztest_record_enospc(const char *s) @@ -970,6 +1041,97 @@ ztest_get_ashift(void) return (ztest_opts.zo_ashift); } +static int +ztest_check_path(char *path) +{ + struct stat s; + /* return true on success */ + return (!stat(path, &s)); +} + +static void +ztest_get_draidcfg_bin(char *bin, int len) +{ + VERIFY(realpath(getexecname(), bin) != NULL); + + if (strstr(bin, "/ztest/")) { + strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ + strcat(bin, "/draidcfg/draidcfg"); + if (ztest_check_path(bin)) + return; + } + strcpy(bin, "draidcfg"); +} + +/* + * dRAID configured via draidcfg command + */ +static void +ztest_run_draidcfg(uint_t total, uint_t group, uint_t parity, uint_t spares, + const char *path) +{ + int status; + char *bin; + char *draidcfg; + char *zbuf; + const int len = MAXPATHLEN + MAXNAMELEN + 20; + FILE *fp; + + bin = umem_alloc(len, UMEM_NOFAIL); + draidcfg = umem_alloc(len, UMEM_NOFAIL); + zbuf = umem_alloc(1024, UMEM_NOFAIL); + + ztest_get_draidcfg_bin(bin, len); + + /* + * draidcfg -n total_drives -d drives_per_redundancy_group + * -p parity_per_group -s distributed_spare cfg_file_name + */ + (void) sprintf(draidcfg, "%s -n %d -d %d -p %d -s %d %s", + bin, total, group, parity, spares, path); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("Executing %s\n", strstr(draidcfg, "draidcfg ")); + + fp = popen(draidcfg, "r"); + + while (fgets(zbuf, 1024, fp) != NULL) + if (ztest_opts.zo_verbose > 4) + (void) printf("%s", zbuf); + + status = pclose(fp); + + if (status == 0) + goto out; + + ztest_dump_core = 0; + if (WIFEXITED(status)) + fatal(0, 
"'%s' exit code %d", draidcfg, WEXITSTATUS(status)); + else + fatal(0, "'%s' died with signal %d", draidcfg, + WTERMSIG(status)); +out: + umem_free(bin, len); + umem_free(draidcfg, len); + umem_free(zbuf, 1024); +} + +static void +make_draid_config(ztest_shared_opts_t *zo) +{ + char *path; + + path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(path, MAXPATHLEN, "%s/%s", zo->zo_dir, DRAID_CONFIG); + + /* build a dRAID config */ + ztest_run_draidcfg(ztest_opts.zo_raid_children, + ztest_opts.zo_draid_data, ztest_opts.zo_raid_parity, + ztest_opts.zo_draid_spares, path); + + umem_free(path, MAXPATHLEN); +} + static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { @@ -1009,7 +1171,12 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) } VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); + if (strstr(path, VDEV_TYPE_DRAID) != NULL) + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_DRAID_SPARE) == 0); + else + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_FILE) == 0); VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); umem_free(pathbuf, MAXPATHLEN); @@ -1018,10 +1185,10 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) } static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, +make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { - nvlist_t *raidz, **child; + nvlist_t *raid, **child; int c; if (r < 2) @@ -1031,20 +1198,33 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size, for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - 
VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, + VERIFY(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE, + ztest_opts.zo_raid_type) == 0); + VERIFY(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raid_parity) == 0); + VERIFY(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, child, r) == 0); + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { + nvlist_t *draidcfg; + char *path; + + path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(path, MAXPATHLEN, "%s/%s", ztest_opts.zo_dir, + DRAID_CONFIG); + draidcfg = draidcfg_read_file(path); + VERIFY(draidcfg != NULL); + VERIFY(vdev_draid_config_add(raid, draidcfg) == B_TRUE); + umem_free(path, MAXPATHLEN); + } + for (c = 0; c < r; c++) nvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); - return (raidz); + return (raid); } static nvlist_t * @@ -1055,12 +1235,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, int c; if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -2874,6 +3054,11 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_mmp_test) return; + /* skip upgrade testing for dRAID */ + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + return; + + mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); @@ -2883,13 +3068,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, 
ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * the initial version is capable of supporting that feature. */ - switch (ztest_opts.zo_raidz_parity) { + switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; @@ -3047,7 +3232,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -3101,7 +3287,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? - "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3155,14 +3342,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3211,7 +3399,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) char *aux; char *path; uint64_t guid = 0; - int error; + int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; @@ -3234,7 +3422,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) /* * Pick a random 
device to remove. */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + + /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ + if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) + ignore_err = ENOTSUP; + + guid = svd->vdev_guid; } else { /* * Find an unused device we can add. @@ -3291,7 +3485,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: - fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + if (error != ignore_err) + fatal(0, "spa_vdev_remove(%llu) = %d", guid, + error); } } @@ -3320,7 +3516,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) mutex_enter(&ztest_vdev_lock); /* ensure we have a useable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } @@ -3430,7 +3626,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3470,14 +3666,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ - if (ztest_opts.zo_raidz > 1) { - ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + if (ztest_opts.zo_raid_children > 1) { + if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 
+ else + ASSERT(oldvd->vdev_ops == &vdev_draid_ops); + ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* @@ -3600,6 +3799,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) error == ZFS_ERR_DISCARDING_CHECKPOINT) expected_error = error; + /* + * dRAID doesn't allow distributed spares from a different vdev + * and a spare can be smaller than a slightly expanded vdev + */ + if (((error == ENOTSUP && expected_error != error) || + (error == 0 && expected_error == EOVERFLOW)) && + strstr(newpath, VDEV_TYPE_DRAID) != NULL) { + expected_error = error; + } + + /* XXX workaround 6690467 */ if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " @@ -3959,6 +4169,7 @@ ztest_dataset_create(char *dsname) * wrapping key. */ rand = ztest_random(2); + if (rand != 0) { nvlist_t *crypto_args = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); @@ -5823,7 +6034,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); @@ -6422,14 +6633,6 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) } } -static int -ztest_check_path(char *path) -{ - struct stat s; - /* return true on success */ - return (!stat(path, &s)); -} - static void ztest_get_zdb_bin(char *bin, int len) { @@ -7228,7 +7431,7 @@ ztest_import(ztest_shared_t *zs) libzfs_fini(hdl); kernel_fini(); - if (!ztest_opts.zo_mmp_test) { + if (!ztest_opts.zo_mmp_test && 0) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); @@ -7264,7 +7467,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, 
ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* @@ -7586,10 +7789,12 @@ main(int argc, char **argv) if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", + "%d %s disks, %llu seconds...\n\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, + ztest_opts.zo_raid_children, + ztest_opts.zo_raid_type, (u_longlong_t)ztest_opts.zo_time); } diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index b8cb0d45d92c..64e8c4b40816 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -23,6 +23,7 @@ .\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. +.\" Copyright (c) 2017, Intel Corporation. .\" .TH ztest 1 "2009 NOV 01" "ZFS on Linux" "User Commands" @@ -70,7 +71,7 @@ Print a help summary. .IP Number of vdevs. .HP -.BI "\-s" " size_of_each_vdev" " (default: 64M)" +.BI "\-s" " size_of_each_vdev" " (default: 256M)" .IP Size of each vdev. .HP @@ -86,9 +87,29 @@ Number of mirror copies. .IP Number of raidz disks. .HP -.BI "\-R" " raidz_parity" " (default: 1)" +.BI "\-R" " raid_parity" " (default: 1)" .IP -Raidz parity. +Raid parity (raidz & draid). +.HP +.BI "\-K" " raid_kind" " (default: 'random') raidz|draid|random" +.IP +The kind of RAID config to use. With 'random', raidz or draid is chosen at random with equal probability. +.HP +.BI "\-D" " draid_data_disks" " (default: 4)" +.IP +Number of data disks in a dRAID redundancy group. +.HP +.BI "\-G" " draid_groups" " (default: 3)" +.IP +Number of dRAID redundancy groups. +.HP +.BI "\-S" " draid_spares" " (default: 1)" +.IP +Number of dRAID distributed spare disks. +.HP +.BI "\-C" " vdev_class_state" " (default: random)" +.IP +The vdev allocation class state: special=on|off|random. .HP .BI "\-d" " datasets" " (default: 7)" .IP @@ -129,10 +150,6 @@ Total test run time. 
.BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs) .IP Injected failure rate. -.HP -.BI "\-G" -.IP -Dump zfs_dbgmsg buffer before exiting. .SH "EXAMPLES" .LP To override /tmp as your location for block files, you can use the -f diff --git a/scripts/zloop.sh b/scripts/zloop.sh index 1f36f865b594..43e422924224 100755 --- a/scripts/zloop.sh +++ b/scripts/zloop.sh @@ -18,6 +18,7 @@ # # Copyright (c) 2015 by Delphix. All rights reserved. # Copyright (C) 2016 Lawrence Livermore National Security, LLC. +# Copyright (c) 2017, Intel Corporation. # BASE_DIR=$(dirname "$0") @@ -229,27 +230,100 @@ curtime=$starttime # if no timeout was specified, loop forever. while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do - zopt="-G -VVVVV" + zopt="-VVVVV" # start each run with an empty directory workdir="$basedir/$rundir" or_die rm -rf "$workdir" or_die mkdir "$workdir" - # switch between common arrangements & fully randomized - if [[ $((RANDOM % 2)) -eq 0 ]]; then + # switch between three types of configs + # 25% basic, 25% raidz mix, and 50% draid mix + choice=$((RANDOM % 4)) + + # ashift is either 9 or 12 + align=$(((RANDOM % 2) * 3 + 9)) + + if [[ $choice -eq 0 ]]; then + # basic mirror only mirrors=2 raidz=0 parity=1 vdevs=2 - else + zopt="$zopt -K raidz" + elif [[ $choice -eq 1 ]]; then + # fully randomized mirror/raidz (sans dRAID) mirrors=$(((RANDOM % 3) * 1)) parity=$(((RANDOM % 3) + 1)) raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) vdevs=$(((RANDOM % 3) + 3)) + zopt="$zopt -K raidz" + else + # mix of draid fixed (one per parity) and fully random + mirrors=0 + raidz=1 + align=12 + case $((RANDOM % 4)) in + 0 ) # draid1: 3 x (4 + 1) + 1 = 16 drives + parity=1 + draid_data=4 + draid_groups=3 + draid_spares=1 + vdevs=2 + class="special=off" + size=320m + ;; + 1 ) # draid2: 3 x (5 + 2) + 2 = 23 drives + parity=2 + draid_data=5 + draid_groups=3 + draid_spares=2 + vdevs=0 + class="special=on" + size=240m + ;; + 2 ) # draid3: 4 x (6 + 3) + 3 
= 39 drives + parity=3 + draid_data=6 + draid_groups=4 + draid_spares=3 + vdevs=0 + class="special=on" + size=160m + ;; + 3 ) # dRAID with varying choices + # parity: 1 --> 3 + # data: 3 --> 6 + # groups: 2 --> 6 + # spares: 0 --> parity + 1 + # yields max drives = 6 x (6 + 3) + 4 = 58 drives + parity=$(((RANDOM % 3) + 1)) + draid_data=$(((RANDOM % 4) + 3 + parity)) + draid_groups=$(((RANDOM % 5) + 2)) + draid_spares=$(((RANDOM % 2) + parity)) + vdevs=$((RANDOM % 3)) + class="special=random" + gbavail=$(df -B 1G "$workdir" | awk 'NR==2 {print $4}') + # check if we have at least 60G available + if [[ $gbavail -gt 60 ]]; then + # use size range 128MB - 1GB + megabytes=$((((RANDOM % 8) + 1) * 128)) + size="${megabytes}m" + else + echo "limited storage on '$workdir'" >>ztest.out + size=128m + fi + esac + + zopt="$zopt -K draid" + zopt="$zopt -D $draid_data" + zopt="$zopt -G $draid_groups" + zopt="$zopt -S $draid_spares" + zopt="$zopt -C $class" fi - align=$(((RANDOM % 2) * 3 + 9)) - runtime=$((RANDOM % 100)) + + # run from 30 to 120 seconds + runtime=$(((RANDOM % 90) + 30)) passtime=$((RANDOM % (runtime / 3 + 1) + 10)) zopt="$zopt -m $mirrors" From 296497092bbbc3c3ca0cbeaf97fe637c65cfac67 Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 11 Oct 2018 21:42:18 -0600 Subject: [PATCH 3/3] Cleanup draid config code dependencies Removed libzfs dependency on libzpool The new source relationship is as follows: [include/draid_config.h]<--------+ ^ ^ ^ | | | | (include) | | | | | [zpool] [draidcfg] [ztest] | | | | | | | | [vdev_draid_impl.h] +-+ +-+ | ^ | | (link) | | v v v | [libzfs] [libzpool] [zfs-mod] | | | +-----+ | (object) | | | | v v v [zcommon/draid_config.c] Signed-off-by: Don Brady --- cmd/draidcfg/draid_permutation.c | 2 + cmd/draidcfg/draid_permutation.h | 2 +- cmd/draidcfg/draidcfg.c | 3 +- cmd/zpool/zpool_main.c | 1 - cmd/zpool/zpool_vdev.c | 2 +- include/Makefile.am | 1 + include/draid_config.h | 72 ++++++++++ include/libzfs.h | 5 - 
include/sys/vdev_draid_impl.h | 24 +--- include/sys/vdev_impl.h | 2 - lib/libzfs/Makefile.am | 2 +- lib/libzfs/libzfs_import.c | 59 -------- lib/libzfs/libzfs_pool.c | 2 +- lib/libzpool/Makefile.am | 1 + module/zcommon/Makefile.in | 1 + module/zcommon/draid_config.c | 238 +++++++++++++++++++++++++++++++ module/zfs/vdev_draid.c | 139 ------------------ 17 files changed, 322 insertions(+), 234 deletions(-) create mode 100644 include/draid_config.h create mode 100644 module/zcommon/draid_config.c diff --git a/cmd/draidcfg/draid_permutation.c b/cmd/draidcfg/draid_permutation.c index 4753f3f31f66..b8d8c0cc1295 100644 --- a/cmd/draidcfg/draid_permutation.c +++ b/cmd/draidcfg/draid_permutation.c @@ -22,6 +22,7 @@ * Copyright (c) 2016 Intel Corporation. */ +#include #include #include #include @@ -29,6 +30,7 @@ #include #include #include +#include #include "draid_permutation.h" diff --git a/cmd/draidcfg/draid_permutation.h b/cmd/draidcfg/draid_permutation.h index 8562ccf09852..b65b3ae4e688 100644 --- a/cmd/draidcfg/draid_permutation.h +++ b/cmd/draidcfg/draid_permutation.h @@ -26,7 +26,7 @@ #ifndef _DRAID_PERMUTATION_H #define _DRAID_PERMUTATION_H -#include +#include "draid_config.h" #ifdef __cplusplus extern "C" { diff --git a/cmd/draidcfg/draidcfg.c b/cmd/draidcfg/draidcfg.c index 90e40a61a2e1..4658e1c9ac7d 100644 --- a/cmd/draidcfg/draidcfg.c +++ b/cmd/draidcfg/draidcfg.c @@ -28,9 +28,8 @@ #include #include #include -#include -#include +#include "draid_config.h" #include "draid_permutation.h" diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 7e48c67c0b4c..728c10c3b475 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index deefcaa53a87..fa83886e7498 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -84,7 +84,7 @@ #include #include "zpool_util.h" #include -#include +#include "draid_config.h" 
/* * For any given vdev specification, we can have multiple errors. The diff --git a/include/Makefile.am b/include/Makefile.am index 5f13505f2790..5000a500a7c6 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -1,6 +1,7 @@ SUBDIRS = linux spl sys COMMON_H = \ + $(top_srcdir)/include/draid_config.h \ $(top_srcdir)/include/zfeature_common.h \ $(top_srcdir)/include/zfs_comutil.h \ $(top_srcdir)/include/zfs_deleg.h \ diff --git a/include/draid_config.h b/include/draid_config.h new file mode 100644 index 000000000000..8e4b7736af4b --- /dev/null +++ b/include/draid_config.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _DRAID_CONFIG_H +#define _DRAID_CONFIG_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define VDEV_DRAID_MAX_CHILDREN 255 +#define VDEV_DRAID_U8_MAX ((uint8_t)-1) + +#define VDEV_RAIDZ_MAXPARITY 3 + +/* + * Double '%' characters in the front because it's used as format string in + * scanf()/printf() family of functions + */ +#define VDEV_DRAID_SPARE_PATH_FMT "%%"VDEV_TYPE_DRAID"%lu-%lu-s%lu" + +struct abd; + +struct vdev_draid_configuration { + uint64_t dcf_data; + uint64_t dcf_parity; + uint64_t dcf_spare; + uint64_t dcf_children; + uint64_t dcf_bases; + struct abd *dcf_zero_abd; /* zfs module and libzpool only */ + const uint64_t *dcf_base_perms; +}; + +struct vdev; +typedef struct vdev vdev_t; + +extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); + +#ifndef _KERNEL +extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); +extern nvlist_t *draidcfg_read_file(const char *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _DRAID_CONFIG_H */ diff --git a/include/libzfs.h b/include/libzfs.h index a93b142b3127..45b8e3d5f1a1 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -892,11 +892,6 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); -/* - * dRAID import support - */ -nvlist_t *draidcfg_read_file(const char *); - /* * Support for Linux libudev derived persistent device strings */ diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h index 1bc9ebeab0d0..4334a21f2b46 100644 --- a/include/sys/vdev_draid_impl.h +++ b/include/sys/vdev_draid_impl.h @@ -32,21 +32,12 @@ #include #include +#include "draid_config.h" + #ifdef __cplusplus extern "C" { #endif - -struct vdev_draid_configuration { - uint64_t dcf_data; - uint64_t dcf_parity; - uint64_t dcf_spare; - uint64_t dcf_children; - uint64_t dcf_bases; - abd_t 
*dcf_zero_abd; - const uint64_t *dcf_base_perms; -}; - extern boolean_t vdev_draid_ms_mirrored(const vdev_t *, uint64_t); extern boolean_t vdev_draid_group_degraded(vdev_t *, vdev_t *, uint64_t, uint64_t, boolean_t); @@ -57,8 +48,6 @@ extern uint64_t vdev_draid_group2offset(const vdev_t *, uint64_t, boolean_t); extern boolean_t vdev_draid_is_remainder_group(const vdev_t *, uint64_t, boolean_t); extern uint64_t vdev_draid_get_groupsz(const vdev_t *, boolean_t); -extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); -extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); extern void vdev_draid_fix_skip_sectors(zio_t *); extern int vdev_draid_hide_skip_sectors(raidz_map_t *); extern void vdev_draid_restore_skip_sectors(raidz_map_t *, int); @@ -70,15 +59,6 @@ extern nvlist_t *vdev_draid_spare_read_config(vdev_t *); extern uint64_t vdev_draid_asize2psize(vdev_t *, uint64_t, uint64_t); extern uint64_t vdev_draid_max_rebuildable_asize(vdev_t *, uint64_t); -#define VDEV_DRAID_MAX_CHILDREN 255 -#define VDEV_DRAID_U8_MAX ((uint8_t)-1) - -/* - * Double '%' characters in the front because it's used as format string in - * scanf()/printf() family of functions - */ -#define VDEV_DRAID_SPARE_PATH_FMT "%%"VDEV_TYPE_DRAID"%lu-%lu-s%lu" - #ifdef _KERNEL #define U64FMT "%llu" #ifdef ZFS_IS_GPL_COMPATIBLE diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 1a7e60263545..decb565580ce 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -366,8 +366,6 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; }; -#define VDEV_RAIDZ_MAXPARITY 3 - #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 0709c5c5298e..a63c81690024 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -34,6 +34,7 @@ USER_C = \ KERNEL_C = \ algs/sha2/sha2.c \ + draid_config.c \ zfeature_common.c \ 
zfs_comutil.c \ zfs_deleg.c \ @@ -60,7 +61,6 @@ libzfs_la_LIBADD = \ $(top_builddir)/lib/libshare/libshare.la \ $(top_builddir)/lib/libtpool/libtpool.la \ $(top_builddir)/lib/libuutil/libuutil.la \ - $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBSSL) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index b0d574e55501..a5038bb0209b 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include "libzfs.h" #include "libzfs_impl.h" @@ -927,64 +926,6 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) return (B_FALSE); } -nvlist_t * -draidcfg_read_file(const char *path) -{ - int fd; - struct stat64 sb; - char *buf; - nvlist_t *config; - - if ((fd = open(path, O_RDONLY)) < 0) { - (void) fprintf(stderr, "Cannot open '%s'\n", path); - return (NULL); - } - - if (fstat64(fd, &sb) != 0) { - (void) fprintf(stderr, "Failed to stat '%s'\n", path); - close(fd); - return (NULL); - } - - if (!S_ISREG(sb.st_mode)) { - (void) fprintf(stderr, "Not a regular file '%s'\n", path); - close(fd); - return (NULL); - } - - if ((buf = malloc(sb.st_size)) == NULL) { - (void) fprintf(stderr, "Failed to allocate %llu bytes\n", - (u_longlong_t)sb.st_size); - close(fd); - return (NULL); - } - - if (read(fd, buf, sb.st_size) != sb.st_size) { - (void) fprintf(stderr, "Failed to read %llu bytes\n", - (u_longlong_t)sb.st_size); - close(fd); - free(buf); - return (NULL); - } - - (void) close(fd); - - if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { - (void) fprintf(stderr, "Failed to unpack nvlist\n"); - free(buf); - return (NULL); - } - - free(buf); - - if (!vdev_draid_config_validate(NULL, config)) { - nvlist_free(config); - return (NULL); - } - - return (config); -} - /* * Convert our list of pools into the definitive set of configurations. 
We * start by picking the best config for each toplevel vdev. Once that's done, diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 5fb041b8ce07..dbc0e320fafc 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -46,9 +46,9 @@ #include #include #include -#include #include +#include "draid_config.h" #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 1a2c2c9b39b8..8c792dbccc8e 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -26,6 +26,7 @@ USER_C = \ util.c KERNEL_C = \ + draid_config.c \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ diff --git a/module/zcommon/Makefile.in b/module/zcommon/Makefile.in index 0ac0d43ee833..89a7eb188e9d 100644 --- a/module/zcommon/Makefile.in +++ b/module/zcommon/Makefile.in @@ -13,6 +13,7 @@ ifeq ($(target_cpu),sparc64) ccflags-y += -Wno-unused-value endif +$(MODULE)-objs += draid_config.o $(MODULE)-objs += zfeature_common.o $(MODULE)-objs += zfs_comutil.o $(MODULE)-objs += zfs_deleg.o diff --git a/module/zcommon/draid_config.c b/module/zcommon/draid_config.c new file mode 100644 index 000000000000..3dddc97d4639 --- /dev/null +++ b/module/zcommon/draid_config.c @@ -0,0 +1,238 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + */ + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +boolean_t +vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) +{ + int i; + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b; + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + return (B_FALSE); + } + + if (n - 1 > VDEV_DRAID_U8_MAX) { + draid_dbg(0, "%s configuration too large: "U64FMT"\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); + return (B_FALSE); + } + if (vd != NULL && n != vd->vdev_children) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PARITY); + return (B_FALSE); + } + + if (vd != NULL && p != vd->vdev_nparity) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_DATA); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_SPARE); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_BASE); + return (B_FALSE); + } + + if (n == 0 || d == 0 || p == 0 || s == 0 || b == 0) { + draid_dbg(0, "Zero n/d/p/s/b\n"); + return (B_FALSE); + } + + if (p > VDEV_RAIDZ_MAXPARITY) { + draid_dbg(0, "Invalid parity "U64FMT"\n", p); + return (B_FALSE); + } + + if ((n - s) % (p 
+ d) != 0) { + draid_dbg(0, U64FMT" mod "U64FMT" is not 0\n", n - s, p + d); + return (B_FALSE); + } + + if (nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PERM); + return (B_FALSE); + } + + if (c != b * n) { + draid_dbg(0, + "Permutation array has %u items, but "U64FMT" expected\n", + c, b * n); + return (B_FALSE); + } + + for (i = 0; i < b; i++) { + int j, k; + for (j = 0; j < n; j++) { + uint64_t val = perm[i * n + j]; + + if (val >= n) { + draid_dbg(0, + "Invalid value "U64FMT" in " + "permutation %d\n", val, i); + return (B_FALSE); + } + + for (k = 0; k < j; k++) { + if (val == perm[i * n + k]) { + draid_dbg(0, + "Duplicated value "U64FMT" in " + "permutation %d\n", + val, i); + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} + +#if !defined(_KERNEL) +boolean_t +vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) +{ + char *type; + uint64_t parity; + nvlist_t **children = NULL; + uint_t c = 0; + + if (draidcfg == NULL) + return (B_FALSE); + + type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_DRAID) != 0) + return (B_FALSE); + + parity = fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); + if (parity != fnvlist_lookup_uint64(draidcfg, + ZPOOL_CONFIG_DRAIDCFG_PARITY)) + return (B_FALSE); + + VERIFY0(nvlist_lookup_nvlist_array(top, + ZPOOL_CONFIG_CHILDREN, &children, &c)); + if (c != + fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) + return (B_FALSE); + + /* HH: todo: check permutation array csum */ + fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); + return (B_TRUE); +} + +nvlist_t * +draidcfg_read_file(const char *path) +{ + int fd; + struct stat64 sb; + char *buf; + nvlist_t *config; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, "Cannot open '%s'\n", path); + return (NULL); + } + + if (fstat64(fd, &sb) != 0) { + (void) fprintf(stderr, "Failed to stat '%s'\n", 
path); + close(fd); + return (NULL); + } + + if (!S_ISREG(sb.st_mode)) { + (void) fprintf(stderr, "Not a regular file '%s'\n", path); + close(fd); + return (NULL); + } + + if ((buf = malloc(sb.st_size)) == NULL) { + (void) fprintf(stderr, "Failed to allocate %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + return (NULL); + } + + if (read(fd, buf, sb.st_size) != sb.st_size) { + (void) fprintf(stderr, "Failed to read %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + free(buf); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "Failed to unpack nvlist\n"); + free(buf); + return (NULL); + } + + free(buf); + + if (!vdev_draid_config_validate(NULL, config)) { + nvlist_free(config); + return (NULL); + } + + return (config); +} +#endif /* _KERNEL */ + +#if defined(_KERNEL) +EXPORT_SYMBOL(vdev_draid_config_validate); +#endif diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 8476384d2cf0..b5a835ca1faf 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -689,145 +689,6 @@ vdev_draid_group_degraded(vdev_t *vd, vdev_t *oldvd, return (degraded); } -boolean_t -vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) -{ - int i; - uint_t c; - uint8_t *perm = NULL; - uint64_t n, d, p, s, b; - - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_CHILDREN); - return (B_FALSE); - } - - if (n - 1 > VDEV_DRAID_U8_MAX) { - draid_dbg(0, "%s configuration too large: "U64FMT"\n", - ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); - return (B_FALSE); - } - if (vd != NULL && n != vd->vdev_children) - return (B_FALSE); - - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_PARITY); - return (B_FALSE); - } - - if (vd != NULL && p != vd->vdev_nparity) - return (B_FALSE); - - if 
(nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_DATA); - return (B_FALSE); - } - - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_SPARE); - return (B_FALSE); - } - - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_BASE); - return (B_FALSE); - } - - if (n == 0 || d == 0 || p == 0 || s == 0 || b == 0) { - draid_dbg(0, "Zero n/d/p/s/b\n"); - return (B_FALSE); - } - - if (p > VDEV_RAIDZ_MAXPARITY) { - draid_dbg(0, "Invalid parity "U64FMT"\n", p); - return (B_FALSE); - } - - if ((n - s) % (p + d) != 0) { - draid_dbg(0, U64FMT" mod "U64FMT" is not 0\n", n - s, p + d); - return (B_FALSE); - } - - if (nvlist_lookup_uint8_array(config, - ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { - draid_dbg(0, "Missing %s in configuration\n", - ZPOOL_CONFIG_DRAIDCFG_PERM); - return (B_FALSE); - } - - if (c != b * n) { - draid_dbg(0, - "Permutation array has %u items, but "U64FMT" expected\n", - c, b * n); - return (B_FALSE); - } - - for (i = 0; i < b; i++) { - int j, k; - for (j = 0; j < n; j++) { - uint64_t val = perm[i * n + j]; - - if (val >= n) { - draid_dbg(0, - "Invalid value "U64FMT" in " - "permutation %d\n", val, i); - return (B_FALSE); - } - - for (k = 0; k < j; k++) { - if (val == perm[i * n + k]) { - draid_dbg(0, - "Duplicated value "U64FMT" in " - "permutation %d\n", - val, i); - return (B_FALSE); - } - } - } - } - - return (B_TRUE); -} - -boolean_t -vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) -{ - char *type; - uint64_t parity; - nvlist_t **children = NULL; - uint_t c = 0; - - if (draidcfg == NULL) - return (B_FALSE); - - type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); - if (strcmp(type, VDEV_TYPE_DRAID) != 0) - return (B_FALSE); - - parity = 
fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); - if (parity != fnvlist_lookup_uint64(draidcfg, - ZPOOL_CONFIG_DRAIDCFG_PARITY)) - return (B_FALSE); - - VERIFY0(nvlist_lookup_nvlist_array(top, - ZPOOL_CONFIG_CHILDREN, &children, &c)); - if (c != - fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) - return (B_FALSE); - - /* HH: todo: check permutation array csum */ - fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); - return (B_TRUE); -} - /* Unfortunately this requires GPL-only symbols */ #ifdef ZFS_IS_GPL_COMPATIBLE #define __DRAID_HARDENING