From f45873070b4f41088275a60ea3acfd0dbf6ff1d5 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Mon, 30 Apr 2018 21:18:23 -0600 Subject: [PATCH 1/3] dRAID implementation. Signed-off-by: Isaac Huang --- cmd/Makefile.am | 2 +- cmd/draidcfg/.gitignore | 1 + cmd/draidcfg/Makefile.am | 20 + cmd/draidcfg/draid_permutation.c | 763 ++++++++ cmd/draidcfg/draid_permutation.h | 41 + cmd/draidcfg/draidcfg.c | 343 ++++ cmd/zdb/zdb.c | 5 +- cmd/zpool/Makefile.am | 1 + cmd/zpool/zpool_main.c | 20 +- cmd/zpool/zpool_vdev.c | 90 +- configure.ac | 1 + include/libzfs.h | 6 +- include/sys/Makefile.am | 2 + include/sys/arc.h | 1 + include/sys/dmu.h | 1 + include/sys/fs/zfs.h | 16 +- include/sys/metaslab_impl.h | 1 + include/sys/nvpair.h | 1 + include/sys/spa.h | 6 - include/sys/spa_impl.h | 5 + include/sys/sysevent/eventdefs.h | 2 + include/sys/vdev.h | 1 + include/sys/vdev_draid_impl.h | 108 ++ include/sys/vdev_impl.h | 29 + include/sys/vdev_raidz_impl.h | 7 + include/sys/vdev_scan.h | 78 + include/zfeature_common.h | 1 + lib/libzfs/Makefile.am | 1 + lib/libzfs/libzfs_import.c | 72 +- lib/libzfs/libzfs_pool.c | 19 +- lib/libzpool/Makefile.am | 2 + man/man5/zpool-features.5 | 17 + module/nvpair/fnvpair.c | 19 +- module/zcommon/zfeature_common.c | 4 + module/zcommon/zfs_namecheck.c | 4 +- module/zfs/Makefile.in | 2 + module/zfs/arc.c | 6 + module/zfs/dsl_scan.c | 125 +- module/zfs/metaslab.c | 113 +- module/zfs/spa.c | 154 +- module/zfs/vdev.c | 70 +- module/zfs/vdev_draid.c | 1660 +++++++++++++++++ module/zfs/vdev_label.c | 34 +- module/zfs/vdev_mirror.c | 62 +- module/zfs/vdev_raidz.c | 90 +- module/zfs/vdev_raidz.h | 33 + module/zfs/vdev_removal.c | 40 +- module/zfs/vdev_scan.c | 583 ++++++ module/zfs/zio.c | 36 +- .../cli_root/zpool_get/zpool_get.cfg | 1 + 50 files changed, 4514 insertions(+), 185 deletions(-) create mode 100644 cmd/draidcfg/.gitignore create mode 100644 cmd/draidcfg/Makefile.am create mode 100644 cmd/draidcfg/draid_permutation.c create mode 100644 
cmd/draidcfg/draid_permutation.h create mode 100644 cmd/draidcfg/draidcfg.c create mode 100644 include/sys/vdev_draid_impl.h create mode 100644 include/sys/vdev_scan.h create mode 100644 module/zfs/vdev_draid.c create mode 100644 module/zfs/vdev_raidz.h create mode 100644 module/zfs/vdev_scan.c diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 9dd7b8b4f07d..0d73d0ba54ff 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,3 +1,3 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat zed -SUBDIRS += arc_summary raidz_test zgenhostid +SUBDIRS += arc_summary raidz_test zgenhostid draidcfg diff --git a/cmd/draidcfg/.gitignore b/cmd/draidcfg/.gitignore new file mode 100644 index 000000000000..ad7c307b04e3 --- /dev/null +++ b/cmd/draidcfg/.gitignore @@ -0,0 +1 @@ +/draidcfg diff --git a/cmd/draidcfg/Makefile.am b/cmd/draidcfg/Makefile.am new file mode 100644 index 000000000000..f587d271860e --- /dev/null +++ b/cmd/draidcfg/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +AM_CPPFLAGS += -DDEBUG + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib/libspl/include + +bin_PROGRAMS = draidcfg + +draidcfg_SOURCES = \ + draidcfg.c \ + draid_permutation.c \ + draid_permutation.h + +draidcfg_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ + $(top_builddir)/lib/libzfs/libzfs.la +draidcfg_LDADD += -lm diff --git a/cmd/draidcfg/draid_permutation.c b/cmd/draidcfg/draid_permutation.c new file mode 100644 index 000000000000..4753f3f31f66 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.c @@ -0,0 +1,763 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +#define MAX_GROUPSIZE 32 +#define MAX_GROUPS 128 +#define MAX_SPARES 100 +#define MAX_DEVS (MAX_GROUPSIZE * MAX_GROUPS + MAX_SPARES) +#define MAX_ROWS 16384 + +#define UNOPT 0 +#define EVAL_WORST 1 +#define EVAL_MEAN 2 +#define EVAL_RMS 3 + +static int verbose = 0; + +typedef struct +{ + int groupsz; + int ngroups; + int nspares; + int ndevs; + int nrows; + /* each row maps all drives, groups from 0, spares down from ndevs-1 */ + int **rows; + int nbroken; /* # broken drives */ + int *broken; /* which drives are broken */ +} map_t; + +typedef struct +{ + int value; + int order; +} pair_t; + +static void +permute_devs(int *in, int *out, int ndevs) +{ + pair_t tmp[MAX_DEVS]; + int i; + int j; + + if (ndevs == 2) { /* swap */ + i = in[0]; + j = in[1]; + out[0] = j; + out[1] = i; + return; + } + + for (i = 0; i < ndevs; i++) { /* assign random order */ + tmp[i].value = in[i]; + tmp[i].order = mrand48(); + } + + for (i = 1; i < ndevs; i++) /* sort */ + for (j = 0; j < i; j++) + if (tmp[i].order < tmp[j].order) { + pair_t t = tmp[i]; + tmp[i] = tmp[j]; + tmp[j] = t; + } + + for (i = 0; i < ndevs; i++) + out[i] = tmp[i].value; +} + +static void +print_map(map_t *map) +{ + int i; + int j; + + for (i = 0; i < 
map->nrows; i++) { + for (j = 0; j < map->ndevs; j++) { + if (j == map->ndevs - map->nspares) + printf("S "); + + printf("%2d ", map->rows[i][j]); + } + printf("\n"); + } +} + +static void +check_map(map_t *map) +{ + int i; + int j; + int nrows = map->nrows; + int ndevs = map->ndevs; + int **rows = map->rows; + int devcounts[MAX_DEVS]; + int brokencounts[MAX_DEVS]; + + ASSERT(map->groupsz <= MAX_GROUPSIZE); + ASSERT(map->ngroups <= MAX_GROUPS); + ASSERT(map->nspares <= MAX_SPARES); + ASSERT(map->ndevs == map->nspares + map->ngroups * map->groupsz); + ASSERT(map->nrows <= MAX_ROWS); + ASSERT(map->nbroken <= MAX_SPARES); + + /* Ensure each dev appears once in every row */ + memset(devcounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < nrows; i++) { + int *row = rows[i]; + + for (j = 0; j < ndevs; j++) { + int dev = row[j]; + + ASSERT(0 <= dev && dev < ndevs); + ASSERT(devcounts[dev] == i); + devcounts[dev] = i+1; + } + } + + /* Ensure broken drives only appear once */ + memset(brokencounts, 0, sizeof (int) * map->ndevs); + + for (i = 0; i < map->nbroken; i++) { + int dev = map->broken[i]; + + ASSERT(0 <= dev && dev < map->ndevs); /* valid drive */ + ASSERT(brokencounts[i] == 0); /* not used already */ + brokencounts[i] = 1; + } +} + +static map_t * +dup_map(map_t *oldmap) +{ + int groupsz = oldmap->groupsz; + int ngroups = oldmap->ngroups; + int nspares = oldmap->nspares; + int ndevs = oldmap->ndevs; + int nrows = oldmap->nrows; + map_t *map = malloc(sizeof (map_t)); + int i; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + memcpy(map->rows[i], oldmap->rows[i], sizeof (int) * ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; 
+ + check_map(map); + return (map); +} + +static map_t * +new_map(int groupsz, int ngroups, int nspares, int nrows) +{ + map_t *map = malloc(sizeof (map_t)); + int ndevs = nspares + ngroups * groupsz; + int i; + int j; + + ASSERT(nrows <= MAX_ROWS); + ASSERT(ndevs <= MAX_DEVS); + + map->groupsz = groupsz; + map->ngroups = ngroups; + map->nspares = nspares; + map->ndevs = ndevs; + map->nrows = nrows; + map->rows = malloc(sizeof (int *) * nrows); + + for (i = 0; i < nrows; i++) { + map->rows[i] = malloc(sizeof (int) * ndevs); + + if (i == 0) + for (j = 0; j < ndevs; j++) + map->rows[i][j] = j; + else + permute_devs(map->rows[i-1], map->rows[i], ndevs); + } + + /* Init to no failures (nothing broken) */ + map->broken = malloc(sizeof (int) * nspares); + map->nbroken = 0; + + check_map(map); + return (map); +} + +static void +free_map(map_t *map) +{ + int i; + + free(map->broken); + for (i = 0; i < map->nrows; i++) + free(map->rows[i]); + free(map->rows); + free(map); +} + +static inline int +is_broken(map_t *map, int dev) +{ + int i; + + for (i = 0; i < map->nbroken; i++) + if (dev == map->broken[i]) + return (1); + + return (0); +} + +static int +eval_resilver(map_t *map, int print) +{ + /* Evaluate how resilvering I/O will be distributed */ + int i; + int j; + int k; + int spare; + int dev; + int ndevs = map->ndevs; + int nspares = map->nspares; + int ngroups = map->ngroups; + int groupsz = map->groupsz; + int nrows = map->nrows; + int writes[MAX_DEVS]; + int reads[MAX_DEVS]; + int max_reads = 0; + int max_writes = 0; + int max_ios = 0; + + memset(reads, 0, sizeof (int) * ndevs); + memset(writes, 0, sizeof (int) * ndevs); + + /* resilver all rows */ + for (i = 0; i < nrows; i++) { + int *row = map->rows[i]; + + /* resilver all groups with broken drives */ + for (j = 0; j < ngroups; j++) { + int fix = 0; + + /* See if any disk in this group is broken */ + for (k = 0; k < groupsz && !fix; k++) + fix = is_broken(map, row[j*groupsz + k]); + + if (!fix) + continue; + + /* 
+ * This group needs fixing + * Read all the non-broken drives and write all the + * broken drives to their hot spare for this row + */ + spare = ndevs - nspares; + for (k = 0; k < groupsz; k++) { + dev = row[j*groupsz + k]; + + if (!is_broken(map, dev)) { + reads[dev]++; + } else { + ASSERT(spare < ndevs); + + while (is_broken(map, row[spare])) { + spare++; + ASSERT(spare < ndevs); + } + writes[row[spare++]]++; + } + } + } + } + + /* find drives with most I/O */ + for (i = 0; i < ndevs; i++) { + if (reads[i] > max_reads) + max_reads = reads[i]; + if (writes[i] > max_writes) + max_writes = writes[i]; + + if (reads[i] + writes[i] > max_ios) + max_ios = reads[i] + writes[i]; + } + + if (print) { + printf("Reads: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)reads[i]*ngroups)/nrows); + printf("\n"); + printf("Writes: "); + for (i = 0; i < ndevs; i++) + printf(" %5.3f", ((double)writes[i]*ngroups)/nrows); + printf("\n"); + } + + return (max_ios); +} + +static double +eval_decluster(map_t *map, int how, int faults, int print) +{ + int f1; + int f2; + int ios; + int worst1 = -1; + int worst2 = -1; + int n = 0; + long sum = 0; + long sumsq = 0; + long max_ios = 0; + double val; + + ASSERT(eval_resilver(map, 0) == 0); /* not broken already */ + ASSERT(faults == 1 || faults == 2); + + map->nbroken = faults; + + for (f1 = 0; f1 < map->ndevs; f1++) { + map->broken[0] = f1; + + if (faults < 2) { + ios = eval_resilver(map, 0); /* eval single failure */ + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + max_ios = ios; + } + } else { /* eval double failure */ + for (f2 = f1 + 1; f2 < map->ndevs; f2++) { + map->broken[1] = f2; /* use 2nd hot spare */ + + ios = eval_resilver(map, 0); + n++; + sum += ios; + sumsq += ios*ios; + if (max_ios < ios) { + worst1 = f1; + worst2 = f2; + max_ios = ios; + } + } + } + } + map->nbroken = 0; + + if (print) { + map->nbroken = faults; + map->broken[0] = worst1; + map->broken[2] = worst2; + + 
eval_resilver(map, 1); + + map->nbroken = 0; + } + + switch (how) { + case EVAL_WORST: + /* + * imbalance from worst possible drive failure + * insensitive to failures handled better + */ + val = max_ios; + break; + case EVAL_MEAN: + /* + * average over all possible drive failures + * sensitive to all possible failures + */ + val = ((double)sum)/n; + break; + case EVAL_RMS: + /* + * root mean square over all possible drive failures + * penalizes higher imbalance more + */ + val = sqrt(((double)sumsq)/n); + break; + default: + ASSERT(0); + } + return ((val/map->nrows)*map->ngroups); +} + +static int +rand_in_range(int min, int count) +{ + return (min + drand48()*count); +} + +static void +permute_map(map_t *map, int temp) +{ + static int prev_temp; + + int nrows = (temp < 1) ? 1 : (temp > 100) ? + map->nrows : rand_in_range(1, (map->nrows * temp)/100); + int row = rand_in_range(0, map->nrows - nrows); + int ncols = map->ndevs; + int col = rand_in_range(0, map->ndevs - ncols); + int i; + + if (verbose > 0 && + temp != prev_temp && + (temp < 10 || (temp % 10 == 0))) + printf("Permute t %3d (%d-%d, %d-%d)\n", + temp, col, ncols, row, nrows); + prev_temp = temp; + + for (i = row; i < row + nrows; i++) + permute_devs(&map->rows[i][col], &map->rows[i][col], ncols); +} + +static map_t * +develop_map(map_t *map) +{ + map_t *dmap = new_map(map->groupsz, map->ngroups, + map->nspares, map->nrows * map->ndevs); + int base; + int dev; + int i; + + for (base = 0; base < map->nrows; base++) + for (dev = 0; dev < map->ndevs; dev++) + for (i = 0; i < map->ndevs; i++) + dmap->rows[base*map->ndevs + dev][i] = + (map->rows[base][i] + dev) % map->ndevs; + + return (dmap); +} + +static map_t * +optimize_map(map_t *map, int eval, int faults) +{ + double temp = 100.0; + double alpha = 0.995; + double epsilon = 0.001; + double val = eval_decluster(map, eval, faults, 0); + int ups = 0; + int downs = 0; + int sames = 0; + int iter = 0; + + while (temp > epsilon) { + map_t *map2 = 
dup_map(map); + double val2; + double delta; + + permute_map(map2, (int)temp); + + val2 = eval_decluster(map2, eval, faults, 0); + delta = (val2 - val); + + if (delta < 0 || exp(-10000*delta/temp) > drand48()) { + if (delta > 0) + ups++; + else if (delta < 0) + downs++; + else + sames++; + + free_map(map); + map = map2; + val = val2; + } else { + free_map(map2); + } + + temp *= alpha; + + if ((++iter % 100) == 0) { + if (verbose > 0) + printf("%f (%d ups, %d sames, %d downs)\n", + val, ups, sames, downs); + ups = downs = sames = 0; + } + } + + if (verbose > 0) + printf("%d iters, %d ups %d sames %d downs\n", + iter, ups, sames, downs); + return (map); +} + +static void +print_map_stats(map_t *map, int optimize, int print_ios) +{ + double score = eval_decluster(map, EVAL_WORST, 1, 0); + + printf("%6s (%2d x %2d + %2d) x %5d: %2.3f\n", + (optimize == UNOPT) ? "Unopt" : + (optimize == EVAL_WORST) ? "Worst" : + (optimize == EVAL_MEAN) ? "Mean" : "Rms", + map->ngroups, map->groupsz, map->nspares, map->nrows, score); + + if (map->ndevs < 80 && score >= 1.05) + printf("Warning score %6.3f has over 5 percent imbalance!\n", + score); + else if (score >= 1.1) + printf("Warning score %6.3f has over 10 percent imbalance!\n", + score); + +#ifdef FOOO + printf("Single: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 1, 0), + eval_decluster(map, EVAL_MEAN, 1, 0)); + + printf("Double: worst %6.3f mean %6.3f\n", + eval_decluster(map, EVAL_WORST, 2, 0), + eval_decluster(map, EVAL_MEAN, 2, 0)); +#endif + + if (print_ios) { + eval_decluster(map, EVAL_WORST, 1, 1); + eval_decluster(map, EVAL_WORST, 2, 1); + } +} + +int +draid_permutation_generate(struct vdev_draid_configuration *cfg) +{ + const int loop = 16; /* HH: make this a parameter */ + const int faults = 1; + const int eval = EVAL_WORST; + + int groupsz = cfg->dcf_data + cfg->dcf_parity; + int nspares = cfg->dcf_spare; + int ngroups = (cfg->dcf_children - nspares) / groupsz; + int nrows; + int i, fd, urand_fd; + 
long int best_seed; + map_t *best_map; + + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + if (fd == -1) { + perror("Cannot open /dev/random\n"); + return (-1); + } + urand_fd = open("/dev/urandom", O_RDONLY); + + /* HH: fine tune these heuristics */ + if (cfg->dcf_children - nspares > 80) + nrows = 128; /* 81 - ? */ + else if (cfg->dcf_children - nspares > 40) + nrows = 64; /* 41 - 80 */ + else + nrows = 32; /* 1 - 40 */ + + for (i = 0, best_map = NULL; i < loop; i++) { + int rc; + long int seed; + map_t *map, *omap; + + rc = read(fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) { + printf("Not enough entropy at /dev/random: read %d, " + "wanted %lu.\n", rc, sizeof (seed)); + /* urand_fd may not be valid but it does not matter */ + rc = read(urand_fd, &seed, sizeof (seed)); + if (rc != sizeof (seed)) + break; + printf("Using /dev/urandom instead.\n"); + } + + srand48(seed); + + map = new_map(groupsz, ngroups, nspares, nrows); + omap = optimize_map(dup_map(map), eval, faults); + if (eval_decluster(omap, eval, faults, 0) > + eval_decluster(map, eval, faults, 0)) { + /* + * optimize_map() may create a worse map, because the + * simulated annealing process may accept worse + * neighbors to avoid getting stuck in local optima + */ + free_map(omap); + } else { + free_map(map); + map = omap; + } + + if (best_map == NULL || + eval_decluster(map, eval, faults, 0) < + eval_decluster(best_map, eval, faults, 0)) { + if (best_map != NULL) + free_map(best_map); + best_map = map; + best_seed = seed; + } else { + free_map(map); + } + } + + close(fd); + close(urand_fd); + if (i != loop) + fprintf(stderr, "Early termination at loop %d. 
Generated " + "permutations may not be optimal!\n", i + 1); + + if (best_map != NULL) { + int j; + map_t *dmap; + uint64_t *perms; + + assert(best_map->nrows == nrows); + assert(best_map->ndevs == cfg->dcf_children); + + perms = malloc(sizeof (*perms) * nrows * best_map->ndevs); + assert(perms != NULL); + + for (i = 0; i < nrows; i++) + for (j = 0; j < best_map->ndevs; j++) + perms[i * best_map->ndevs + j] = + best_map->rows[i][j]; + + cfg->dcf_bases = nrows; + cfg->dcf_base_perms = perms; + + if (verbose > 1) + print_map(best_map); + dmap = develop_map(best_map); + free_map(best_map); + print_map_stats(dmap, eval, 0); + printf("Seed chosen: %lx\n", best_seed); + free_map(dmap); + return (0); + } else { + return (-1); + } +} + +int +debug_main(int argc, char **argv) +{ + int ngroups = 0; + int groupsz = 0; + int nspares = 0; + int nrows = 0; + int optimize = UNOPT; + int faults = 1; + int develop = 0; + map_t *map; + int c; + + while ((c = getopt(argc, argv, "g:d:s:n:vUWMR12D")) != -1) + switch (c) { + case 'D': + develop = 1; + break; + case 'g': + sscanf(optarg, "%d", &ngroups); + break; + case 'd': + sscanf(optarg, "%d", &groupsz); + break; + case 's': + sscanf(optarg, "%d", &nspares); + break; + case 'n': + sscanf(optarg, "%d", &nrows); + break; + case 'v': + verbose++; + break; + case 'U': + optimize = UNOPT; + break; + case 'W': + optimize = EVAL_WORST; + break; + case 'M': + optimize = EVAL_MEAN; + break; + case 'R': + optimize = EVAL_RMS; + break; + case '1': + faults = 1; + break; + case '2': + faults = 2; + break; + default: + fprintf(stderr, "arg???\n"); + return (1); + } + + if (ngroups <= 0 || groupsz <= 0 || nspares <= 0 || nrows <= 0) { + fprintf(stderr, "missing arg???\n"); + return (1); + } + + map = new_map(groupsz, ngroups, nspares, nrows); + if (verbose > 1) + print_map(map); + + if (verbose > 0) + print_map_stats(map, UNOPT, 1); + + if (optimize != UNOPT) { + map = optimize_map(map, optimize, faults); + + if (verbose > 1) + print_map(map); + if 
(verbose > 0) + print_map_stats(map, optimize, 1); + } + + if (develop) { + map_t *dmap = develop_map(map); + + free_map(map); + map = dmap; + } + + print_map_stats(map, optimize, verbose > 0); + return (0); +} diff --git a/cmd/draidcfg/draid_permutation.h b/cmd/draidcfg/draid_permutation.h new file mode 100644 index 000000000000..8562ccf09852 --- /dev/null +++ b/cmd/draidcfg/draid_permutation.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#ifndef _DRAID_PERMUTATION_H +#define _DRAID_PERMUTATION_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int draid_permutation_generate(struct vdev_draid_configuration *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DRAID_PERMUTATION_H */ diff --git a/cmd/draidcfg/draidcfg.c b/cmd/draidcfg/draidcfg.c new file mode 100644 index 000000000000..90e40a61a2e1 --- /dev/null +++ b/cmd/draidcfg/draidcfg.c @@ -0,0 +1,343 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016 Intel Corporation. + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include "draid_permutation.h" + + +static struct vdev_draid_configuration * +draidcfg_find(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + /* P D D... P D D... S */ + static const uint64_t bases7[1][7] = {{1, 2, 4, 3, 6, 5, 0}}; + static const uint64_t bases11[1][11] = {{ + 1, 4, 5, 9, 3, 2, 8, 10, 7, 6, 0}}; + static const uint64_t bases19[1][19] = {{ + 1, 5, 6, 11, 17, 9, 7, 16, 4, 10, 12, 3, 15, 18, 14, 13, 8, 2, 0}}; + static const uint64_t bases23[1][23] = {{ + 1, 8, 18, 6, 2, 16, 13, 12, 4, 9, 3, 10, 11, 19, 14, 20, 22, + 15, 5, 17, 21, 7, 0}}; + static const uint64_t bases31[1][31] = {{ + 1, 8, 2, 16, 4, 17, 12, 3, 24, 6, 10, 18, 20, 5, 9, 15, 27, 30, 23, + 29, 7, 25, 14, 19, 28, 26, 22, 21, 13, 11, 0}}; + static const uint64_t bases41[1][41] = {{ + 1, 25, 10, 4, 18, 40, 16, 31, 37, 23, 6, 27, 19, + 24, 26, 35, 14, 22, 17, 15, 36, 39, 32, 21, 33, + 5, 2, 9, 20, 8, 11, 29, 28, 3, 34, 30, 12, 13, 38, 7, 0}}; + + static struct vdev_draid_configuration known_cfgs[6] = { + { + .dcf_data = 2, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 7, + .dcf_bases = 1, .dcf_base_perms = &bases7[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 11, + .dcf_bases = 
1, .dcf_base_perms = &bases11[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 19, + .dcf_bases = 1, .dcf_base_perms = &bases19[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 3, .dcf_spare = 1, .dcf_children = 23, + .dcf_bases = 1, .dcf_base_perms = &bases23[0][0] + }, + { + .dcf_data = 4, .dcf_parity = 1, .dcf_spare = 1, .dcf_children = 31, + .dcf_bases = 1, .dcf_base_perms = &bases31[0][0] + }, + { + .dcf_data = 8, .dcf_parity = 2, .dcf_spare = 1, .dcf_children = 41, + .dcf_bases = 1, .dcf_base_perms = &bases41[0][0] + }, + }; + + int i; + + for (i = 0; i < sizeof (known_cfgs) / sizeof (known_cfgs[0]); i++) { + struct vdev_draid_configuration *cfg = &known_cfgs[i]; + + if (data == cfg->dcf_data && parity == cfg->dcf_parity && + spare == cfg->dcf_spare && children == cfg->dcf_children) + return (cfg); + } + + return (NULL); +} + +static struct vdev_draid_configuration * +draidcfg_create(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children) +{ + struct vdev_draid_configuration *cfg = calloc(1, sizeof (*cfg)); + + assert(cfg != NULL); + cfg->dcf_data = data; + cfg->dcf_parity = parity; + cfg->dcf_spare = spare; + cfg->dcf_children = children; + + cfg->dcf_bases = 0; + cfg->dcf_base_perms = NULL; + if (draid_permutation_generate(cfg) != 0) { + free(cfg); + return (NULL); + } + + assert(cfg->dcf_bases != 0); + assert(cfg->dcf_base_perms != NULL); + return (cfg); +} + +static inline void +draidcfg_free(struct vdev_draid_configuration *cfg) +{ + free((void *)cfg->dcf_base_perms); + free(cfg); +} + +static int +draidcfg_create_file(const uint64_t data, const uint64_t parity, + const uint64_t spare, const uint64_t children, const char *path) +{ + FILE *fp; + size_t len; + int ret = 0; + void *packed; + nvlist_t *nvl; + boolean_t freecfg = B_FALSE; + struct vdev_draid_configuration *cfg; + + ASSERT(children != 0); + ASSERT3U(children, <=, VDEV_DRAID_MAX_CHILDREN); + + if (children - 1 > 
VDEV_DRAID_U8_MAX) { + fprintf(stderr, "Configuration for over %u children " + "is not supported\n", VDEV_DRAID_U8_MAX + 1); + return (1); + } + + cfg = draidcfg_find(data, parity, spare, children); + if (cfg == NULL) { + cfg = draidcfg_create(data, parity, spare, children); + if (cfg == NULL) { + fprintf(stderr, "Cannot create" + "supported configuration\n"); + return (1); + } + freecfg = B_TRUE; + } + + fp = fopen(path, "w+"); + if (fp == NULL) { + fprintf(stderr, "Cannot open file %s for write\n", path); + if (freecfg) + draidcfg_free(cfg); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA, data); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_PARITY, parity); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_SPARE, spare); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_CHILDREN, children); + fnvlist_add_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE, cfg->dcf_bases); + + if (children - 1 <= VDEV_DRAID_U8_MAX) { + int i, j; + uint8_t *val = calloc(children * cfg->dcf_bases, sizeof (*val)); + + for (i = 0; i < cfg->dcf_bases; i++) { + for (j = 0; j < children; j++) { + uint64_t c = + cfg->dcf_base_perms[i * children + j]; + + ASSERT3U(c, <, children); + ASSERT3U(c, <=, VDEV_DRAID_U8_MAX); + val[i * children + j] = (uint8_t)c; + } + } + + fnvlist_add_uint8_array(nvl, ZPOOL_CONFIG_DRAIDCFG_PERM, + val, children * cfg->dcf_bases); + free(val); + } else { + ASSERT3U(children, ==, 0); /* not supported yet */ + } + + assert(vdev_draid_config_validate(NULL, nvl)); + + packed = fnvlist_pack_xdr(nvl, &len); + if (fwrite(packed, 1, len, fp) != len) { + ret = 1; + fprintf(stderr, "Cannot write %lu bytes to %s\n", len, path); + } + + fnvlist_pack_free(packed, len); + fnvlist_free(nvl); + if (freecfg) + draidcfg_free(cfg); + fclose(fp); + return (ret); +} + +static void +draidcfg_print(nvlist_t *config) +{ + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b, i; + + n = fnvlist_lookup_uint64(config, 
ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + d = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA); + p = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_PARITY); + s = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_SPARE); + b = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE); + + printf("dRAID%lu vdev of %lu child drives: %lu x (%lu data + " + "%lu parity) and %lu distributed spare\n", + p, n, (n - s) / (d + p), d, p, s); + printf("Using %lu base permutation%s\n", b, b > 1 ? "s" : ""); + + VERIFY0(nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c)); + ASSERT3U(c, ==, b * n); + + for (i = 0; i < b; i++) { + int j; + + printf(" "); + for (j = 0; j < n; j++) + printf("%*u,", n > 99 ? 3 : 2, perm[i * n + j]); + printf("\n"); + } +} + +static inline int usage(void) +{ + printf(gettext("draidcfg [-r] [-n children] [-d data] [-p parity]" + " [-s spare] \n")); + return (1); +} + +int +main(int argc, char **argv) +{ + boolean_t read = B_FALSE; + char *cfg = NULL; + uint64_t data = 0, parity = 0, spare = 0, children = 0; + int c; + + while ((c = getopt(argc, argv, "rn:d:p:s:")) != -1) { + char *endptr; + uint64_t *p = NULL; + + switch (c) { + case 'r': + read = B_TRUE; + break; + case 'n': + p = &children; + case 'd': + if (p == NULL) + p = &data; + case 'p': + if (p == NULL) + p = &parity; + case 's': + if (p == NULL) + p = &spare; + + errno = 0; + *p = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + fprintf(stderr, + gettext("Invalid -%c value: %s\n"), + c, optarg); + return (usage()); + } + break; + case ':': + fprintf(stderr, gettext("Missing argument for " + "'%c' option\n"), optopt); + return (usage()); + case '?': + fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + return (usage()); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + fprintf(stderr, + gettext("Missing configuration file argument\n")); + return (usage()); + } + + cfg = argv[0]; + + if (read) { + nvlist_t *nvl 
= draidcfg_read_file(cfg); + + if (nvl == NULL) { + return (1); + } else { + draidcfg_print(nvl); + nvlist_free(nvl); + return (0); + } + } + + assert(!read); + + if (data == 0 || parity == 0 || spare == 0 || children == 0) { + fprintf(stderr, + gettext("Missing data/parity/spare/children argument\n")); + return (usage()); + } + + if (parity > VDEV_RAIDZ_MAXPARITY) { + fprintf(stderr, gettext("Invalid parity %lu\n"), parity); + return (usage()); + } + + if (children % (data + parity) != spare) { + fprintf(stderr, gettext("Invalid draid configration\n")); + return (usage()); + } + + return (draidcfg_create_file(data, parity, spare, children, cfg)); +} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 852f853559c7..963dca43907f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -871,7 +871,10 @@ dump_metaslab(metaslab_t *msp) } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 8e98fddd6772..8555a3efd9bf 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -16,6 +16,7 @@ zpool_SOURCES = \ zpool_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs/libzfs.la zpool_LDADD += -lm $(LIBBLKID) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 453fb2131ffb..aafde8f37a67 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -6080,7 +6081,8 @@ print_scan_status(pool_scan_stat_t *ps) zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || - ps->pss_func == 
POOL_SCAN_RESILVER); + ps->pss_func == POOL_SCAN_RESILVER || + ps->pss_func == POOL_SCAN_REBUILD); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { @@ -6104,6 +6106,13 @@ print_scan_status(pool_scan_stat_t *ps) (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, (u_longlong_t)ps->pss_errors, ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuilt %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { @@ -6113,6 +6122,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild canceled on %s"), + ctime(&end)); } return; } @@ -6133,6 +6145,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("rebuild in progress since %s"), + ctime(&start)); } scanned = ps->pss_examined; @@ -6182,6 +6197,9 @@ print_scan_status(pool_scan_stat_t *ps) } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_REBUILD) { + (void) printf(gettext("\t%s rebuilt, %.2f%% done\n"), + processed_buf, 100 * fraction_done); } if (pause == 0) { diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 620dbc912287..b7738aa242e9 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -84,6 +84,7 @@ #include #include "zpool_util.h" #include +#include /* * For 
any given vdev specification, we can have multiple errors. The @@ -591,6 +592,7 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * $draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) @@ -633,6 +635,11 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (arg[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + ashift = 12; + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -661,17 +668,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -829,7 +838,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -1420,7 +1430,8 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0 || + strncmp(type, VDEV_TYPE_DRAID, 5) == 0) { const char *p = type + 5; char *end; long nparity; @@ -1440,8 +1451,12 @@ is_grouping(const char *type, int *mindev, int *maxdev) if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) - *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + *maxdev = VDEV_DRAID_MAX_CHILDREN; + + if (strncmp(type, VDEV_TYPE_RAIDZ, 5) == 0) + return (VDEV_TYPE_RAIDZ); + else + return (VDEV_TYPE_DRAID); } if (maxdev != NULL) @@ -1510,6 +1525,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; + nvlist_t *draidcfg = NULL; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { @@ -1566,6 +1582,34 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + + if (strcmp(type, VDEV_TYPE_DRAID) == 0 && + strncmp(argv[c], "cfg=", 4) == 0) { + if (draidcfg == NULL) { + draidcfg = + draidcfg_read_file(argv[c] + + 4); + if (draidcfg != NULL) + continue; + fprintf(stderr, + gettext("invalid draid " + 
"configuration '%s'\n"), + argv[c]); + } else { + fprintf(stderr, + gettext("dRAID config " + "specified more than " + "once: %s\n"), argv[c]); + } + + for (c = 0; c < children - 1; c++) + nvlist_free(child[c]); + free(child); + if (draidcfg != NULL) + nvlist_free(draidcfg); + return (NULL); + } + children++; child = realloc(child, children * sizeof (nvlist_t *)); @@ -1620,7 +1664,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); @@ -1632,6 +1677,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); + + if (draidcfg != NULL) { + ASSERT0(strcmp(type, VDEV_TYPE_DRAID)); + + if (!vdev_draid_config_add(nv, + draidcfg)) + fprintf(stderr, + gettext("ignoring invalid " + "draid config\n")); + + nvlist_free(draidcfg); + draidcfg = NULL; + } } } else { /* diff --git a/configure.ac b/configure.ac index 6dc313b4d7bc..2ad5c6449f01 100644 --- a/configure.ac +++ b/configure.ac @@ -116,6 +116,7 @@ AC_CONFIG_FILES([ cmd/arc_summary/Makefile cmd/zed/Makefile cmd/raidz_test/Makefile + cmd/draidcfg/Makefile cmd/zgenhostid/Makefile contrib/Makefile contrib/bash_completion.d/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index cbaaa13a2154..cb4982feacf2 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -420,7 +420,6 @@ extern int zpool_tryimport(libzfs_handle_t *hdl, char *target, nvlist_t **configp, importargs_t *args); /* legacy pool search routines */ -extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); @@ -881,6 +880,11 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); extern 
int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * dRAID import support + */ +nvlist_t *draidcfg_read_file(const char *); + /* * Support for Linux libudev derived persistent device strings */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index e73be52f3ec8..06911c977899 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -87,11 +87,13 @@ COMMON_H = \ $(top_srcdir)/include/sys/vdev_disk.h \ $(top_srcdir)/include/sys/vdev_file.h \ $(top_srcdir)/include/sys/vdev.h \ + $(top_srcdir)/include/sys/vdev_scan.h \ $(top_srcdir)/include/sys/vdev_impl.h \ $(top_srcdir)/include/sys/vdev_indirect_births.h \ $(top_srcdir)/include/sys/vdev_indirect_mapping.h \ $(top_srcdir)/include/sys/vdev_raidz.h \ $(top_srcdir)/include/sys/vdev_raidz_impl.h \ + $(top_srcdir)/include/sys/vdev_draid_impl.h \ $(top_srcdir)/include/sys/vdev_removal.h \ $(top_srcdir)/include/sys/xvattr.h \ $(top_srcdir)/include/sys/zap.h \ diff --git a/include/sys/arc.h b/include/sys/arc.h index 9d6bab505a2f..411681879114 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -292,6 +292,7 @@ void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); uint64_t arc_target_bytes(void); +uint64_t arc_max_bytes(void); void arc_init(void); void arc_fini(void); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 119584365a2c..7df36985bee5 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -366,6 +366,7 @@ typedef struct dmu_buf { #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" +#define DMU_POOL_REBUILDING "com.intel:rebuilding" /* * Allocate an object from this objset. 
The range of object numbers diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index de3b729ebe04..d0155154c438 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -704,6 +704,15 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" + +#define ZPOOL_CONFIG_DRAIDCFG "com.intel:draid_config" +#define ZPOOL_CONFIG_DRAIDCFG_DATA "com.intel:draid_data" +#define ZPOOL_CONFIG_DRAIDCFG_PARITY "com.intel:draid_parity" +#define ZPOOL_CONFIG_DRAIDCFG_SPARE "com.intel:draid_spare" +#define ZPOOL_CONFIG_DRAIDCFG_BASE "com.intel:draid_base" +#define ZPOOL_CONFIG_DRAIDCFG_CHILDREN "com.intel:draid_children" +#define ZPOOL_CONFIG_DRAIDCFG_PERM "com.intel:draid_perm" + #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ @@ -737,6 +746,8 @@ typedef struct zpool_rewind_policy { #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -847,6 +858,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_REBUILD, /* sequential SPA scan */ POOL_SCAN_FUNCS } pool_scan_func_t; @@ -1258,7 +1270,9 @@ typedef enum { * given payloads: * * ESC_ZFS_RESILVER_START - * ESC_ZFS_RESILVER_END + * ESC_ZFS_RESILVER_FINISH + * ESC_ZFS_REBUILD_START + * ESC_ZFS_REBUILD_FINISH * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 76f670a4d43f..a275cb31a2a5 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h 
@@ -338,6 +338,7 @@ struct metaslab { */ boolean_t ms_loaded; boolean_t ms_loading; + boolean_t ms_rebuilding; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index d2dfad5ca2b3..9c358c0af920 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -281,6 +281,7 @@ nvlist_t *fnvlist_alloc(void); void fnvlist_free(nvlist_t *); size_t fnvlist_size(nvlist_t *); char *fnvlist_pack(nvlist_t *, size_t *); +char *fnvlist_pack_xdr(nvlist_t *, size_t *); void fnvlist_pack_free(char *, size_t); nvlist_t *fnvlist_unpack(char *, size_t); nvlist_t *fnvlist_dup(nvlist_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index 89f4f5107f13..e7d2d9409cb8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -767,12 +767,6 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 -/* - * Controls the behavior of spa_vdev_remove(). - */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 - /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 90d92926888b..fbdcc35ac2dc 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -184,6 +184,8 @@ typedef enum spa_all_vdev_zap_action { AVZ_ACTION_INITIALIZE } spa_avz_action_t; +typedef struct spa_vdev_scan spa_vdev_scan_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -271,6 +273,9 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ + + spa_vdev_scan_t *spa_vdev_scan; + char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index aa13bd5052c7..4678160f8ee5 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -95,6 +95,8 @@ extern "C" { */ #define ESC_ZFS_RESILVER_START "resilver_start" #define ESC_ZFS_RESILVER_FINISH "resilver_finish" +#define ESC_ZFS_REBUILD_START "rebuild_start" +#define ESC_ZFS_REBUILD_FINISH "rebuild_finish" #define ESC_ZFS_VDEV_REMOVE "vdev_remove" #define ESC_ZFS_VDEV_REMOVE_AUX "vdev_remove_aux" #define ESC_ZFS_VDEV_REMOVE_DEV "vdev_remove_dev" diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 511d4d0b6308..5f5e85f037dd 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -113,6 +113,7 @@ extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); +extern boolean_t vdev_is_dead_at(vdev_t *vd, uint64_t offset); extern boolean_t vdev_readable(vdev_t *vd); extern boolean_t vdev_writeable(vdev_t *vd); extern boolean_t vdev_allocatable(vdev_t *vd); diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h new file mode 100644 index 000000000000..58d00d9f8b34 --- /dev/null +++ b/include/sys/vdev_draid_impl.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _VDEV_DRAID_IMPL_H +#define _VDEV_DRAID_IMPL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zio zio_t; +typedef struct vdev vdev_t; +typedef struct raidz_map raidz_map_t; + +struct vdev_draid_configuration { + uint64_t dcf_data; + uint64_t dcf_parity; + uint64_t dcf_spare; + uint64_t dcf_children; + uint64_t dcf_bases; + abd_t *dcf_zero_abd; + const uint64_t *dcf_base_perms; +}; + +extern boolean_t vdev_draid_ms_mirrored(const vdev_t *, uint64_t); +extern boolean_t vdev_draid_group_degraded(vdev_t *, vdev_t *, + uint64_t, uint64_t, boolean_t); +extern uint64_t vdev_draid_check_block(const vdev_t *vd, uint64_t, uint64_t); +extern uint64_t vdev_draid_get_astart(const vdev_t *, const uint64_t); +extern uint64_t vdev_draid_offset2group(const vdev_t *, uint64_t, boolean_t); +extern uint64_t vdev_draid_group2offset(const vdev_t *, uint64_t, boolean_t); +extern boolean_t vdev_draid_is_remainder_group(const vdev_t *, + uint64_t, boolean_t); +extern uint64_t vdev_draid_get_groupsz(const vdev_t *, boolean_t); +extern boolean_t vdev_draid_config_validate(const vdev_t *, nvlist_t *); +extern boolean_t vdev_draid_config_add(nvlist_t *, nvlist_t *); +extern void vdev_draid_fix_skip_sectors(zio_t *); +extern int vdev_draid_hide_skip_sectors(raidz_map_t *); +extern void vdev_draid_restore_skip_sectors(raidz_map_t *, int); +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_is_dead(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, 
uint64_t, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern nvlist_t *vdev_draid_spare_read_config(vdev_t *); +extern uint64_t vdev_draid_asize2psize(vdev_t *, uint64_t, uint64_t); +extern uint64_t vdev_draid_max_rebuildable_asize(vdev_t *, uint64_t); + +#define VDEV_DRAID_MAX_CHILDREN 255 +#define VDEV_DRAID_U8_MAX ((uint8_t)-1) + +#define VDEV_DRAID_SPARE_PATH_FMT "$"VDEV_TYPE_DRAID"%lu-%lu-s%lu" + +#ifdef _KERNEL +#define U64FMT "%llu" +#ifdef ZFS_IS_GPL_COMPATIBLE +#define draid_print(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +#else +#define draid_print(fmt, ...) printk(fmt, ##__VA_ARGS__) +#endif +#define draid_console(fmt, ...) printk(KERN_EMERG fmt, ##__VA_ARGS__) +#else /* _KERNEL */ +#define U64FMT "%lu" +#define draid_print(fmt, ...) printf(fmt, ##__VA_ARGS__) +#define draid_console(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) +#endif + +extern int draid_debug_lvl; +extern void vdev_draid_debug_zio(zio_t *, boolean_t); + +#define draid_dbg(lvl, fmt, ...) \ + do { \ + if ((lvl) == 0) \ + draid_console(fmt, ##__VA_ARGS__); \ + else if (draid_debug_lvl >= (lvl)) \ + draid_print(fmt, ##__VA_ARGS__); \ + } while (0); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VDEV_DRAID_IMPL_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b933f9ab8d6a..a2ee1143830c 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -235,6 +235,8 @@ struct vdev { boolean_t vdev_ishole; /* is a hole in the namespace */ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */ uint64_t vdev_top_zap; + uint64_t vdev_last_io; /* lbolt of last non-scan I/O */ + nvlist_t *vdev_cfg; /* additional dRAID configuration */ /* * Values stored in the config for an indirect or removing vdev. 
@@ -453,6 +455,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -460,6 +464,31 @@ extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; +/* + * Virtual device vector for mirroring. + */ +typedef struct mirror_child { + vdev_t *mc_vd; + uint64_t mc_offset; + int mc_error; + int mc_load; + uint8_t mc_tried; + uint8_t mc_skipped; + uint8_t mc_speculative; +} mirror_child_t; + +typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; + int mm_children; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; +} mirror_map_t; + +extern mirror_map_t *vdev_mirror_map_alloc(int, boolean_t, boolean_t); +extern const zio_vsd_ops_t vdev_mirror_vsd_ops; + /* * Common size functions */ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 0799ed19dfc8..32ae63471a77 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -106,6 +107,7 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + abd_t *rc_abd_skip; /* Skip sector */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? 
*/ @@ -123,13 +125,18 @@ typedef struct raidz_map { uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_skip; /* dRAID skip sectors */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + vdev_t *rm_vdev; /* RAIDz/dRAID vdev */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; +#define vdev_raidz_map_declustered(rm) ((rm)->rm_vdev != NULL && \ + (rm)->rm_vdev->vdev_ops == &vdev_draid_ops) + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; diff --git a/include/sys/vdev_scan.h b/include/sys/vdev_scan.h new file mode 100644 index 000000000000..151fd7ca4615 --- /dev/null +++ b/include/sys/vdev_scan.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. 
+ */ + +#ifndef _SYS_VDEV_SCAN_H +#define _SYS_VDEV_SCAN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_rebuilding_phys { + uint64_t sr_vdev; + uint64_t sr_oldvd; + int64_t sr_ms; +} spa_rebuilding_phys_t; + +typedef struct spa_vdev_scan { + dsl_pool_t *svs_dp; + vdev_t *svs_vd; + kthread_t *svs_thread; + kmutex_t svs_lock; + kcondvar_t svs_cv; + boolean_t svs_thread_exit; + uint64_t svs_dtl_max; + int svs_msi; + int svs_msi_synced; + int *svs_ms_done; + + kmutex_t svs_io_lock; + kcondvar_t svs_io_cv; + uint64_t svs_io_asize; + + spa_rebuilding_phys_t svs_phys; +} spa_vdev_scan_t; + +extern void spa_vdev_scan_setup_sync(dmu_tx_t *); +extern void spa_vdev_scan_start(spa_t *, vdev_t *, int, uint64_t); +extern int spa_vdev_scan_restart(vdev_t *); +extern int spa_vdev_scan_rebuild_cb(dsl_pool_t *, + const blkptr_t *, const zbookmark_phys_t *); +extern void spa_vdev_scan_suspend(spa_t *); +extern void spa_vdev_scan_destroy(spa_t *); +extern void spa_vdev_scan_sync_state(spa_vdev_scan_t *, dmu_tx_t *); + +#define DSL_SCAN_IS_REBUILD(scn) ((scn)->scn_phys.scn_func == POOL_SCAN_REBUILD) + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_SCAN_H */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 13670c8e5123..aef8e0c0013b 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -61,6 +61,7 @@ typedef enum spa_feature { SPA_FEATURE_PROJECT_QUOTA, SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, + SPA_FEATURE_DRAID, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index da40c96ce77b..0709c5c5298e 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -60,6 +60,7 @@ libzfs_la_LIBADD = \ $(top_builddir)/lib/libshare/libshare.la \ $(top_builddir)/lib/libtpool/libtpool.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzpool/libzpool.la \ $(top_builddir)/lib/libzfs_core/libzfs_core.la 
libzfs_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) $(LIBSSL) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index cc9a52a3eabb..1bd3119fb133 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include "libzfs.h" #include "libzfs_impl.h" @@ -875,7 +876,7 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) /* * Determine if the vdev id is a hole in the namespace. */ -boolean_t +static boolean_t vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) { int c; @@ -889,6 +890,64 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) return (B_FALSE); } +nvlist_t * +draidcfg_read_file(const char *path) +{ + int fd; + struct stat64 sb; + char *buf; + nvlist_t *config; + + if ((fd = open(path, O_RDONLY)) < 0) { + (void) fprintf(stderr, "Cannot open '%s'\n", path); + return (NULL); + } + + if (fstat64(fd, &sb) != 0) { + (void) fprintf(stderr, "Failed to stat '%s'\n", path); + close(fd); + return (NULL); + } + + if (!S_ISREG(sb.st_mode)) { + (void) fprintf(stderr, "Not a regular file '%s'\n", path); + close(fd); + return (NULL); + } + + if ((buf = malloc(sb.st_size)) == NULL) { + (void) fprintf(stderr, "Failed to allocate %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + return (NULL); + } + + if (read(fd, buf, sb.st_size) != sb.st_size) { + (void) fprintf(stderr, "Failed to read %llu bytes\n", + (u_longlong_t)sb.st_size); + close(fd); + free(buf); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, sb.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "Failed to unpack nvlist\n"); + free(buf); + return (NULL); + } + + free(buf); + + if (!vdev_draid_config_validate(NULL, config)) { + nvlist_free(config); + return (NULL); + } + + return (config); +} + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. 
Once that's done, @@ -2105,17 +2164,6 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) return (ret); } -nvlist_t * -zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) -{ - importargs_t iarg = { 0 }; - - iarg.paths = argc; - iarg.path = argv; - - return (zpool_find_import_impl(hdl, &iarg)); -} - /* * Given a cache file, return the contents as a list of importable pools. * poolname or guid (but not both) are provided by the caller when trying diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 2d94cd320497..baa10a9b0458 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "zfs_namecheck.h" @@ -958,6 +959,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -2263,6 +2265,7 @@ zpool_vdev_is_interior(const char *name) strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || + strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); @@ -2853,6 +2856,10 @@ zpool_vdev_attach(zpool_handle_t *zhp, if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); + else if (new_disk[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dspare can only replace a child " + "drive in its parent draid vdev")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3246,6 +3253,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) snprintf(msg, sizeof (msg), 
dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + if (path[0] == VDEV_DRAID_SPARE_PATH_FMT[0]) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spare cannot be removed")); + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + } + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) @@ -3780,7 +3793,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * Remove the partition from the path it this is a whole disk. */ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3790,7 +3804,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, /* * If it's a raidz device, we need to stick in the parity level. */ - if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(path, VDEV_TYPE_RAIDZ) == 0 || + strcmp(path, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index ec80ccf543f1..c7add47d98da 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -102,6 +102,7 @@ KERNEL_C = \ uberblock.c \ unique.c \ vdev.c \ + vdev_scan.c \ vdev_cache.c \ vdev_file.c \ vdev_indirect_births.c \ @@ -122,6 +123,7 @@ KERNEL_C = \ vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ vdev_removal.c \ + vdev_draid.c \ vdev_root.c \ zap.c \ zap_leaf.c \ diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index ce34a05a22fa..ce301f8f2f3b 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -304,6 +304,23 @@ required in order to support crash dumps under Linux. Existing pools where this feature is \fB\fBactive\fR can be imported. 
.RE +.sp +.ne 2 +.na +\fB\fBdraid\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.intel:draid +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables use of the \fBdraid\fR vdev driver. + +.RE + .sp .ne 2 .na diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c index a91b9524d8a0..4efb9048f745 100644 --- a/module/nvpair/fnvpair.c +++ b/module/nvpair/fnvpair.c @@ -73,15 +73,26 @@ fnvlist_size(nvlist_t *nvl) * Returns allocated buffer of size *sizep. Caller must free the buffer with * fnvlist_pack_free(). */ -char * -fnvlist_pack(nvlist_t *nvl, size_t *sizep) +static char * +fnvlist_pack_enc(nvlist_t *nvl, size_t *sizep, int encoding) { char *packed = 0; - VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, - KM_SLEEP), ==, 0); + VERIFY3U(nvlist_pack(nvl, &packed, sizep, encoding, KM_SLEEP), ==, 0); return (packed); } +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_NATIVE)); +} + +char * +fnvlist_pack_xdr(nvlist_t *nvl, size_t *sizep) +{ + return (fnvlist_pack_enc(nvl, sizep, NV_ENCODE_XDR)); +} + /*ARGSUSED*/ void fnvlist_pack_free(char *pack, size_t size) diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6ab3abe11914..344b10207b98 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -190,6 +190,10 @@ zpool_feature_init(void) "Crash dumps to multiple vdev pools.", 0, NULL); + zfeature_register(SPA_FEATURE_DRAID, + "com.intel:draid", "draid", "draid vdev driver.", + ZFEATURE_FLAG_MOS, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", "Spacemaps maintain space histograms.", diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c index 42a7c6c93bed..140c69f022ed 100644 --- a/module/zcommon/zfs_namecheck.c +++ b/module/zcommon/zfs_namecheck.c @@ -334,7 +334,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) 
return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index fe50107731a9..8d084290f1ec 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -78,6 +78,7 @@ $(MODULE)-objs += trace.o $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o +$(MODULE)-objs += vdev_scan.o $(MODULE)-objs += vdev_cache.o $(MODULE)-objs += vdev_disk.o $(MODULE)-objs += vdev_file.o @@ -92,6 +93,7 @@ $(MODULE)-objs += vdev_raidz.o $(MODULE)-objs += vdev_raidz_math.o $(MODULE)-objs += vdev_raidz_math_scalar.o $(MODULE)-objs += vdev_removal.o +$(MODULE)-objs += vdev_draid.o $(MODULE)-objs += vdev_root.o $(MODULE)-objs += zap.o $(MODULE)-objs += zap_leaf.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 040e943655f4..690b14ab69e7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7483,6 +7483,12 @@ arc_target_bytes(void) return (arc_c); } +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} + void arc_init(void) { diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index b87b4d5558f5..96135d13e3fc 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -199,8 +200,9 @@ int zfs_free_bpobj_enabled = 1; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, - dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ - dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ + spa_vdev_scan_rebuild_cb, /* POOL_SCAN_REBUILD */ }; /* In core node for the scn->scn_queue. 
Represents a dataset to be scanned */ @@ -330,8 +332,11 @@ dsl_scan_is_running(const dsl_scan_t *scn) boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { - return (dsl_scan_is_running(dp->dp_scan) && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + dsl_scan_t *scn = dp->dp_scan; + + return (dsl_scan_is_running(scn) && + (scn->scn_phys.scn_func == POOL_SCAN_RESILVER || + DSL_SCAN_IS_REBUILD(scn))); } static inline void @@ -480,6 +485,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); } + + if (DSL_SCAN_IS_REBUILD(scn) && + scn->scn_phys.scn_state == DSS_SCANNING) { + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + scn->scn_phys.scn_state = DSS_CANCELED; + } } /* reload the queue into the in-core state */ @@ -631,6 +642,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + ASSERT(*funcp != POOL_SCAN_REBUILD); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; @@ -754,18 +766,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; - int i; + boolean_t rebuild = DSL_SCAN_IS_REBUILD(scn); - /* Remove any remnants of an old-style scrub. */ - for (i = 0; old_names[i]; i++) { - (void) zap_remove(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); - } + if (!rebuild) { + int i; - if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY0(dmu_object_free(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, tx)); - scn->scn_phys.scn_queue_obj = 0; + /* Remove any remnants of an old-style scrub. 
*/ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } } scan_ds_queue_clear(scn); @@ -802,7 +818,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) || rebuild) { spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; @@ -814,9 +830,16 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { - spa_event_notify(spa, NULL, NULL, - scn->scn_phys.scn_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + const char *name; + + if (rebuild) + name = ESC_ZFS_REBUILD_FINISH; + else if (scn->scn_phys.scn_min_txg) + name = ESC_ZFS_RESILVER_FINISH; + else + name = ESC_ZFS_SCRUB_FINISH; + + spa_event_notify(spa, NULL, NULL, name); } spa_errlog_rotate(spa); @@ -843,6 +866,8 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); + if (DSL_SCAN_IS_REBUILD(scn)) + return (SET_ERROR(ENOTSUP)); return (0); } @@ -933,6 +958,9 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { + if (dp->dp_spa->spa_vdev_scan != NULL) + return; + if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); @@ -2309,7 +2337,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, zbookmark_phys_t zb = { 0 }; int p; - if (!dsl_scan_is_running(scn)) + if (!dsl_scan_is_running(scn) || DSL_SCAN_IS_REBUILD(scn)) return; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { @@ -2997,10 +3025,7 @@ 
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) - return (B_FALSE); - - return (B_TRUE); + return (vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)); } /* @@ -3027,11 +3052,18 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); - if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) - func = POOL_SCAN_RESILVER; + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + if (spa->spa_vdev_scan != NULL) + func = POOL_SCAN_REBUILD; + else + func = POOL_SCAN_RESILVER; + } zfs_dbgmsg("restarting scan func=%u txg=%llu", func, (longlong_t)tx->tx_txg); - dsl_scan_setup_sync(&func, tx); + if (func == POOL_SCAN_REBUILD) + spa_vdev_scan_setup_sync(tx); + else + dsl_scan_setup_sync(&func, tx); } /* @@ -3208,6 +3240,47 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; + if (DSL_SCAN_IS_REBUILD(scn)) { + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + int msi; + boolean_t done; + + ASSERT(svs != NULL); + + mutex_enter(&svs->svs_lock); + done = (svs->svs_thread == NULL) ? 
B_TRUE : B_FALSE; + msi = svs->svs_msi_synced; + mutex_exit(&svs->svs_lock); + + if (done) { + boolean_t complete = !svs->svs_thread_exit; + + if (complete) { + ASSERT3U(msi + 1, ==, + svs->svs_vd->vdev_top->vdev_ms_count); + svs->svs_phys.sr_ms = -1; + svs->svs_phys.sr_vdev = 0; + svs->svs_phys.sr_oldvd = 0; + } + dsl_scan_done(scn, complete, tx); + /* + * HH: remove calls to dsl_scan_sync_state() here and + * below, when states shared with DSL scan are removed + */ + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_vdev_scan_sync_state(svs, tx); + + spa_vdev_scan_destroy(spa); + svs = NULL; + } else if (msi == -1 || msi > svs->svs_phys.sr_ms) { + svs->svs_phys.sr_ms = msi; + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_vdev_scan_sync_state(svs, tx); + } + /* Rebuild is mostly handled in the open-context scan thread */ + return; + } + /* * Wait a few txgs after importing to begin scanning so that * we can get the pool imported quickly. diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 1dbe1e98c1aa..7cb10368b402 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -209,7 +210,6 @@ boolean_t metaslab_trace_enabled = B_TRUE; uint64_t metaslab_trace_max_entries = 5000; #endif -static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, uint64_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); @@ -1056,8 +1056,8 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) * tree looking for a block that matches the specified criteria. 
*/ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor, + uint64_t size, uint64_t align) { range_seg_t *rs = metaslab_block_find(t, *cursor, size); @@ -1065,8 +1065,27 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, uint64_t offset = P2ROUNDUP(rs->rs_start, align); if (offset + size <= rs->rs_end) { - *cursor = offset + size; - return (offset); + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t next_offset; + + if (vd->vdev_ops != &vdev_draid_ops) { + *cursor = offset + size; + return (offset); + } + + next_offset = vdev_draid_check_block(vd, offset, size); + if (next_offset == offset) { + *cursor = offset + size; + return (offset); + } + + offset = P2ROUNDUP(next_offset, align); + if (offset + size <= rs->rs_end) { + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, size)); + *cursor = offset + size; + return (offset); + } } rs = AVL_NEXT(t, rs); } @@ -1079,7 +1098,7 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, return (-1ULL); *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } #endif /* WITH_FF/DF/CF_BLOCK_ALLOCATOR */ @@ -1103,7 +1122,7 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; avl_tree_t *t = &msp->ms_tree->rt_root; - return (metaslab_block_picker(t, cursor, size, align)); + return (metaslab_block_picker(msp, t, cursor, size, align)); } static metaslab_ops_t metaslab_ff_ops = { @@ -1155,7 +1174,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) *cursor = 0; } - return (metaslab_block_picker(t, cursor, size, 1ULL)); + return (metaslab_block_picker(msp, t, cursor, size, 1ULL)); } static metaslab_ops_t metaslab_df_ops = { @@ -1366,9 +1385,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + if (vd->vdev_ops == &vdev_draid_ops) { + uint64_t astart = vdev_draid_get_astart(vd, ms->ms_start); + + ASSERT3U(astart - ms->ms_start, <, ms->ms_size); + ms->ms_size -= astart - ms->ms_start; + ms->ms_start = astart; + } /* * We only open space map objects that already exist. All others @@ -1587,6 +1614,30 @@ metaslab_set_fragmentation(metaslab_t *msp) msp->ms_fragmentation = fragmentation; } +/* + * dRAID metaslabs start at a certain alignment, which causes their sizes to + * vary by a few sectors. The block allocator may get confused and pick a + * distant metaslab because the closer ones are slightly smaller. The small + * variance doesn't matter when the metaslab has already been allocated from. + * + * This function returns adjusted size to calculate metaslab weight, and + * should not be used for other purposes. + */ +static uint64_t +metaslab_weight_size(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + uint64_t size; + + if (vd->vdev_ops != &vdev_draid_ops || + space_map_allocated(msp->ms_sm) != 0) + return (msp->ms_size); + + size = 1ULL << vd->vdev_ms_shift; + ASSERT3U(size, >=, msp->ms_size); + return (size); +} + /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, @@ -1605,7 +1656,7 @@ metaslab_space_weight(metaslab_t *msp) /* * The baseline weight is the metaslab's free space. 
*/ - space = msp->ms_size - space_map_allocated(msp->ms_sm); + space = metaslab_weight_size(msp) - space_map_allocated(msp->ms_sm); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { @@ -1742,7 +1793,7 @@ metaslab_segment_weight(metaslab_t *msp) * The metaslab is completely free. */ if (space_map_allocated(msp->ms_sm) == 0) { - int idx = highbit64(msp->ms_size) - 1; + int idx = highbit64(metaslab_weight_size(msp)) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { @@ -2200,10 +2251,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * into the DMU, because the DMU can call down to us * (e.g. via zio_free()) at any time. * - * The spa_vdev_remove_thread() can be reading metaslab state - * concurrently, and it is locked out by the ms_sync_lock. Note - * that the ms_lock is insufficient for this, because it is dropped - * by space_map_write(). + * The spa_vdev_remove_thread() or spa_scan_thread() can be reading + * metaslab state concurrently, and it is locked out by the + * ms_sync_lock. Note that the ms_lock is insufficient for this, + * because it is dropped by space_map_write(). 
*/ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); @@ -2671,6 +2722,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2683,7 +2735,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_remove(rt, start, size); if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); @@ -2702,17 +2754,25 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { + vdev_t *vd = mg->mg_vd; metaslab_t *msp = NULL; uint64_t offset = -1ULL; + boolean_t hybrid_mirror = B_FALSE; uint64_t activation_weight; uint64_t target_distance; int i; + if (vd->vdev_ops == &vdev_draid_ops && + psize <= (1ULL << vd->vdev_top->vdev_ashift)) { + hybrid_mirror = B_TRUE; + } + activation_weight = METASLAB_WEIGHT_PRIMARY; for (i = 0; i < d; i++) { - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + if (DVA_GET_VDEV(&dva[i]) == vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; break; } @@ -2753,10 +2813,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } + if (vd->vdev_ops == &vdev_draid_ops && + hybrid_mirror != + vdev_draid_ms_mirrored(vd, msp->ms_id)) + continue; + /* * If the selected metaslab is condensing, skip it. 
*/ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_rebuilding) continue; was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; @@ -2832,7 +2897,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * we can't manipulate this metaslab until it's committed * to disk. */ - if (msp->ms_condensing) { + if (msp->ms_condensing || msp->ms_rebuilding) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING); mutex_exit(&msp->ms_lock); @@ -2896,12 +2961,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t psize, uint64_t asize, uint64_t txg, uint64_t min_distance, + dva_t *dva, int d) { uint64_t offset; ASSERT(mg->mg_initialized); - offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + offset = metaslab_group_alloc_normal(mg, zal, psize, asize, txg, min_distance, dva, d); mutex_enter(&mg->mg_lock); @@ -3093,8 +3159,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d); + uint64_t offset = metaslab_group_alloc(mg, zal, psize, asize, + txg, distance, dva, d); if (offset != -1ULL) { /* @@ -3539,6 +3605,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, } VERIFY(!msp->ms_condensing); + VERIFY(!msp->ms_rebuilding); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3b5582624aa2..0695c342d0e4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -78,6 +79,7 @@ #include 
#include #include +#include #include #include #include @@ -1360,6 +1362,7 @@ spa_unload(spa_t *spa) * Stop async tasks. */ spa_async_suspend(spa); + spa_vdev_scan_suspend(spa); /* * Stop syncing. @@ -1408,6 +1411,8 @@ spa_unload(spa_t *spa) spa_condense_fini(spa); + spa_vdev_scan_destroy(spa); + bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3626,7 +3631,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && - vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa_vdev_scan_restart(spa->spa_root_vdev) != 0) spa_async_request(spa, SPA_ASYNC_RESILVER); /* @@ -4373,6 +4379,72 @@ spa_create_check_encryption_params(dsl_crypto_params_t *dcp, return (dmu_objset_create_crypt_check(NULL, dcp)); } +static int +spa_add_draid_spare(nvlist_t *nvroot, vdev_t *rvd) +{ + int i, j, n; + nvlist_t **oldspares, **newspares; + uint_t nspares; + vdev_t *c; + struct vdev_draid_configuration *cfg; + + for (i = 0, n = 0; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops == &vdev_draid_ops) { + cfg = c->vdev_tsd; + ASSERT(cfg != NULL); + n += cfg->dcf_spare; + } + } + + if (n == 0) + return (0); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &oldspares, &nspares) != 0) + nspares = 0; + + newspares = kmem_alloc(sizeof (*newspares) * (n + nspares), KM_SLEEP); + for (i = 0; i < nspares; i++) + newspares[i] = fnvlist_dup(oldspares[i]); + + for (i = 0, n = nspares; i < rvd->vdev_children; i++) { + c = rvd->vdev_child[i]; + + if (c->vdev_ops != &vdev_draid_ops) + continue; + + cfg = c->vdev_tsd; + for (j = 0; j < cfg->dcf_spare; j++) { + nvlist_t *ds = fnvlist_alloc(); + char path[64]; + + snprintf(path, sizeof (path), VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned)c->vdev_nparity, + (long unsigned)c->vdev_id, (long unsigned)j); + 
fnvlist_add_string(ds, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(ds, + ZPOOL_CONFIG_TYPE, VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(ds, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(ds, + ZPOOL_CONFIG_ASHIFT, c->vdev_ashift); + + newspares[n] = ds; + n++; + } + } + + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, newspares, n); + for (i = 0; i < n; i++) + nvlist_free(newspares[i]); + kmem_free(newspares, sizeof (*newspares) * n); + return (0); +} + /* * Pool Creation */ @@ -4396,6 +4468,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, char *feat_name; char *poolname; nvlist_t *nvl; + int draid = 0; if (nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; @@ -4499,11 +4572,17 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_add_draid_spare(nvroot, rvd)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { - vdev_metaslab_set_size(rvd->vdev_child[c]); - vdev_expand(rvd->vdev_child[c], txg); + vdev_t *vd = rvd->vdev_child[c]; + + vdev_metaslab_set_size(vd); + vdev_expand(vd, txg); + + if (vd->vdev_ops == &vdev_draid_ops) + draid++; } } @@ -4634,6 +4713,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < draid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); /* @@ -5122,6 +5204,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + int c, draid = 0; ASSERT(spa_writeable(spa)); @@ -5163,7 +5246,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) */ if (spa->spa_vdev_removal != NULL || 
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { @@ -5190,7 +5273,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } - for (int c = 0; c < vd->vdev_children; c++) { + for (c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. @@ -5206,6 +5289,20 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); + + if (tvd->vdev_ops == &vdev_draid_ops) + draid++; + } + + if (draid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + + dmu_tx_commit(tx); } if (nspares != 0) { @@ -5245,6 +5342,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (0); } +static int spa_rebuild_mirror = 0; /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. 
If the path specifies @@ -5268,11 +5366,15 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) char *oldvdpath, *newvdpath; int newvd_isspare; int error; + boolean_t rebuild = B_FALSE; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); + if (spa->spa_vdev_scan != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_vdev_removal != NULL) @@ -5301,6 +5403,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); + /* + * dRAID spare can only replace a child drive of its parent + * dRAID vdev + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* * Spares can't replace logs */ @@ -5418,8 +5528,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -5435,12 +5545,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_dirty(tvd, VDD_DTL, newvd, txg); + if (newvd->vdev_ops == &vdev_draid_spare_ops || + (tvd->vdev_ops == &vdev_mirror_ops && spa_rebuild_mirror != 0)) + rebuild = B_TRUE; /* HH: let zpool cmd choose */ + /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. 
*/ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + if (rebuild) + spa_vdev_scan_start(spa, oldvd, 0, dtl_max_txg); + else + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -5572,6 +5689,17 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; + /* + * If we are detaching a draid spare that is being rebuilt, we need to + * abort the rebuild thread. + */ + if (replace_done == 0 && + pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_ops == &vdev_draid_spare_ops && + spa->spa_vdev_scan != NULL && + spa->spa_vdev_scan->svs_vd->vdev_parent == pvd) + spa->spa_vdev_scan->svs_thread_exit = B_TRUE; + /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, @@ -6198,9 +6326,13 @@ spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); - if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) + if (func >= POOL_SCAN_FUNCS || + func == POOL_SCAN_NONE || func == POOL_SCAN_REBUILD) return (SET_ERROR(ENOTSUP)); + if (spa->spa_vdev_scan != NULL) + return (SET_ERROR(EBUSY)); + /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. 
@@ -7563,6 +7695,10 @@ module_param(spa_load_verify_data, int, 0644); MODULE_PARM_DESC(spa_load_verify_data, "Set to traverse data on pool import"); +module_param(spa_rebuild_mirror, int, 0644); +MODULE_PARM_DESC(spa_rebuild_mirror, + "Set to enable rebuild on mirror vdev"); + /* CSTYLED */ module_param(zio_taskq_batch_pct, uint, 0444); MODULE_PARM_DESC(zio_taskq_batch_pct, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9fc25dd6a077..7d06120c2822 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include #include @@ -81,6 +83,8 @@ int zfs_scan_ignore_errors = 0; static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -158,6 +162,16 @@ vdev_get_min_asize(vdev_t *vd) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); + if (pvd->vdev_ops == &vdev_draid_ops) { + struct vdev_draid_configuration *cfg = pvd->vdev_tsd; + + ASSERT(cfg != NULL); + ASSERT3U(pvd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(pvd->vdev_children, ==, cfg->dcf_children); + return (pvd->vdev_min_asize / + (pvd->vdev_children - cfg->dcf_spare)); + } + return (pvd->vdev_min_asize); } @@ -379,6 +393,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); + vd->vdev_cfg = NULL; + vd->vdev_last_io = 0; vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); @@ -431,6 +447,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vdev_indirect_config_t *vic; char *tmp = NULL; int rc; + nvlist_t *draidcfg = NULL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -485,7 +502,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Set the 
nparity property for RAID-Z vdevs. */ nparity = -1ULL; - if (ops == &vdev_raidz_ops) { + if (ops == &vdev_raidz_ops || ops == &vdev_draid_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -517,11 +534,28 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } ASSERT(nparity != -1ULL); + if (ops == &vdev_draid_ops) { + if (nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_DRAIDCFG, &draidcfg) != 0) + return (SET_ERROR(EINVAL)); + if (!vdev_draid_config_validate(NULL, draidcfg)) + return (SET_ERROR(EINVAL)); + if (alloctype == VDEV_ALLOC_ADD && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + cmn_err(CE_WARN, "pool '%s' adding a dRAID " + "VDEV requires feature@draid", spa_name(spa)); + return (SET_ERROR(EINVAL)); + } + } + vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + if (ops == &vdev_draid_ops) + vd->vdev_cfg = fnvlist_dup(draidcfg); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -771,6 +805,9 @@ vdev_free(vdev_t *vd) if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_cfg) + fnvlist_free(vd->vdev_cfg); + txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -826,6 +863,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) int t; ASSERT(tvd == tvd->vdev_top); + ASSERT(svd->vdev_ops != &vdev_draid_ops); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; @@ -1184,6 +1222,9 @@ vdev_probe(vdev_t *vd, zio_t *zio) ASSERT(vd->vdev_ops->vdev_op_leaf); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (NULL); + /* * Don't probe the probe. */ @@ -1538,6 +1579,7 @@ vdev_open(vdev_t *vd) * vdev open for business. 
*/ if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); @@ -2870,6 +2912,9 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3046,6 +3091,18 @@ vdev_is_dead(vdev_t *vd) vd->vdev_ops == &vdev_missing_ops); } +boolean_t +vdev_is_dead_at(vdev_t *vd, uint64_t zio_offset) +{ + if (vd->vdev_top == NULL || vd->vdev_top->vdev_ops != &vdev_draid_ops) + return (vdev_is_dead(vd)); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + zio_offset -= VDEV_LABEL_START_SIZE; + + return (vdev_draid_is_dead(vd, zio_offset)); +} + boolean_t vdev_readable(vdev_t *vd) { @@ -3306,7 +3363,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } @@ -3369,20 +3427,22 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { + if (type == ZIO_TYPE_READ && !vdev_is_dead_at(vd, zio->io_offset)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead_at(vd, zio->io_offset)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); + /* HH: todo proper rebuild IO error handling... 
*/ if (spa->spa_load_state == SPA_LOAD_NONE && type == ZIO_TYPE_WRITE && txg != 0 && + vd->vdev_ops != &vdev_draid_spare_ops && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCAN_THREAD) || + ((flags & ZIO_FLAG_SCAN_THREAD) && spa->spa_vdev_scan == NULL) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c new file mode 100644 index 000000000000..a423b7b21eea --- /dev/null +++ b/module/zfs/vdev_draid.c @@ -0,0 +1,1660 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "vdev_raidz.h" + + +int draid_debug_lvl = 1; + +static void +vdev_draid_debug_map(int lvl, raidz_map_t *rm) +{ + int c; + + for (c = 0; rm != NULL && c < rm->rm_scols; c++) { + char t = 'D'; + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = rm->rm_vdev->vdev_child[rc->rc_devidx]; + + if (c >= rm->rm_cols) { + t = 'S'; + } else if (c < rm->rm_firstdatacol) { + switch (c) { + case 0: + t = 'P'; + break; + case 1: + t = 'Q'; + break; + case 2: + t = 'R'; + break; + default: + ASSERT0(c); + } + } + + draid_dbg(lvl, + "%c: dev "U64FMT" (%s) off "U64FMT"K, sz "U64FMT"K, " + "err %d, skipped %d, tried %d\n", t, rc->rc_devidx, + cvd->vdev_path != NULL ? cvd->vdev_path : "NA", + rc->rc_offset >> 10, rc->rc_size >> 10, + rc->rc_error, rc->rc_skipped, rc->rc_tried); + } +} + +void +vdev_draid_debug_zio(zio_t *zio, boolean_t mirror) +{ + draid_dbg(3, "%s zio: off "U64FMT"K sz "U64FMT"K data %p\n", + mirror ? 
"Mirror" : "dRAID", zio->io_offset >> 10, + zio->io_size >> 10, zio->io_abd); + + if (!mirror) + vdev_draid_debug_map(3, zio->io_vsd); +} + +/* A child vdev is divided into slices */ +static unsigned int slice_shift = 0; +#define DRAID_SLICESHIFT (SPA_MAXBLOCKSHIFT + slice_shift) +/* 2 ** slice_shift * SPA_MAXBLOCKSIZE */ +#define DRAID_SLICESIZE (1ULL << DRAID_SLICESHIFT) +#define DRAID_SLICEMASK (DRAID_SLICESIZE - 1) + +static int +vdev_draid_get_permutation(uint64_t *p, uint64_t nr, + const struct vdev_draid_configuration *cfg) +{ + uint64_t i; + uint64_t ncols = cfg->dcf_children; + uint64_t off = nr % (cfg->dcf_bases * ncols); + uint64_t base = off / ncols; + uint64_t dev = off % ncols; + + for (i = 0; i < ncols; i++) { + const uint64_t *base_perm = cfg->dcf_base_perms + + (base * ncols); + + p[i] = (base_perm[i] + dev) % ncols; + } + + return (0); +} + +noinline static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio, uint64_t **array) +{ + vdev_t *vd = zio->io_vd; + const struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t unit_shift = vd->vdev_top->vdev_ashift; + const uint64_t ndata = cfg->dcf_data; + const uint64_t nparity = cfg->dcf_parity; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. 
*/ + const uint64_t psize = zio->io_size >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + uint64_t o, q, r, c, bc, acols, scols, asize, tot; + uint64_t perm, perm_off, group, group_offset, group_left, abd_off; + raidz_map_t *rm; + uint64_t *permutation; + + ASSERT(!vdev_draid_ms_mirrored(vd, + zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + ndata), ==, nspare); + ASSERT0(b % (nparity + ndata)); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + /* HH: may not actually need the nspare columns for normal IO */ + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / ((nparity + ndata) * slice); + group_offset = perm_off % ((nparity + ndata) * slice); + ASSERT0(group_offset % (nparity + ndata)); + + group_left = (slice - group_offset / (nparity + ndata)) * ndata; + ASSERT3U(psize, <=, group_left); + + /* The starting byte offset on each child vdev. */ + o = (perm * slice + group_offset / (nparity + ndata)) << unit_shift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = psize / ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = psize - q * ndata; + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = psize + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. 
*/ + acols = bc; + } else { + acols = nparity + ndata; + } + scols = nparity + ndata; + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + rm->rm_vdev = vd; + + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0, asize = 0; c < scols; c++) { + uint64_t i = group * (nparity + ndata) + c; + + ASSERT3U(i, <, ncols - nspare); + + rm->rm_col[c].rc_devidx = permutation[i]; + rm->rm_col[c].rc_offset = o; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (ndata + nparity) << unit_shift); + rm->rm_nskip = roundup(tot, ndata + nparity) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <, ndata); + + if (rm->rm_nskip == 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) == 0) + rm->rm_abd_skip = NULL; + else + rm->rm_abd_skip = + abd_alloc_linear(rm->rm_nskip << unit_shift, B_TRUE); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + + abd_off = 0; + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, abd_off, + rm->rm_col[c].rc_size); + abd_off += rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rm->rm_col[c].rc_size); + abd_off += 
rm->rm_col[c].rc_size; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + rm->rm_ops = vdev_raidz_math_get_ops(); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + return (rm); +} + +noinline static mirror_map_t * +vdev_draid_mirror_map_alloc(zio_t *zio, uint64_t unit_shift, + const struct vdev_draid_configuration *cfg, uint64_t **array) +{ + const uint64_t nparity = cfg->dcf_parity; + const uint64_t copies = nparity + 1; + const uint64_t nspare = cfg->dcf_spare; + const uint64_t ncols = cfg->dcf_children; + /* The starting DRAID (parent) vdev sector of the block. */ + const uint64_t b = zio->io_offset >> unit_shift; + const uint64_t slice = DRAID_SLICESIZE >> unit_shift; + vdev_t *vd = zio->io_vd; + uint64_t o, c, perm, perm_off, group, group_offset; + mirror_map_t *mm; + uint64_t *permutation; + ASSERTV(const uint64_t psize = zio->io_size >> unit_shift); + + ASSERT(vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)); + ASSERT3U(ncols % (nparity + cfg->dcf_data), ==, nspare); + ASSERT0(P2PHASE(DRAID_SLICESIZE, 1ULL << unit_shift)); + + perm = b / ((ncols - nspare) * slice); + perm_off = b % ((ncols - nspare) * slice); + group = perm_off / (copies * slice); + ASSERT3U(group, <, (ncols - nspare) / copies); + group_offset = perm_off % (copies * slice); + ASSERT0(group_offset % copies); + ASSERT3U(psize, <=, slice - group_offset / copies); + /* The starting byte offset on each child vdev. 
*/ + o = (perm * slice + group_offset / copies) << unit_shift; + + mm = vdev_mirror_map_alloc(copies, B_FALSE, B_FALSE); + permutation = kmem_alloc(sizeof (permutation[0]) * ncols, KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, perm, cfg)); + + for (c = 0; c < mm->mm_children; c++) { + int idx = group * copies + c; + mirror_child_t *mc = &mm->mm_child[c]; + + /* The remainder group is not usable for IO */ + ASSERT3U(idx, <, ((ncols - nspare) / copies) * copies); + + mc->mc_vd = vd->vdev_child[permutation[idx]]; + mc->mc_offset = o; + } + + if (array == NULL) + kmem_free(permutation, sizeof (permutation[0]) * ncols); + else + *array = permutation; /* caller will free */ + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} + +static inline void +vdev_draid_assert_vd(const vdev_t *vd) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT(cfg != NULL); + ASSERT3U(vd->vdev_nparity, ==, cfg->dcf_parity); + ASSERT3U(vd->vdev_children, ==, cfg->dcf_children); + ASSERT(cfg->dcf_zero_abd != NULL); +} + +uint64_t +vdev_draid_get_groupsz(const vdev_t *vd, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + return (copies << DRAID_SLICESHIFT); +} + +#define DRAID_PERM_ASIZE(vd) (((vd)->vdev_children - \ + ((struct vdev_draid_configuration *)(vd)->vdev_tsd)->dcf_spare) \ + << DRAID_SLICESHIFT) + +uint64_t +vdev_draid_offset2group(const vdev_t *vd, uint64_t offset, boolean_t mirror) +{ + uint64_t perm, perm_off, group, copies, groups_per_perm; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + perm = offset / DRAID_PERM_ASIZE(vd); + perm_off = offset % DRAID_PERM_ASIZE(vd); + group = perm_off / vdev_draid_get_groupsz(vd, mirror); + + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + return (perm * groups_per_perm + group); +} + +uint64_t +vdev_draid_group2offset(const vdev_t *vd, uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm, offset; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + offset = DRAID_PERM_ASIZE(vd) * (group / groups_per_perm); + offset += + vdev_draid_get_groupsz(vd, mirror) * (group % groups_per_perm); + return (offset); +} + +boolean_t +vdev_draid_is_remainder_group(const vdev_t *vd, + uint64_t group, boolean_t mirror) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t copies, groups_per_perm; + + vdev_draid_assert_vd(vd); + + copies = mirror ? + vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + groups_per_perm = (vd->vdev_children - cfg->dcf_spare + copies - 1) + / copies; + + if ((vd->vdev_children - cfg->dcf_spare) % copies == 0) + return (B_FALSE); + + /* Currently only mirror can have remainder group */ + ASSERT(mirror); + + /* The last group in each permutation is the remainder */ + if (group % groups_per_perm == groups_per_perm - 1) + return (B_TRUE); + else + return (B_FALSE); +} + +uint64_t +vdev_draid_get_astart(const vdev_t *vd, const uint64_t start) +{ + uint64_t astart, perm_off, copies; + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_is_remainder_group(vd, group, mirror)) + return (start); + + perm_off = start % DRAID_PERM_ASIZE(vd); + copies = mirror ? 
+ vd->vdev_nparity + 1 : vd->vdev_nparity + cfg->dcf_data; + astart = roundup(perm_off, copies << vd->vdev_ashift); + astart += start - perm_off; + + ASSERT3U(astart, >=, start); + return (astart); +} + +uint64_t +vdev_draid_check_block(const vdev_t *vd, uint64_t start, uint64_t size) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, start >> vd->vdev_ms_shift); + uint64_t group = vdev_draid_offset2group(vd, start, mirror); + uint64_t end = start + size - 1; + + ASSERT3U(size, <, vdev_draid_get_groupsz(vd, mirror)); + ASSERT3U(start >> vd->vdev_ms_shift, ==, end >> vd->vdev_ms_shift); + + /* + * A block is good if it: + * - does not cross group boundary, AND + * - does not use a remainder group + */ + if (group == vdev_draid_offset2group(vd, end, mirror) && + !vdev_draid_is_remainder_group(vd, group, mirror)) { + ASSERT3U(start, ==, vdev_draid_get_astart(vd, start)); + return (start); + } + + group++; + if (vdev_draid_is_remainder_group(vd, group, mirror)) + group++; + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + return (vdev_draid_group2offset(vd, group, mirror)); +} + +boolean_t +vdev_draid_ms_mirrored(const vdev_t *vd, uint64_t ms_id) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + /* HH: dedicate 1/20 ms for hybrid mirror */ + if ((ms_id % 20) == 19) + return (B_TRUE); + else + return (B_FALSE); +} + +static vdev_t *vdev_dspare_get_child(vdev_t *vd, uint64_t offset); + +/* + * dRAID spare does not fit into the DTL model. While it has child vdevs, + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Moreover, DTLs of a child vdev before the spare + * becomes active are invalid, because the spare blocks were not in use yet. + * + * Here we are essentially doing a vdev_dtl_reassess() on the fly, by replacing + * a dRAID spare with the child vdev under the offset. Note that it is a + * recursive process because the child vdev can be another dRAID spare, and so + * on. 
+ */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t offset, uint64_t txg, uint64_t size) +{ + int c; + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); + + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, offset, txg, size)) + return (B_FALSE); + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_readable(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_readable(vd->vdev_child[c], offset)) + return (B_TRUE); + + return (B_FALSE); +} + +boolean_t +vdev_draid_is_dead(vdev_t *vd, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_ops != &vdev_spare_ops) + return (vdev_is_dead(vd)); + + for (c = 0; c < vd->vdev_children; c++) + if (!vdev_draid_is_dead(vd->vdev_child[c], offset)) + return (B_FALSE); + + return (B_TRUE); +} + +static boolean_t +vdev_draid_guid_exists(vdev_t *vd, uint64_t guid, uint64_t offset) +{ + int c; + + if (vd->vdev_ops == &vdev_draid_spare_ops) + vd = vdev_dspare_get_child(vd, offset); + + if (vd->vdev_guid == guid) + return (B_TRUE); + + if (vd->vdev_ops->vdev_op_leaf) + return (B_FALSE); + + for (c = 0; c < vd->vdev_children; c++) + if (vdev_draid_guid_exists(vd->vdev_child[c], guid, offset)) + return (B_TRUE); + + return (B_FALSE); +} + +static boolean_t +vdev_draid_vd_degraded(vdev_t *vd, const vdev_t *oldvd, uint64_t offset) +{ + if (oldvd == NULL) /* Resilver */ + 
return (!vdev_dtl_empty(vd, DTL_PARTIAL)); + + /* Rebuild */ + ASSERT(oldvd->vdev_ops->vdev_op_leaf); + ASSERT(oldvd->vdev_ops != &vdev_draid_spare_ops); + + return (vdev_draid_guid_exists(vd, oldvd->vdev_guid, offset)); +} + +boolean_t +vdev_draid_group_degraded(vdev_t *vd, vdev_t *oldvd, + uint64_t offset, uint64_t size, boolean_t mirror) +{ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t group = vdev_draid_offset2group(vd, offset, mirror); + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + boolean_t degraded = B_FALSE; + zio_t *zio; + int c, dummy_data; + uint64_t *perm; + char buf[128]; + + vdev_draid_assert_vd(vd); + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + + zio = kmem_alloc(sizeof (*zio), KM_SLEEP); + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_size = MAX(SPA_MINBLOCKSIZE, 1ULL << ashift); + zio->io_abd = abd_get_from_buf(&dummy_data, zio->io_size); + + buf[0] = '\0'; + if (mirror) { + mirror_map_t *mm = + vdev_draid_mirror_map_alloc(zio, ashift, cfg, &perm); + + ASSERT3U(mm->mm_children, ==, cfg->dcf_parity + 1); + + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + char *status = ""; + + if (vdev_draid_vd_degraded(mc->mc_vd, + oldvd, mc->mc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", mc->mc_vd->vdev_id, status); + } + } else { + raidz_map_t *rm = vdev_draid_map_alloc(zio, &perm); + + ASSERT3U(rm->rm_scols, ==, cfg->dcf_parity + cfg->dcf_data); + + for (c = 0; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + char *status = ""; + + if (vdev_draid_vd_degraded(cvd, oldvd, rc->rc_offset)) { + degraded = B_TRUE; + status = "*"; + } + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT"%s ", cvd->vdev_id, status); + } + } + + snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), "spares: "); + for (c = 0; c < cfg->dcf_spare; c++) + 
snprintf(buf + strlen(buf), sizeof (buf) - strlen(buf), + U64FMT" ", perm[cfg->dcf_children - 1 - c]); + draid_dbg(4, "%s %s at "U64FMT"K of "U64FMT"K: %s\n", + degraded ? "Degraded" : "Healthy", + mirror ? "mirror" : "draid", + offset >> 10, size >> 10, buf); + + kmem_free(perm, sizeof (perm[0]) * cfg->dcf_children); + (*zio->io_vsd_ops->vsd_free)(zio); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (*zio)); + return (degraded); +} + +boolean_t +vdev_draid_config_validate(const vdev_t *vd, nvlist_t *config) +{ + int i; + uint_t c; + uint8_t *perm = NULL; + uint64_t n, d, p, s, b; + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, &n) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + return (B_FALSE); + } + + if (n - 1 > VDEV_DRAID_U8_MAX) { + draid_dbg(0, "%s configuration too large: "U64FMT"\n", + ZPOOL_CONFIG_DRAIDCFG_CHILDREN, n); + return (B_FALSE); + } + if (vd != NULL && n != vd->vdev_children) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_PARITY, &p) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PARITY); + return (B_FALSE); + } + + if (vd != NULL && p != vd->vdev_nparity) + return (B_FALSE); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_DATA, &d) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_DATA); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_DRAIDCFG_SPARE, &s) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_SPARE); + return (B_FALSE); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_DRAIDCFG_BASE, &b) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_BASE); + return (B_FALSE); + } + + if (n == 0 || d == 0 || p == 0 || s == 0 || b == 0) { + draid_dbg(0, "Zero n/d/p/s/b\n"); + return (B_FALSE); + } + + if (p > VDEV_RAIDZ_MAXPARITY) { + draid_dbg(0, "Invalid parity "U64FMT"\n", p); + 
return (B_FALSE); + } + + if ((n - s) % (p + d) != 0) { + draid_dbg(0, U64FMT" mod "U64FMT" is not 0\n", n - s, p + d); + return (B_FALSE); + } + + if (nvlist_lookup_uint8_array(config, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perm, &c) != 0) { + draid_dbg(0, "Missing %s in configuration\n", + ZPOOL_CONFIG_DRAIDCFG_PERM); + return (B_FALSE); + } + + if (c != b * n) { + draid_dbg(0, + "Permutation array has %u items, but "U64FMT" expected\n", + c, b * n); + return (B_FALSE); + } + + for (i = 0; i < b; i++) { + int j, k; + for (j = 0; j < n; j++) { + uint64_t val = perm[i * n + j]; + + if (val >= n) { + draid_dbg(0, + "Invalid value "U64FMT" in " + "permutation %d\n", val, i); + return (B_FALSE); + } + + for (k = 0; k < j; k++) { + if (val == perm[i * n + k]) { + draid_dbg(0, + "Duplicated value "U64FMT" in " + "permutation %d\n", + val, i); + return (B_FALSE); + } + } + } + } + + return (B_TRUE); +} + +boolean_t +vdev_draid_config_add(nvlist_t *top, nvlist_t *draidcfg) +{ + char *type; + uint64_t parity; + nvlist_t **children = NULL; + uint_t c = 0; + + if (draidcfg == NULL) + return (B_FALSE); + + type = fnvlist_lookup_string(top, ZPOOL_CONFIG_TYPE); + if (strcmp(type, VDEV_TYPE_DRAID) != 0) + return (B_FALSE); + + parity = fnvlist_lookup_uint64(top, ZPOOL_CONFIG_NPARITY); + if (parity != fnvlist_lookup_uint64(draidcfg, + ZPOOL_CONFIG_DRAIDCFG_PARITY)) + return (B_FALSE); + + VERIFY0(nvlist_lookup_nvlist_array(top, + ZPOOL_CONFIG_CHILDREN, &children, &c)); + if (c != + fnvlist_lookup_uint64(draidcfg, ZPOOL_CONFIG_DRAIDCFG_CHILDREN)) + return (B_FALSE); + + /* HH: todo: check permutation array csum */ + fnvlist_add_nvlist(top, ZPOOL_CONFIG_DRAIDCFG, draidcfg); + return (B_TRUE); +} + +/* Unfortunately this requires GPL-only symbols */ +#ifdef ZFS_IS_GPL_COMPATIBLE +#define __DRAID_HARDENING +#else +#undef __DRAID_HARDENING +#endif + +static void +vdev_draid_setup_page(const void *start, size_t sz, boolean_t readonly) +{ +#ifdef __DRAID_HARDENING + ASSERT(sz != 0); + + if 
(!IS_P2ALIGNED(sz, PAGESIZE) || !IS_P2ALIGNED(start, PAGESIZE)) { + draid_dbg(1, "Buffer not page aligned %p %lu\n", start, sz); + return; + } + +#ifdef _KERNEL + if (readonly) + set_memory_ro((unsigned long)start, sz >> PAGE_SHIFT); + else + set_memory_rw((unsigned long)start, sz >> PAGE_SHIFT); +#endif +#endif +} + +static inline void +vdev_draid_set_mem_ro(const void *start, size_t sz) +{ + vdev_draid_setup_page(start, sz, B_TRUE); +} + +static inline void +vdev_draid_set_mem_rw(const void *start, size_t sz) +{ + vdev_draid_setup_page(start, sz, B_FALSE); +} + +static uint64_t * +vdev_draid_create_base_perms(const uint8_t *perms, + const struct vdev_draid_configuration *cfg) +{ + int i, j; + uint64_t children = cfg->dcf_children, *base_perms; + size_t sz = sizeof (uint64_t) * cfg->dcf_bases * children; + +#ifdef __DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + base_perms = kmem_alloc(sz, KM_SLEEP); + for (i = 0; i < cfg->dcf_bases; i++) + for (j = 0; j < children; j++) + base_perms[i * children + j] = perms[i * children + j]; + + vdev_draid_set_mem_ro(base_perms, sz); + return (base_perms); +} + +static struct vdev_draid_configuration * +vdev_draid_config_create(vdev_t *vd) +{ + uint_t c; + uint8_t *perms = NULL; + nvlist_t *nvl = vd->vdev_cfg; + struct vdev_draid_configuration *cfg; + + ASSERT(nvl != NULL); + + if (!vdev_draid_config_validate(vd, nvl)) + return (NULL); + + cfg = kmem_alloc(sizeof (*cfg), KM_SLEEP); + cfg->dcf_children = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_CHILDREN); + cfg->dcf_data = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_DATA); + cfg->dcf_parity = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_PARITY); + cfg->dcf_spare = fnvlist_lookup_uint64(nvl, + ZPOOL_CONFIG_DRAIDCFG_SPARE); + cfg->dcf_bases = fnvlist_lookup_uint64(nvl, ZPOOL_CONFIG_DRAIDCFG_BASE); + + VERIFY0(nvlist_lookup_uint8_array(nvl, + ZPOOL_CONFIG_DRAIDCFG_PERM, &perms, &c)); + + cfg->dcf_base_perms = vdev_draid_create_base_perms(perms, cfg); 
+ cfg->dcf_zero_abd = NULL; + return (cfg); +} + +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift) +{ + vdev_t *cvd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* vd->vdev_tsd must be set before vdev_open_children(vd) */ + if (cfg == NULL) { + cfg = vdev_draid_config_create(vd); + if (cfg == NULL) + return (SET_ERROR(EINVAL)); + vd->vdev_tsd = cfg; + } else { + ASSERT(vd->vdev_reopening); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *ashift = MAX(*ashift, cvd->vdev_ashift); + } + + if (cfg->dcf_zero_abd == NULL) { + abd_t *zabd; + size_t sz = 1ULL << MAX(*ashift, vd->vdev_ashift); + +#ifdef __DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + zabd = abd_alloc_linear(sz, B_TRUE); + abd_zero(zabd, sz); + vdev_draid_set_mem_ro(abd_to_buf(zabd), sz); + cfg->dcf_zero_abd = zabd; + } + + /* HH: asize becomes tricky with hybrid mirror */ + *asize *= vd->vdev_children - cfg->dcf_spare; + *max_asize *= vd->vdev_children - cfg->dcf_spare; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_draid_close(vdev_t *vd) +{ + int c; + size_t sz; + abd_t *zabd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); + + if (vd->vdev_reopening || cfg == NULL) + return; + + zabd = cfg->dcf_zero_abd; + 
ASSERT(zabd != NULL); + vdev_draid_set_mem_rw(abd_to_buf(zabd), zabd->abd_size); + abd_free(zabd); + + sz = sizeof (uint64_t) * cfg->dcf_bases * cfg->dcf_children; +#ifdef __DRAID_HARDENING + sz = P2ROUNDUP(sz, PAGESIZE); +#endif + vdev_draid_set_mem_rw(cfg->dcf_base_perms, sz); + kmem_free((void *)cfg->dcf_base_perms, sz); + + kmem_free(cfg, sizeof (*cfg)); + vd->vdev_tsd = NULL; +} + +uint64_t +vdev_draid_asize_by_type(const vdev_t *vd, uint64_t psize, boolean_t mirror) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t nparity = vd->vdev_nparity; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + asize = ((psize - 1) >> ashift) + 1; + + if (mirror) { + asize *= 1 + nparity; + } else { /* draid */ + ASSERT3U(cfg->dcf_data, !=, 0); + asize = roundup(asize, cfg->dcf_data); + asize += nparity * (asize / cfg->dcf_data); + ASSERT0(asize % (nparity + cfg->dcf_data)); + } + + ASSERT(asize != 0); + return (asize << ashift); +} + +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t sector = ((psize - 1) >> vd->vdev_top->vdev_ashift) + 1; + + return (vdev_draid_asize_by_type(vd, psize, sector == 1)); +} + +uint64_t +vdev_draid_asize2psize(vdev_t *vd, uint64_t asize, uint64_t offset) +{ + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t msid = offset >> vd->vdev_ms_shift; + boolean_t mirror = vdev_draid_ms_mirrored(vd, msid); + uint64_t psize; + + ASSERT0(P2PHASE(asize, 1ULL << ashift)); + ASSERT0(P2PHASE(offset, 1ULL << ashift)); + + if (mirror) { + ASSERT0((asize >> ashift) % (1 + vd->vdev_nparity)); + psize = asize / (1 + vd->vdev_nparity); + } else { + ASSERT0((asize >> ashift) % (cfg->dcf_data + vd->vdev_nparity)); + psize = (asize / (cfg->dcf_data + vd->vdev_nparity)) + * cfg->dcf_data; + } + + if (psize > SPA_MAXBLOCKSIZE) { + draid_dbg(0, "Psize "U64FMT" too big at offset "U64FMT" from " + "asize "U64FMT", ashift 
"U64FMT", %s MS "U64FMT"\n", + psize, offset, asize, ashift, + mirror ? "mirrored" : "draid", msid); + } + ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE); + + return (psize); +} + +uint64_t +vdev_draid_max_rebuildable_asize(vdev_t *vd, uint64_t offset) +{ + uint64_t maxpsize = SPA_MAXBLOCKSIZE; + uint64_t ashift = vd->vdev_top->vdev_ashift; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + if (vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift)) + return (vdev_draid_asize_by_type(vd, maxpsize, B_TRUE)); + + /* + * When SPA_MAXBLOCKSIZE>>ashift does not divide evenly by the number + * of data drives, the remainder must be discarded. Otherwise the skip + * sectors will cause vdev_draid_asize2psize() to get a psize larger + * than SPA_MAXBLOCKSIZE + */ + maxpsize >>= ashift; + maxpsize /= cfg->dcf_data; + maxpsize *= cfg->dcf_data; + maxpsize <<= ashift; + return (vdev_draid_asize_by_type(vd, maxpsize, B_FALSE)); +} + +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + boolean_t mirror = + vdev_draid_ms_mirrored(vd, offset >> vd->vdev_ms_shift); + + /* A block cannot cross redundancy group boundary */ + ASSERT3U(offset, ==, + vdev_draid_check_block(vd, offset, vdev_draid_asize(vd, psize))); + + return (vdev_draid_group_degraded(vd, NULL, offset, psize, mirror)); +} + +/* + * Start an IO operation on a dRAID VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. 
If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + + vdev_draid_assert_vd(vd); + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) { + (void) vdev_draid_mirror_map_alloc(zio, ashift, cfg, NULL); + + ASSERT(zio->io_vsd != NULL); + vdev_mirror_ops.vdev_op_io_start(zio); + return; + } + + rm = vdev_draid_map_alloc(zio, NULL); + ASSERT3U(rm->rm_asize, ==, + vdev_draid_asize_by_type(vd, zio->io_size, B_FALSE)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Unlike raidz, it's mandatory to fill skip sectors with zero. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + 1ULL << ashift, zio->io_type, zio->io_priority, + 0, NULL, NULL)); /* HH: handle skip write error */ + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * Sequential rebuild must do IO at redundancy group boundary, i.e. 
+ * rm->rm_nskip must be 0 + */ + ASSERT((zio->io_flags & ZIO_FLAG_RESILVER) == 0 || + !DSL_SCAN_IS_REBUILD(zio->io_spa->spa_dsl_pool->dp_scan) || + rm->rm_nskip == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + /* + * Check skip sectors for scrub/resilver. For sequential rebuild, + * this is a no-op because rm->rm_nskip is always zero. 
+ */ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + abd_t *abd; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, + rc->rc_offset + rc->rc_size)) { + rc->rc_abd_skip = NULL; + continue; + } + + abd = abd_get_offset_size(rm->rm_abd_skip, + i << ashift, 1ULL << ashift); + *((int *)abd_to_buf(abd)) = 1; + rc->rc_abd_skip = abd; + + /* Skip sector to be written in vdev_draid_io_done() */ + if (vdev_draid_missing(cvd, + rc->rc_offset + rc->rc_size, zio->io_txg, 1)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, abd, + 1ULL << ashift, ZIO_TYPE_READ, + zio->io_priority, 0, NULL, NULL)); + } + } + + zio_execute(zio); +} + +int +vdev_draid_hide_skip_sectors(raidz_map_t *rm) +{ + int c, cols; + size_t size = rm->rm_col[0].rc_size; + vdev_t *vd = rm->rm_vdev; + struct vdev_draid_configuration *cfg; + + ASSERT(vdev_raidz_map_declustered(rm)); + + cfg = vd->vdev_tsd; + + for (c = rm->rm_cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_size); + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd == NULL); + ASSERT3U(cfg->dcf_zero_abd->abd_size, >=, size); + + rc->rc_size = size; + rc->rc_abd = cfg->dcf_zero_abd; + } + + cols = rm->rm_cols; + rm->rm_cols = rm->rm_scols; + return (cols); +} + +void +vdev_draid_restore_skip_sectors(raidz_map_t *rm, int cols) +{ + int c; + + ASSERT3U(cols, >, rm->rm_firstdatacol); + ASSERT3U(cols, <=, rm->rm_scols); + ASSERT(vdev_raidz_map_declustered(rm)); + + for (c = cols; c < rm->rm_scols; c++) { + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT0(rc->rc_error); + ASSERT0(rc->rc_tried); + ASSERT0(rc->rc_skipped); + ASSERT(rc->rc_abd != NULL); + + rc->rc_size = 0; + rc->rc_abd = NULL; + } + + rm->rm_cols = cols; +} + +void 
+vdev_draid_fix_skip_sectors(zio_t *zio) +{ + int c, i; + char *zero; + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + struct vdev_draid_configuration *cfg = vd->vdev_tsd; + const uint64_t size = 1ULL << vd->vdev_top->vdev_ashift; + + vdev_draid_assert_vd(vd); + ASSERT3P(rm->rm_vdev, ==, vd); + + if (rm->rm_abd_skip == NULL) + return; + + zero = abd_to_buf(cfg->dcf_zero_abd); + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + char *skip; + boolean_t good_skip; + raidz_col_t *rc = &rm->rm_col[c]; + + ASSERT3U(c, <, rm->rm_scols); + ASSERT3U(c, >, rm->rm_firstdatacol); + + if (rc->rc_abd_skip == NULL) + continue; + + skip = abd_to_buf(rc->rc_abd_skip); + good_skip = (memcmp(skip, zero, size) == 0); + abd_put(rc->rc_abd_skip); + rc->rc_abd_skip = NULL; + + if (good_skip || !spa_writeable(zio->io_spa)) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, + size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR, NULL, NULL)); + } +} + +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (vdev_draid_ms_mirrored(vd, zio->io_offset >> vd->vdev_ms_shift)) + vdev_mirror_ops.vdev_op_io_done(zio); /* hybrid mirror */ + else + vdev_raidz_ops.vdev_op_io_done(zio); /* declustered raidz */ +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +vdev_ops_t vdev_draid_ops = { + vdev_draid_open, + vdev_draid_close, + vdev_draid_asize, + vdev_draid_io_start, + vdev_draid_io_done, + vdev_draid_state_change, + vdev_draid_need_resilver, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID, /* name of this vdev type */ + 
B_FALSE /* not a leaf vdev */ +}; + +#include + +typedef struct { + vdev_t *dsp_draid; + uint64_t dsp_id; +} vdev_dspare_t; + +static vdev_t * +vdev_dspare_get_child(vdev_t *vd, uint64_t offset) +{ + vdev_t *draid; + uint64_t *permutation, spareidx; + vdev_dspare_t *dspare = vd->vdev_tsd; + struct vdev_draid_configuration *cfg; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT3U(offset, <, + vd->vdev_psize - VDEV_LABEL_START_SIZE - VDEV_LABEL_END_SIZE); + ASSERT(dspare != NULL); + draid = dspare->dsp_draid; + vdev_draid_assert_vd(draid); + cfg = draid->vdev_tsd; + ASSERT3U(dspare->dsp_id, <, cfg->dcf_spare); + + permutation = kmem_alloc(sizeof (permutation[0]) * draid->vdev_children, + KM_SLEEP); + VERIFY0(vdev_draid_get_permutation(permutation, + offset >> DRAID_SLICESHIFT, cfg)); + spareidx = permutation[draid->vdev_children - 1 - dspare->dsp_id]; + ASSERT3U(spareidx, <, draid->vdev_children); + kmem_free(permutation, sizeof (permutation[0]) * draid->vdev_children); + + return (draid->vdev_child[spareidx]); +} + +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + ASSERT(dspare != NULL); + ASSERT(dspare->dsp_draid != NULL); + + return (dspare->dsp_draid); +} + +nvlist_t * +vdev_draid_spare_read_config(vdev_t *vd) +{ + int i; + uint64_t guid; + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + nvlist_t *nv = fnvlist_alloc(); + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + 
+ if (vd->vdev_isspare) + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_ACTIVE); + else + fnvlist_add_uint64(nv, + ZPOOL_CONFIG_POOL_STATE, POOL_STATE_SPARE); + + for (i = 0, guid = vd->vdev_guid; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + /* HH: ZPOOL_CONFIG_UNSPARE and ZPOOL_CONFIG_RESILVER_TXG? */ + return (nv); +} + +static int +vdev_dspare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + uint64_t draid_id, nparity, spare_id; + uint64_t asize, max_asize; + vdev_t *draid; + vdev_dspare_t *dspare; + struct vdev_draid_configuration *cfg; + + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dspare = vd->vdev_tsd; + draid = dspare->dsp_draid; + cfg = draid->vdev_tsd; + goto skip_open; + } + + if (sscanf(vd->vdev_path, VDEV_DRAID_SPARE_PATH_FMT, + (long unsigned *)&nparity, (long unsigned *)&draid_id, + (long unsigned *)&spare_id) != 3) + return (SET_ERROR(EINVAL)); + + if (draid_id >= vd->vdev_spa->spa_root_vdev->vdev_children) + return (SET_ERROR(EINVAL)); + + draid = vd->vdev_spa->spa_root_vdev->vdev_child[draid_id]; + if (draid->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(EINVAL)); + if (draid->vdev_nparity != nparity) + return (SET_ERROR(EINVAL)); + + cfg = draid->vdev_tsd; + ASSERT(cfg != NULL); + if (nparity != cfg->dcf_parity || spare_id >= cfg->dcf_spare) + return (SET_ERROR(EINVAL)); + + dspare = kmem_alloc(sizeof (*dspare), KM_SLEEP); + dspare->dsp_draid = draid; + dspare->dsp_id = spare_id; + vd->vdev_tsd = dspare; + +skip_open: + asize = draid->vdev_asize / (draid->vdev_children - cfg->dcf_spare); + max_asize = draid->vdev_max_asize / + (draid->vdev_children - cfg->dcf_spare); + + *ashift = draid->vdev_ashift; + *psize = asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + 
*max_psize = max_asize + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + return (0); +} + +static void +vdev_dspare_close(vdev_t *vd) +{ + vdev_dspare_t *dspare = vd->vdev_tsd; + + if (vd->vdev_reopening || dspare == NULL) + return; + + vd->vdev_tsd = NULL; + kmem_free(dspare, sizeof (*dspare)); +} + +static uint64_t +vdev_dspare_asize(vdev_t *vd, uint64_t psize) +{ + /* HH: this function should never get called */ + ASSERT0(psize); + return (0); +} + +static void +vdev_dspare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + pio->io_error = zio->io_error; +} + +static void +vdev_dspare_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + uint64_t offset = zio->io_offset; + + /* HH: if dspare gets a FLUSH, so do all children of the draid vdev */ + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + /* + * HH: at pool creation, dspare gets some writes with + * ZIO_FLAG_SPECULATIVE and ZIO_FLAG_NODATA. + * Need to understand and handle them right. + */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + zio->io_error = 0; + zio_execute(zio); + return; + } + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) { + ASSERT(zio->io_flags & ZIO_FLAG_PHYSICAL); + + /* + * HH: dspare should not get any label IO as it is pretending + * to be a leaf disk. Later should catch and fix all places + * that still does label IO to dspare. + */ + zio->io_error = SET_ERROR(ENODATA); + zio_interrupt(zio); + return; + } + + offset -= VDEV_LABEL_START_SIZE; /* See zio_vdev_child_io() */ + cvd = vdev_dspare_get_child(vd, offset); + if (zio->io_type == ZIO_TYPE_READ && !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + /* + * Parent vdev should have avoided reading from me in the first + * place, unless this is a mirror scrub. 
+ */ + draid_dbg(1, "Read from dead spare %s:%s:%s at "U64FMT"\n", + vd->vdev_path, + cvd->vdev_ops->vdev_op_type, + cvd->vdev_path != NULL ? cvd->vdev_path : "NA", + offset); + return; + } + + /* dspare IO does not cross slice boundary */ + ASSERT3U(offset >> DRAID_SLICESHIFT, ==, + (offset + zio->io_size - 1) >> DRAID_SLICESHIFT); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_dspare_child_done, zio)); + zio_execute(zio); +} + +static void +vdev_dspare_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_draid_spare_ops = { + vdev_dspare_open, + vdev_dspare_close, + vdev_dspare_asize, + vdev_dspare_io_start, + vdev_dspare_io_done, + NULL, + NULL, + NULL, + NULL, + NULL, + VDEV_TYPE_DRAID_SPARE, + B_TRUE +}; + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(draid_debug_lvl, int, 0644); +MODULE_PARM_DESC(draid_debug_lvl, "dRAID debugging verbose level"); +#endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index b91e332de39f..55398c3a772b 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -387,8 +388,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + ASSERT(vd->vdev_ops == &vdev_raidz_ops || + vd->vdev_ops == &vdev_draid_ops); /* * Make sure someone hasn't managed to sneak a fancy new vdev @@ -408,6 +409,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } + if (vd->vdev_cfg != NULL) { + ASSERT(vd->vdev_ops == &vdev_draid_ops); + ASSERT(vdev_draid_config_validate(vd, vd->vdev_cfg)); + + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_DRAIDCFG, vd->vdev_cfg); + } + if (vd->vdev_wholedisk != -1ULL) fnvlist_add_uint64(nv, 
ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); @@ -682,6 +690,9 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_read_config(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -948,6 +959,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + if (vd->vdev_ops == &vdev_draid_spare_ops) { + error = 0; + goto skip; + } + /* * Initialize its label. */ @@ -1069,6 +1085,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) abd_free(ub_abd); abd_free(vp_abd); +skip: /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it @@ -1156,7 +1173,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1291,6 +1309,13 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vd->vdev_ops->vdev_op_leaf) return; + /* + * No need to sync ub on dspare - if dspare gets a ub sync, so + * do the parent draid vdev and all its children. 
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; @@ -1407,6 +1432,9 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, if (!vd->vdev_ops->vdev_op_leaf) return; + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + if (!vdev_writeable(vd)) return; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 4b01f317b4cc..06682f976b53 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -93,29 +94,6 @@ vdev_mirror_stat_fini(void) } } -/* - * Virtual device vector for mirroring. - */ - -typedef struct mirror_child { - vdev_t *mc_vd; - uint64_t mc_offset; - int mc_error; - int mc_load; - uint8_t mc_tried; - uint8_t mc_skipped; - uint8_t mc_speculative; -} mirror_child_t; - -typedef struct mirror_map { - int *mm_preferred; - int mm_preferred_cnt; - int mm_children; - boolean_t mm_replacing; - boolean_t mm_root; - mirror_child_t mm_child[]; -} mirror_map_t; - static int vdev_mirror_shift = 21; /* @@ -144,7 +122,7 @@ vdev_mirror_map_size(int children) sizeof (int) * children); } -static inline mirror_map_t * +mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) { mirror_map_t *mm; @@ -167,7 +145,7 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } -static const zio_vsd_ops_t vdev_mirror_vsd_ops = { +const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, .vsd_cksum_report = zio_vsd_default_cksum_report }; @@ -400,6 +378,28 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t 
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read * prefering vdevs based on determined load. @@ -425,14 +425,15 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; @@ -483,7 +484,12 @@ vdev_mirror_io_start(zio_t *zio) mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_init(zio); + if (zio->io_vsd != NULL) { /* dRAID hybrid mirror */ + ASSERT3P(zio->io_vd->vdev_ops, ==, &vdev_draid_ops); + mm = zio->io_vsd; + } else { + mm = vdev_mirror_map_init(zio); + } if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_bp != NULL && diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a21baf9c264b..06636c61f2a8 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include #include #include +#include /* * Virtual device vector for RAID-Z. 
@@ -145,6 +146,11 @@ vdev_raidz_map_free(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) abd_put(rm->rm_col[c].rc_abd); + if (rm->rm_abd_skip != NULL) { + ASSERT(vdev_raidz_map_declustered(rm)); + abd_free(rm->rm_abd_skip); + } + if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); @@ -317,7 +323,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) ASSERT3U(offset, ==, size); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { +const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, .vsd_cksum_report = vdev_raidz_cksum_report }; @@ -392,6 +398,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; + rm->rm_abd_skip = NULL; + rm->rm_vdev = NULL; asize = 0; @@ -669,23 +677,30 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) void vdev_raidz_generate_parity(raidz_map_t *rm) { - /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) - return; + int cols = 0; - switch (rm->rm_firstdatacol) { - case 1: - vdev_raidz_generate_parity_p(rm); - break; - case 2: - vdev_raidz_generate_parity_pq(rm); - break; - case 3: - vdev_raidz_generate_parity_pqr(rm); - break; - default: - cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + if (vdev_raidz_map_declustered(rm) && rm->rm_firstdatacol > 1) + cols = vdev_draid_hide_skip_sectors(rm); + + /* Generate using the new math implementation */ + if (vdev_raidz_math_generate(rm) == RAIDZ_ORIGINAL_IMPL) { + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } } + + if (cols != 0) + vdev_draid_restore_skip_sectors(rm, cols); } /* ARGSUSED */ @@ -1471,8 +1486,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { 
int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c, ret; - int code; + int i, c, code; + int cols = 0; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1507,25 +1522,32 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); + if (vdev_raidz_map_declustered(rm)) + cols = vdev_draid_hide_skip_sectors(rm); + dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); - if (ret != RAIDZ_ORIGINAL_IMPL) - return (ret); + code = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (code != RAIDZ_ORIGINAL_IMPL) + goto out; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_P]) { + code = vdev_raidz_reconstruct_p(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_q(rm, dt, 1); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); break; @@ -1534,8 +1556,10 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) ASSERT(rm->rm_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + parity_valid[VDEV_RAIDZ_Q]) { + code = vdev_raidz_reconstruct_pq(rm, dt, 2); + goto out; + } ASSERT(rm->rm_firstdatacol > 2); @@ -1545,6 +1569,9 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); +out: + if (cols != 0) + vdev_draid_restore_skip_sectors(rm, cols); return (code); } @@ -1617,7 +1644,7 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void 
+void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1820,6 +1847,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_free(orig[c]); } + if (ret != 0 && vdev_raidz_map_declustered(rm)) + vdev_draid_debug_zio(zio, B_FALSE); return (ret); } @@ -2271,6 +2300,9 @@ vdev_raidz_io_done(zio_t *zio) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + if (vdev_raidz_map_declustered(rm)) + vdev_draid_fix_skip_sectors(zio); } static void @@ -2288,7 +2320,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function - * assumes that at least one DTL is dirty which imples that full stripe + * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t diff --git a/module/zfs/vdev_raidz.h b/module/zfs/vdev_raidz.h new file mode 100644 index 000000000000..ab3c5b81dc64 --- /dev/null +++ b/module/zfs/vdev_raidz.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 Intel Corporation. 
+ */ + +#include +#include +#include + +extern const zio_vsd_ops_t vdev_raidz_vsd_ops; + +extern void vdev_raidz_generate_parity(raidz_map_t *rm); +extern void vdev_raidz_child_done(zio_t *zio); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 0fca8fb03f2d..8e52357b837f 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1846,20 +1846,32 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. */ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); - - char *nvstr = fnvlist_lookup_string(nv, - ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), - VDEV_TYPE_SPARE, nvstr); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + char *type; + boolean_t draid_spare = B_FALSE; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, + B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + char *nvstr = fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH); + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), + VDEV_TYPE_SPARE, nvstr); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } diff --git a/module/zfs/vdev_scan.c b/module/zfs/vdev_scan.c new file mode 100644 index 000000000000..ba60649e3c8d --- /dev/null +++ b/module/zfs/vdev_scan.c @@ -0,0 +1,583 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the 
terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +spa_vdev_scan_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + spa_vdev_scan_t *svs = zio->io_private; + uint64_t asize; + + ASSERT(svs != NULL); + ASSERT(svs->svs_thread != NULL); + ASSERT(zio->io_bp != NULL); + + abd_free(zio->io_abd); + asize = DVA_GET_ASIZE(&zio->io_bp->blk_dva[0]); + + scn->scn_phys.scn_examined += asize; + spa->spa_scan_pass_exam += asize; + spa->spa_scan_pass_issued += asize; + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + + mutex_enter(&svs->svs_io_lock); + ASSERT3U(svs->svs_io_asize, >=, asize); + svs->svs_io_asize -= asize; + cv_broadcast(&svs->svs_io_cv); + mutex_exit(&svs->svs_io_lock); +} + +static int spa_vdev_scan_delay = 64; /* number of ticks to delay rebuild */ +static int spa_vdev_scan_idle = 512; /* idle window in clock ticks */ + +static void +spa_vdev_scan_rebuild_block(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, uint64_t offset, uint64_t asize) +{ + 
blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + int scan_delay = spa_vdev_scan_delay; + uint64_t psize; + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops); + + /* Calculate psize from asize */ + if (vd->vdev_ops == &vdev_mirror_ops) { + psize = asize; + } else { + int c, faulted; + + /* + * Initialize faulted to 1, to count the spare vdev we're + * rebuilding, which is not in faulted state. + */ + for (c = 0, faulted = 1; c < vd->vdev_children; c++) { + vdev_t *child = vd->vdev_child[c]; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && spa_writeable(spa))) + faulted++; + } + + if (faulted >= vd->vdev_nparity) + scan_delay = 0; /* critical, go full speed */ + + psize = vdev_draid_asize2psize(vd, asize, offset); + } + /* + * HH: add this assertion after dmirror implemented + * ASSERT3U(asize, ==, vdev_psize_to_asize(vd, psize, offset)); + */ + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], 0); + DVA_SET_ASIZE(&dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + mutex_enter(&svs->svs_io_lock); + while (svs->svs_io_asize >= + MIN(arc_max_bytes(), 4 * SPA_MAXBLOCKSIZE * vd->vdev_children)) + cv_wait(&svs->svs_io_cv, &svs->svs_io_lock); + svs->svs_io_asize += asize; + mutex_exit(&svs->svs_io_lock); + + if (scan_delay != 0) { + /* + * If we're seeing recent (spa_vdev_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. 
+ */ + if (ddi_get_lbolt64() - vd->vdev_last_io <= spa_vdev_scan_idle) + delay(scan_delay); + } + + zio_nowait(zio_read(pio, spa, bp, + abd_alloc(psize, B_FALSE), psize, spa_vdev_scan_done, svs, + ZIO_PRIORITY_SCRUB, ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); +} + +static void +spa_vdev_scan_rebuild(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, uint64_t offset, uint64_t length) +{ + uint64_t max_asize; + + if (vd->vdev_ops == &vdev_draid_ops) + max_asize = vdev_draid_max_rebuildable_asize(vd, offset); + else + max_asize = vdev_psize_to_asize(vd, SPA_MAXBLOCKSIZE); + + while (length > 0 && !svs->svs_thread_exit) { + uint64_t chunksz = MIN(length, max_asize); + + spa_vdev_scan_rebuild_block(svs, pio, vd, offset, chunksz); + + length -= chunksz; + offset += chunksz; + } +} + +static void +spa_vdev_scan_draid_rebuild(spa_vdev_scan_t *svs, zio_t *pio, + vdev_t *vd, vdev_t *oldvd, uint64_t offset, uint64_t length) +{ + uint64_t msi = offset >> vd->vdev_ms_shift; + boolean_t mirror; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(msi, ==, (offset + length - 1) >> vd->vdev_ms_shift); + + mirror = vdev_draid_ms_mirrored(vd, msi); + + while (length > 0 && !svs->svs_thread_exit) { + uint64_t group, group_left, chunksz; + char *action; + + /* + * Make sure we don't cross redundancy group boundary + */ + group = vdev_draid_offset2group(vd, offset, mirror); + group_left = vdev_draid_group2offset(vd, + group + 1, mirror) - offset; + + ASSERT(!vdev_draid_is_remainder_group(vd, group, mirror)); + ASSERT3U(group_left, <=, vdev_draid_get_groupsz(vd, mirror)); + + chunksz = MIN(length, group_left); + if (vdev_draid_group_degraded(vd, oldvd, + offset, chunksz, mirror)) { + action = "Fixing"; + spa_vdev_scan_rebuild(svs, pio, vd, offset, chunksz); + } else { + spa_t *spa = vd->vdev_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + action = "Skipping"; + + scn->scn_phys.scn_examined += chunksz; + spa->spa_scan_pass_exam += 
chunksz;
+		}
+
+		draid_dbg(3, "\t%s: "U64FMT"K + "U64FMT"K (%s)\n",
+		    action, offset >> 10, chunksz >> 10,
+		    mirror ? "mirrored" : "dRAID");
+
+		length -= chunksz;
+		offset += chunksz;
+	}
+}
+
+static void
+spa_vdev_scan_ms_done(zio_t *zio)
+{
+	metaslab_t *msp = zio->io_private;
+	spa_vdev_scan_t *svs = zio->io_spa->spa_vdev_scan;
+	int *ms_done, msi;
+
+	ASSERT(msp != NULL);
+	ASSERT(svs != NULL);
+
+	mutex_enter(&msp->ms_lock);
+	msp->ms_rebuilding = B_FALSE;
+	mutex_exit(&msp->ms_lock);
+
+	ms_done = svs->svs_ms_done;
+	ASSERT(ms_done != NULL);
+	ASSERT0(ms_done[msp->ms_id]);
+
+	mutex_enter(&svs->svs_lock);
+
+	if (svs->svs_thread_exit) {
+		/*
+		 * Cannot mark this MS as "done", because the rebuild thread
+		 * may have been interrupted in the middle of working on
+		 * this MS.
+		 */
+		mutex_exit(&svs->svs_lock);
+		draid_dbg(1, "Aborted rebuilding metaslab "U64FMT"\n",
+		    msp->ms_id);
+		return;
+	}
+
+	ms_done[msp->ms_id] = 1;
+
+	for (msi = svs->svs_msi_synced + 1;
+	    msi < svs->svs_vd->vdev_top->vdev_ms_count; msi++) {
+		if (ms_done[msi] == 0)
+			break;
+	}
+	svs->svs_msi_synced = msi - 1;
+
+	mutex_exit(&svs->svs_lock);
+
+	draid_dbg(1, "Completed rebuilding metaslab "U64FMT"\n", msp->ms_id);
+	draid_dbg(1, "All metaslabs [0, %d) fully rebuilt.\n", msi);
+}
+
+static void
+spa_vdev_scan_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	spa_vdev_scan_t *svs = spa->spa_vdev_scan;
+	zio_t *rio = zio_root(spa, NULL, NULL, 0);
+	range_tree_t *allocd_segs;
+	uint64_t msi;
+	int *ms_done, err;
+
+	ASSERT(svs != NULL);
+	ASSERT3P(svs->svs_vd, ==, vd);
+	ASSERT3P(svs->svs_ms_done, ==, NULL);
+
+	vd = vd->vdev_top;
+	ASSERT3U(svs->svs_msi, >=, 0);
+	ASSERT3U(svs->svs_msi, <, vd->vdev_ms_count);
+
+	/*
+	 * Wait for newvd's DTL to propagate upward when
+	 * spa_vdev_attach()->spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+	txg_wait_synced(spa->spa_dsl_pool, svs->svs_dtl_max);
+
+	allocd_segs = range_tree_create(NULL, NULL);
+
+	ms_done = kmem_alloc(sizeof (*ms_done) * vd->vdev_ms_count, KM_SLEEP);
+	for (msi = 0; msi < vd->vdev_ms_count; msi++) {
+		if (msi < svs->svs_msi)
+			ms_done[msi] = 1;
+		else
+			ms_done[msi] = 0;
+	}
+
+	mutex_enter(&svs->svs_lock);
+	svs->svs_ms_done = ms_done;
+	svs->svs_msi_synced = svs->svs_msi - 1;
+	mutex_exit(&svs->svs_lock);
+
+	for (msi = svs->svs_msi;
+	    msi < vd->vdev_ms_count && !svs->svs_thread_exit; msi++) {
+		metaslab_t *msp = vd->vdev_ms[msi];
+		zio_t *pio = zio_null(rio, spa, NULL,
+		    spa_vdev_scan_ms_done, msp, rio->io_flags);
+
+		ASSERT0(range_tree_space(allocd_segs));
+
+		mutex_enter(&msp->ms_sync_lock);
+		mutex_enter(&msp->ms_lock);
+
+		while (msp->ms_condensing) {
+			mutex_exit(&msp->ms_lock);
+
+			zfs_sleep_until(gethrtime() + 100 * MICROSEC);
+
+			mutex_enter(&msp->ms_lock);
+		}
+
+		VERIFY(!msp->ms_condensing);
+		VERIFY(!msp->ms_rebuilding);
+		msp->ms_rebuilding = B_TRUE;
+
+		/*
+		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
+		 * read the allocated segments from the space map object
+		 * into allocd_segs. Since we do this while holding
+		 * ms_lock and ms_sync_lock, concurrent frees (which
+		 * would have modified the space map) will wait for us
+		 * to finish loading the spacemap, and then take the
+		 * appropriate action.
+		 */
+		if (msp->ms_sm != NULL) {
+			space_map_t *sm = NULL;
+
+			/*
+			 * We have to open a new space map here, because
+			 * ms_sm's sm_length and sm_alloc may not reflect
+			 * what's in the object contents, if we are in between
+			 * metaslab_sync() and metaslab_sync_done().
+ */ + VERIFY0(space_map_open(&sm, + spa->spa_dsl_pool->dp_meta_objset, + msp->ms_sm->sm_object, msp->ms_sm->sm_start, + msp->ms_sm->sm_size, msp->ms_sm->sm_shift)); + space_map_update(sm); + VERIFY0(space_map_load(sm, allocd_segs, SM_ALLOC)); + space_map_close(sm); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + draid_dbg(1, "Scanning %lu segments for MS "U64FMT"\n", + avl_numnodes(&allocd_segs->rt_root), msp->ms_id); + + while (!svs->svs_thread_exit && + range_tree_space(allocd_segs) != 0) { + uint64_t offset, length; + range_seg_t *rs = avl_first(&allocd_segs->rt_root); + + ASSERT(rs != NULL); + offset = rs->rs_start; + length = rs->rs_end - rs->rs_start; + + range_tree_remove(allocd_segs, offset, length); + + draid_dbg(2, "MS ("U64FMT" at "U64FMT"K) segment: " + U64FMT"K + "U64FMT"K\n", + msp->ms_id, msp->ms_start >> 10, + (offset - msp->ms_start) >> 10, length >> 10); + + if (vd->vdev_ops == &vdev_mirror_ops) + spa_vdev_scan_rebuild(svs, pio, + vd, offset, length); + else + spa_vdev_scan_draid_rebuild(svs, pio, vd, + svs->svs_vd, offset, length); + } + + zio_nowait(pio); + } + + err = zio_wait(rio); + if (err != 0) /* HH: handle error */ + err = SET_ERROR(err); + + mutex_enter(&svs->svs_lock); + if (svs->svs_thread_exit) { + range_tree_vacate(allocd_segs, NULL, NULL); + } + + svs->svs_thread = NULL; + svs->svs_ms_done = NULL; + cv_broadcast(&svs->svs_cv); + mutex_exit(&svs->svs_lock); + + ASSERT0(range_tree_space(allocd_segs)); + range_tree_destroy(allocd_segs); + kmem_free(ms_done, sizeof (*ms_done) * vd->vdev_ms_count); + thread_exit(); +} + +void +spa_vdev_scan_start(spa_t *spa, vdev_t *oldvd, int msi, uint64_t txg) +{ + dsl_scan_t *scan = spa->spa_dsl_pool->dp_scan; + spa_vdev_scan_t *svs = kmem_zalloc(sizeof (*svs), KM_SLEEP); + + ASSERT3U(msi, <, oldvd->vdev_top->vdev_ms_count); + + svs->svs_msi = msi; + svs->svs_vd = oldvd; + svs->svs_dtl_max = txg; + svs->svs_thread = NULL; + svs->svs_ms_done = NULL; + svs->svs_dp = 
spa->spa_dsl_pool; + mutex_init(&svs->svs_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svs->svs_cv, NULL, CV_DEFAULT, NULL); + svs->svs_io_asize = 0; + mutex_init(&svs->svs_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svs->svs_io_cv, NULL, CV_DEFAULT, NULL); + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + spa->spa_vdev_scan = svs; + svs->svs_thread = thread_create(NULL, 0, spa_vdev_scan_thread, oldvd, + 0, NULL, TS_RUN, defclsyspri); + + scan->scn_restart_txg = txg; +} + +int +spa_vdev_scan_restart(vdev_t *rvd) +{ + spa_t *spa = rvd->vdev_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + spa_rebuilding_phys_t svs_phys; + int err; + vdev_t *tvd, *oldvd, *pvd, *dspare; + + ASSERT(scn != NULL); + ASSERT3P(spa->spa_vdev_scan, ==, NULL); + + err = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REBUILDING, sizeof (uint64_t), + sizeof (spa_rebuilding_phys_t) / sizeof (uint64_t), &svs_phys); + + if (err != 0 || !DSL_SCAN_IS_REBUILD(scn) || + scn->scn_phys.scn_state == DSS_FINISHED || + svs_phys.sr_vdev == 0 || svs_phys.sr_oldvd == 0 || + svs_phys.sr_ms < -1) + return (SET_ERROR(ENOENT)); + + tvd = vdev_lookup_by_guid(rvd, svs_phys.sr_vdev); + oldvd = vdev_lookup_by_guid(rvd, svs_phys.sr_oldvd); + if (tvd == NULL || oldvd == NULL || oldvd->vdev_top != tvd) + return (SET_ERROR(ENOENT)); + + if (tvd->vdev_ops != &vdev_draid_ops) + return (SET_ERROR(ENOTSUP)); + + if (svs_phys.sr_ms >= tvd->vdev_ms_count - 1) + return (SET_ERROR(ENOENT)); + + pvd = oldvd->vdev_parent; + if (pvd->vdev_ops != &vdev_spare_ops || pvd->vdev_children != 2) + return (SET_ERROR(ENOENT)); + + dspare = pvd->vdev_child[1]; + if (dspare->vdev_ops != &vdev_draid_spare_ops || + !vdev_resilver_needed(dspare, NULL, NULL)) + return (SET_ERROR(ENOENT)); + + draid_dbg(1, "Restarting rebuild at metaslab "U64FMT"\n", + svs_phys.sr_ms + 1); + spa_vdev_scan_start(spa, oldvd, svs_phys.sr_ms + 1, + spa_last_synced_txg(spa) + 1 + TXG_CONCURRENT_STATES); + return (0); +} + +void 
+spa_vdev_scan_setup_sync(dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + spa_t *spa = scn->scn_dp->dp_spa; + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + vdev_t *oldvd; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(svs != NULL); + + oldvd = svs->svs_vd; + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = POOL_SCAN_REBUILD; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = 0; + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + /* Rebuild only examines blocks on one vdev */ + scn->scn_phys.scn_to_examine = oldvd->vdev_top->vdev_stat.vs_alloc; + svs->svs_phys.sr_ms = -1; + svs->svs_phys.sr_vdev = oldvd->vdev_top->vdev_guid; + svs->svs_phys.sr_oldvd = oldvd->vdev_guid; + + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + scn->scn_sync_start_time = gethrtime(); + + spa->spa_scrub_active = B_TRUE; + spa_scan_stat_init(spa); + spa->spa_scrub_started = B_TRUE; + spa_event_notify(spa, NULL, NULL, ESC_ZFS_REBUILD_START); +} + +int +spa_vdev_scan_rebuild_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + /* Rebuild happens in open context and does not use this callback */ + ASSERT0(1); + return (-ENOTSUP); +} + +void +spa_vdev_scan_destroy(spa_t *spa) +{ + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + + if (svs == NULL) + return; + + ASSERT3P(svs->svs_thread, ==, NULL); + ASSERT3P(svs->svs_ms_done, ==, NULL); + ASSERT3U(svs->svs_io_asize, ==, 0); + + spa->spa_vdev_scan = NULL; + mutex_destroy(&svs->svs_lock); + cv_destroy(&svs->svs_cv); + mutex_destroy(&svs->svs_io_lock); + cv_destroy(&svs->svs_io_cv); + kmem_free(svs, sizeof (*svs)); +} + +void +spa_vdev_scan_suspend(spa_t *spa) +{ + spa_vdev_scan_t *svs = spa->spa_vdev_scan; + + if (svs == NULL) + return; + + mutex_enter(&svs->svs_lock); + svs->svs_thread_exit = B_TRUE; + while (svs->svs_thread != NULL) + 
cv_wait(&svs->svs_cv, &svs->svs_lock); + mutex_exit(&svs->svs_lock); +} + +void +spa_vdev_scan_sync_state(spa_vdev_scan_t *svs, dmu_tx_t *tx) +{ + VERIFY0(zap_update(svs->svs_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REBUILDING, sizeof (uint64_t), + sizeof (spa_rebuilding_phys_t) / sizeof (uint64_t), + &svs->svs_phys, tx)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(spa_vdev_scan_delay, int, 0644); +MODULE_PARM_DESC(spa_vdev_scan_delay, "Number of ticks to delay SPA rebuild"); + +module_param(spa_vdev_scan_idle, int, 0644); +MODULE_PARM_DESC(spa_vdev_scan_idle, + "Idle window in clock ticks for SPA rebuild"); +#endif diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b585368be570..4b8cf8cf1100 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3473,6 +3473,27 @@ zio_vdev_io_start(zio_t *zio) ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } + /* + * We keep track of time-sensitive I/Os so that the scan thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of vdev_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = vd->vdev_last_io; + uint64_t new = ddi_get_lbolt64(); + + if (old != new) + (void) atomic_cas_64(&vd->vdev_last_io, old, new); + } + align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && @@ -3536,11 +3557,21 @@ zio_vdev_io_start(zio_t *zio) * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. 
The child i/os on concrete vdevs * will be properly bypassed instead. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdev's + * of such blocks can have empty DTL_PARTIAL. + * + * There seemed no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); @@ -3548,6 +3579,7 @@ zio_vdev_io_start(zio_t *zio) } if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) @@ -3584,8 +3616,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 2ea82f0f6979..4354ddde3d7c 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -82,5 +82,6 @@ if is_linux; then "feature@userobj_accounting" "feature@encryption" "feature@project_quota" + "feature@draid" ) fi From 3d5934d65e4bfc83f83518fed91db6a14a349752 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Tue, 1 May 2018 11:57:18 -0600 
Subject: [PATCH 2/3] Use '%' as dRAID spare vdev prefix Changed dRAID spare vdev prefix from '$' to '%'. Fixed a few build and style warnings. Fixed rebuild status report (thegreatgazoo/zfs/issues/10). Signed-off-by: Isaac Huang --- cmd/zpool/zpool_main.c | 2 +- cmd/zpool/zpool_vdev.c | 2 +- include/sys/spa_impl.h | 3 +-- include/sys/vdev_draid_impl.h | 6 +++++- module/zfs/vdev_draid.c | 13 +++++++++++-- module/zfs/vdev_removal.c | 4 ++-- 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index aafde8f37a67..1a6e38dac122 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6198,7 +6198,7 @@ print_scan_status(pool_scan_stat_t *ps) (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_REBUILD) { - (void) printf(gettext("\t%s rebuilt, %.2f%% done\n"), + (void) printf(gettext("\t%s rebuilt, %.2f%% done"), processed_buf, 100 * fraction_done); } diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index b7738aa242e9..c9c10f1f013a 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -592,7 +592,7 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx - * $draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT + * %draidxxx dRAID spare, see VDEV_DRAID_SPARE_PATH_FMT */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index fbdcc35ac2dc..7be8021a6db8 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -184,8 +185,6 @@ typedef enum spa_all_vdev_zap_action { AVZ_ACTION_INITIALIZE } spa_avz_action_t; -typedef struct spa_vdev_scan spa_vdev_scan_t; - struct spa { /* * Fields protected by spa_namespace_lock. 
diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h index 58d00d9f8b34..1cf3826ee0de 100644 --- a/include/sys/vdev_draid_impl.h +++ b/include/sys/vdev_draid_impl.h @@ -73,7 +73,11 @@ extern uint64_t vdev_draid_max_rebuildable_asize(vdev_t *, uint64_t); #define VDEV_DRAID_MAX_CHILDREN 255 #define VDEV_DRAID_U8_MAX ((uint8_t)-1) -#define VDEV_DRAID_SPARE_PATH_FMT "$"VDEV_TYPE_DRAID"%lu-%lu-s%lu" +/* + * Double '%' characters in the front because it's used as format string in + * scanf()/printf() family of functions + */ +#define VDEV_DRAID_SPARE_PATH_FMT "%%"VDEV_TYPE_DRAID"%lu-%lu-s%lu" #ifdef _KERNEL #define U64FMT "%llu" diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index a423b7b21eea..0eb909100793 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1108,6 +1108,15 @@ vdev_draid_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) return (vdev_draid_group_degraded(vd, NULL, offset, psize, mirror)); } +static void +vdev_draid_skip_io_done(zio_t *zio) +{ + /* + * HH: handle skip IO error + * raidz_col_t *rc = zio->io_private; + */ +} + /* * Start an IO operation on a dRAID VDev * @@ -1174,7 +1183,7 @@ vdev_draid_io_start(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, cfg->dcf_zero_abd, 1ULL << ashift, zio->io_type, zio->io_priority, - 0, NULL, NULL)); /* HH: handle skip write error */ + 0, vdev_draid_skip_io_done, rc)); } zio_execute(zio); @@ -1258,7 +1267,7 @@ vdev_draid_io_start(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, abd, 1ULL << ashift, ZIO_TYPE_READ, - zio->io_priority, 0, NULL, NULL)); + zio->io_priority, 0, vdev_draid_skip_io_done, rc)); } } diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 8e52357b837f..539b4a83b5f4 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1864,8 +1864,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) char *nvstr = 
fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); - spa_history_log_internal(spa, "vdev remove", NULL, - "%s vdev (%s) %s", spa_name(spa), + spa_history_log_internal(spa, "vdev remove", + NULL, "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_SPARE, nvstr); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); From b56a862a7bc10483dcae50f7007aa2f6e5893c05 Mon Sep 17 00:00:00 2001 From: Isaac Huang Date: Tue, 1 May 2018 16:25:58 -0600 Subject: [PATCH 3/3] Fixed compiler warnings on ppc64 Signed-off-by: Isaac Huang --- include/sys/vdev_draid_impl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/sys/vdev_draid_impl.h b/include/sys/vdev_draid_impl.h index 1cf3826ee0de..1bc9ebeab0d0 100644 --- a/include/sys/vdev_draid_impl.h +++ b/include/sys/vdev_draid_impl.h @@ -28,14 +28,14 @@ #include #include #include +#include +#include +#include #ifdef __cplusplus extern "C" { #endif -typedef struct zio zio_t; -typedef struct vdev vdev_t; -typedef struct raidz_map raidz_map_t; struct vdev_draid_configuration { uint64_t dcf_data; @@ -88,7 +88,8 @@ extern uint64_t vdev_draid_max_rebuildable_asize(vdev_t *, uint64_t); #endif #define draid_console(fmt, ...) printk(KERN_EMERG fmt, ##__VA_ARGS__) #else /* _KERNEL */ -#define U64FMT "%lu" +#include +#define U64FMT "%"PRIu64 #define draid_print(fmt, ...) printf(fmt, ##__VA_ARGS__) #define draid_console(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) #endif