From 8b272a6c443a1c6db47ddfb3b0e5cc2cdbeb9c03 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Mon, 10 Aug 2020 15:29:34 +0300 Subject: [PATCH 1/8] Initial version of time dependent geometry --- include/sys/fs/zfs.h | 1 + include/sys/vdev_raidz.h | 7 ++ module/zfs/vdev.c | 7 +- module/zfs/vdev_raidz.c | 137 ++++++++++++++++++++++++++++++++++----- 4 files changed, 130 insertions(+), 22 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 9c570aca176d..1a679c9c84e0 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -688,6 +688,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 146dd3c29660..5452a4d21a7e 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -35,6 +35,7 @@ extern "C" { struct zio; struct raidz_row; struct raidz_map; +struct vdev_raidz; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -47,6 +48,7 @@ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); @@ -84,6 +86,11 @@ typedef struct vdev_raidz_expand { uint64_t vre_offset_pertxg[TXG_SIZE]; + /* + * Last reflow txg per attached device. 
+ */ + avl_tree_t vre_txgs; + dsl_scan_state_t vre_state; time_t vre_start_time; time_t vre_end_time; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8edd786331ff..dbcf652c1186 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -909,10 +910,8 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); - if (vd->vdev_ops == &vdev_raidz_ops) { - vdev_raidz_t *rz = vd->vdev_tsd; - kmem_free(rz, sizeof (*rz)); - } + if (vd->vdev_ops == &vdev_raidz_ops) + vdev_raidz_free(vd->vdev_tsd); /* * Discard allocation state. diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index e1b1950f04ec..504dc2fa4a12 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -196,6 +196,38 @@ vdev_raidz_map_free_vsd(zio_t *zio) } } +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + +static int +vedv_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = (reflow_node_t *)x1; + const reflow_node_t *r = (reflow_node_t *)x2; + + if (l->re_txg < r->re_txg) + return (-1); + else if (l->re_txg == r->re_txg) + return (0); + + return (1); +} + +void +vdev_raidz_free(vdev_raidz_t *vdrz) +{ + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vn_vre.vre_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vn_vre.vre_txgs); + kmem_free(vdrz, sizeof (*vdrz)); +} + /*ARGSUSED*/ static void vdev_raidz_cksum_free(void *arg, size_t ignored) @@ -2010,6 +2042,20 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) } } +static uint64_t +vdev_raidz_get_width(vdev_raidz_t *vdrz, uint64_t blk_birth) +{ + reflow_node_t *re, lookup = { blk_birth, 0 }; + avl_index_t where; + + re = avl_find(&vdrz->vn_vre.vre_txgs, &lookup, &where); + if (re != NULL) + return 
(re->re_logical_width); + + re = avl_nearest(&vdrz->vn_vre.vre_txgs, where, AVL_BEFORE); + return (re->re_logical_width); +} + /* * Start an IO operation on a RAIDZ VDev * @@ -2036,22 +2082,30 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; if (vdrz->vd_logical_width != vdrz->vd_physical_width) { - /* XXX rangelock not needed after expansion completes */ - zfs_locked_range_t *lr = - zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, - zio->io_offset, zio->io_size, RL_READER); - - rm = vdev_raidz_map_alloc_expanded(zio->io_abd, - zio->io_size, zio->io_offset, - tvd->vdev_ashift, vdrz->vd_physical_width, - vdrz->vd_logical_width, vdrz->vd_nparity, - vdrz->vn_vre.vre_offset_phys); - rm->rm_lr = lr; - /* - * XXX If this is a write, will need to do additional - * writes to locations that are already copied, but - * not yet reflected in the on-disk format. - */ + uint64_t width = vdev_raidz_get_width(vdrz, zio->io_bp->blk_birth); + if (vdrz->vn_vre.vre_offset != UINT64_MAX || + (zio->io_type == ZIO_TYPE_READ && width != vdrz->vd_physical_width)) { + /* XXX rangelock not needed after expansion completes */ + zfs_locked_range_t *lr = + zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + + rm = vdev_raidz_map_alloc_expanded(zio->io_abd, + zio->io_size, zio->io_offset, + tvd->vdev_ashift, vdrz->vd_physical_width, + width, vdrz->vd_nparity, + vdrz->vn_vre.vre_offset_phys); + rm->rm_lr = lr; + /* + * XXX If this is a write, will need to do additional + * writes to locations that are already copied, but + * not yet reflected in the on-disk format. 
+ */ + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + vdrz->vd_nparity); + } } else { rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vdrz->vd_logical_width, @@ -2977,12 +3031,19 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; for (int i = 0; i < TXG_SIZE; i++) ASSERT0(vre->vre_offset_pertxg[i]); vre->vre_offset_phys = UINT64_MAX; + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg; + re->re_logical_width = vdrz->vd_physical_width - 1; + avl_add(&vdrz->vn_vre.vre_txgs, re); + /* * vre_offset_phys will be removed from the on-disk config by * vdev_raidz_config_generate(). @@ -3337,6 +3398,13 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, new_child); + if (vdrz->vd_logical_width == vdrz->vd_physical_width) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = 0; + re->re_logical_width = vdrz->vd_logical_width; + avl_add(&vdrz->vn_vre.vre_txgs, re); + } + vdrz->vd_physical_width++; vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; @@ -3402,6 +3470,24 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET, vdrz->vn_vre.vre_offset_phys); } + + if (!avl_is_empty(&vdrz->vn_vre.vre_txgs)) { + uint64_t i = 0; + uint64_t count = avl_numnodes(&vdrz->vn_vre.vre_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + + for (reflow_node_t *re = + avl_first(&vdrz->vn_vre.vre_txgs); re; + re = AVL_NEXT(&vdrz->vn_vre.vre_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } } /* @@ -3412,7 +3498,8 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) void 
* vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) { - uint64_t nparity, lw; + uint64_t nparity, lw, *txgs; + uint_t txgs_size; vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); vdrz->vn_vre.vre_vdev_id = -1; @@ -3450,6 +3537,20 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) */ } + avl_create(&vdrz->vn_vre.vre_txgs, vedv_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[i]; + re->re_logical_width = vdrz->vd_logical_width + i; + avl_add(&vdrz->vn_vre.vre_txgs, re); + } + } + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) @@ -3479,7 +3580,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) vdrz->vd_nparity = nparity; return (vdrz); out: - kmem_free(vdrz, sizeof (*vdrz)); + vdev_raidz_free(vdrz); return (NULL); } From 477630feac31584546d2fc6f869cb2f1221ba415 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Thu, 13 Aug 2020 09:45:35 +0300 Subject: [PATCH 2/8] Move txgs avl tree to vdev_raidz_t --- include/sys/vdev_raidz.h | 11 ++++++----- module/zfs/vdev_raidz.c | 25 ++++++++++++------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 5452a4d21a7e..81ac4c7c7c93 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -86,11 +86,6 @@ typedef struct vdev_raidz_expand { uint64_t vre_offset_pertxg[TXG_SIZE]; - /* - * Last reflow txg per attached device. - */ - avl_tree_t vre_txgs; - dsl_scan_state_t vre_state; time_t vre_start_time; time_t vre_end_time; @@ -102,6 +97,12 @@ typedef struct vdev_raidz { int vd_logical_width; int vd_physical_width; int vd_nparity; + + /* + * Last reflow txg per attached device. 
+ */ + avl_tree_t vre_txgs; + /* * If this vdev is being expanded, spa_raidz_expand is set to this */ diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 504dc2fa4a12..13c1010030b3 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -221,10 +221,10 @@ vdev_raidz_free(vdev_raidz_t *vdrz) { reflow_node_t *re; void *cookie = NULL; - avl_tree_t *tree = &vdrz->vn_vre.vre_txgs; + avl_tree_t *tree = &vdrz->vre_txgs; while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) kmem_free(re, sizeof (*re)); - avl_destroy(&vdrz->vn_vre.vre_txgs); + avl_destroy(&vdrz->vre_txgs); kmem_free(vdrz, sizeof (*vdrz)); } @@ -2048,11 +2048,11 @@ vdev_raidz_get_width(vdev_raidz_t *vdrz, uint64_t blk_birth) reflow_node_t *re, lookup = { blk_birth, 0 }; avl_index_t where; - re = avl_find(&vdrz->vn_vre.vre_txgs, &lookup, &where); + re = avl_find(&vdrz->vre_txgs, &lookup, &where); if (re != NULL) return (re->re_logical_width); - re = avl_nearest(&vdrz->vn_vre.vre_txgs, where, AVL_BEFORE); + re = avl_nearest(&vdrz->vre_txgs, where, AVL_BEFORE); return (re->re_logical_width); } @@ -3042,7 +3042,7 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg; re->re_logical_width = vdrz->vd_physical_width - 1; - avl_add(&vdrz->vn_vre.vre_txgs, re); + avl_add(&vdrz->vre_txgs, re); /* * vre_offset_phys will be removed from the on-disk config by @@ -3402,7 +3402,7 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = 0; re->re_logical_width = vdrz->vd_logical_width; - avl_add(&vdrz->vn_vre.vre_txgs, re); + avl_add(&vdrz->vre_txgs, re); } vdrz->vd_physical_width++; @@ -3471,15 +3471,14 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) vdrz->vn_vre.vre_offset_phys); } - if (!avl_is_empty(&vdrz->vn_vre.vre_txgs)) { - uint64_t i = 0; - uint64_t count = avl_numnodes(&vdrz->vn_vre.vre_txgs); + if (!avl_is_empty(&vdrz->vre_txgs)) 
{ + uint64_t i = 0, count = avl_numnodes(&vdrz->vre_txgs); uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, KM_SLEEP); for (reflow_node_t *re = - avl_first(&vdrz->vn_vre.vre_txgs); re; - re = AVL_NEXT(&vdrz->vn_vre.vre_txgs, re)) { + avl_first(&vdrz->vre_txgs); re; + re = AVL_NEXT(&vdrz->vre_txgs, re)) { txgs[i++] = re->re_txg; } @@ -3537,7 +3536,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) */ } - avl_create(&vdrz->vn_vre.vre_txgs, vedv_raidz_reflow_compare, + avl_create(&vdrz->vre_txgs, vedv_raidz_reflow_compare, sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, @@ -3547,7 +3546,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = txgs[i]; re->re_logical_width = vdrz->vd_logical_width + i; - avl_add(&vdrz->vn_vre.vre_txgs, re); + avl_add(&vdrz->vre_txgs, re); } } From daaf5ee118c41415df0e7ec5ec0489ae16e34bfa Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Thu, 13 Aug 2020 10:29:48 +0300 Subject: [PATCH 3/8] Use TREE_CMP in the avl comparator --- module/zfs/vdev_raidz.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 13c1010030b3..a0b6ee41c611 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -208,12 +208,7 @@ vedv_raidz_reflow_compare(const void *x1, const void *x2) const reflow_node_t *l = (reflow_node_t *)x1; const reflow_node_t *r = (reflow_node_t *)x2; - if (l->re_txg < r->re_txg) - return (-1); - else if (l->re_txg == r->re_txg) - return (0); - - return (1); + return (TREE_CMP(l->re_txg, r->re_txg)); } void From 8bd747816370d1211f7120267389ba43d389d42d Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Thu, 13 Aug 2020 10:37:53 +0300 Subject: [PATCH 4/8] Remove ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH --- include/sys/fs/zfs.h | 1 - module/zfs/vdev_raidz.c | 18 +++++++----------- 2 files changed, 7 insertions(+), 
12 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 1a679c9c84e0..3e7c6ac701e8 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -686,7 +686,6 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" -#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset" #define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a0b6ee41c611..90a89fdc1f78 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3036,7 +3036,7 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg; - re->re_logical_width = vdrz->vd_physical_width - 1; + re->re_logical_width = vdrz->vd_physical_width; avl_add(&vdrz->vre_txgs, re); /* @@ -3459,8 +3459,6 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. 
*/ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); - fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, - vdrz->vd_logical_width); if (vdrz->vn_vre.vre_offset_phys != UINT64_MAX) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET, vdrz->vn_vre.vre_offset_phys); @@ -3492,7 +3490,7 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) void * vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) { - uint64_t nparity, lw, *txgs; + uint64_t nparity, *txgs; uint_t txgs_size; vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); @@ -3513,11 +3511,6 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) vdrz->vd_logical_width = children; vdrz->vd_physical_width = children; - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, - &lw) == 0) { - vdrz->vd_logical_width = lw; - } - /* note, the ID does not exist when creating a pool */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &vdrz->vn_vre.vre_vdev_id); @@ -3539,10 +3532,13 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv) if (error == 0) { for (int i = 0; i < txgs_size; i++) { reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); - re->re_txg = txgs[i]; - re->re_logical_width = vdrz->vd_logical_width + i; + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; avl_add(&vdrz->vre_txgs, re); } + + reflow_node_t *re = avl_first(&vdrz->vre_txgs); + vdrz->vd_logical_width = re->re_logical_width; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, From 15df8613bf7effaa0c6b8e9c21dd414e303ce205 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Thu, 13 Aug 2020 11:37:08 +0300 Subject: [PATCH 5/8] Use BP_PHYSICAL_BIRTH together with zio bp --- module/zfs/vdev_raidz.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 90a89fdc1f78..bb67d5ada50b 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2038,9 +2038,9 @@ vdev_raidz_io_start_read(zio_t *zio, 
raidz_row_t *rr, boolean_t forceparity) } static uint64_t -vdev_raidz_get_width(vdev_raidz_t *vdrz, uint64_t blk_birth) +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, zio_t *zio) { - reflow_node_t *re, lookup = { blk_birth, 0 }; + reflow_node_t *re, lookup = { BP_PHYSICAL_BIRTH(zio->io_bp), 0 }; avl_index_t where; re = avl_find(&vdrz->vre_txgs, &lookup, &where); @@ -2077,9 +2077,10 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; if (vdrz->vd_logical_width != vdrz->vd_physical_width) { - uint64_t width = vdev_raidz_get_width(vdrz, zio->io_bp->blk_birth); + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, zio); if (vdrz->vn_vre.vre_offset != UINT64_MAX || - (zio->io_type == ZIO_TYPE_READ && width != vdrz->vd_physical_width)) { + (zio->io_type == ZIO_TYPE_READ && + logical_width != vdrz->vd_physical_width)) { /* XXX rangelock not needed after expansion completes */ zfs_locked_range_t *lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, @@ -2088,7 +2089,7 @@ vdev_raidz_io_start(zio_t *zio) rm = vdev_raidz_map_alloc_expanded(zio->io_abd, zio->io_size, zio->io_offset, tvd->vdev_ashift, vdrz->vd_physical_width, - width, vdrz->vd_nparity, + logical_width, vdrz->vd_nparity, vdrz->vn_vre.vre_offset_phys); rm->rm_lr = lr; /* From deeff4a25236c4ad985fb06f1ad0cc3bfb32c941 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Thu, 13 Aug 2020 13:28:49 +0300 Subject: [PATCH 6/8] Simplify if expression around vdev_raidz_get_width() --- module/zfs/vdev_raidz.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index bb67d5ada50b..c809dfdd9249 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2079,8 +2079,7 @@ vdev_raidz_io_start(zio_t *zio) if (vdrz->vd_logical_width != vdrz->vd_physical_width) { uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, zio); if (vdrz->vn_vre.vre_offset != UINT64_MAX || - (zio->io_type == ZIO_TYPE_READ && - logical_width != 
vdrz->vd_physical_width)) { /* XXX rangelock not needed after expansion completes */ zfs_locked_range_t *lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, From 488c7ae6d612c2fc5072b9dce9bc70a7fb7baba7 Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Fri, 14 Aug 2020 08:59:49 +0300 Subject: [PATCH 7/8] Add reflow_node_t comment --- module/zfs/vdev_raidz.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index c809dfdd9249..43556530c189 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -196,6 +196,14 @@ vdev_raidz_map_free_vsd(zio_t *zio) } } +/* + * The nodes of the avl tree below are the core of the expanded raidz + * time-dependent geometry logic, allowing BPs stored in reflowed and normal + * format to be mixed depending on the BP's birth txg. A new reflow node is + * added when the first expansion starts and each time an expansion reflow + * completes. re_txg is the txg of the last reflow and re_logical_width is + * the logical width required to read the BP in reflowed format. 
+ */ typedef struct reflow_node { uint64_t re_txg; uint64_t re_logical_width; From a9e972628a3206725f9a9b231e5203326aab690b Mon Sep 17 00:00:00 2001 From: Fedor Uporov Date: Fri, 14 Aug 2020 13:00:06 +0300 Subject: [PATCH 8/8] Add raidz expansion test case --- tests/runfiles/common.run | 3 +- .../tests/functional/raidz/Makefile.am | 3 +- ...dz_expand.ksh => raidz_expand_001_pos.ksh} | 0 .../functional/raidz/raidz_expand_002_pos.ksh | 123 ++++++++++++++++++ 4 files changed, 127 insertions(+), 2 deletions(-) rename tests/zfs-tests/tests/functional/raidz/{raidz_expand.ksh => raidz_expand_001_pos.ksh} (100%) create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b2df6d70444e..a54e5772fa00 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -708,8 +708,9 @@ tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos', - 'raidz_expand.ksh'] + 'raidz_expand_001_pos', 'raidz_expand_002_pos'] tags = ['functional', 'raidz'] +timeout = 1200 [tests/functional/redundancy] tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', diff --git a/tests/zfs-tests/tests/functional/raidz/Makefile.am b/tests/zfs-tests/tests/functional/raidz/Makefile.am index 1068a35eb3d4..bf50764db1ca 100644 --- a/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -6,4 +6,5 @@ dist_pkgdata_SCRIPTS = \ raidz_002_pos.ksh \ raidz_003_pos.ksh \ raidz_004_pos.ksh \ - raidz_expand.ksh + raidz_expand_001_pos.ksh \ + raidz_expand_002_pos.ksh diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/raidz/raidz_expand.ksh rename to tests/zfs-tests/tests/functional/raidz/raidz_expand_001_pos.ksh 
diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh new file mode 100755 index 000000000000..d760cb3c0dd5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_002_pos.ksh @@ -0,0 +1,123 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool attach poolname raidz ...' should attach new device to the pool. +# +# STRATEGY: +# 1. Create block device files for the test raidz pool +# 2.
For each parity value [1..3] +# - create raidz pool with minimum block device files required +# - for each free test block device +# - attach to the pool +# - verify the raidz pool +# - destroy the raidz pool + +typeset -r devs=6 +typeset -r dev_size_mb=512 + +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) + +function cleanup +{ + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done + + log_must set_tunable32 PREFETCH_DISABLE $prefetch_disable +} + +function wait_expand_completion +{ + while zpool status $TESTPOOL | grep 'raidz expand:' | \ + grep 'in progress'; do + sleep 1 + done +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 + +# Disk files which will be used by pool +for i in {0..$(($devs))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +for nparity in 1 2 3; do + raid=raidz$nparity + dir=$TEST_BASE_DIR + pool=$TESTPOOL + opts="-o cachefile=none" + + log_must zpool create -f $opts $pool $raid ${disks[1..$(($nparity+1))]} + log_must zfs set primarycache=metadata $pool + + log_must zfs create $pool/fs + log_must fill_fs /$pool/fs 1 512 100 1024 R + + log_must zfs create -o compress=on $pool/fs2 + log_must fill_fs /$pool/fs2 1 512 100 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $pool/fs3 + log_must fill_fs /$pool/fs3 1 512 100 1024 R + + typeset pool_size=$(get_pool_prop size $pool) + + for disk in ${disks[$(($nparity+1))+1..$devs]}; do + log_must zpool attach $pool ${raid}-0 $disk + + wait_expand_completion + + log_must zpool export $pool + log_must zpool import $opts -d $dir $pool + + typeset disk_attached=$(get_disklist $pool | grep $disk) + if [[ -z $disk_attached ]]; then + log_fail "pool $pool attached disk not found" + fi + + typeset expand_size=$(get_pool_prop size $pool) + if [[ "$expand_size" -le "$pool_size" ]]; then + log_fail 
"pool $pool not expanded" + fi + + pool_size=$expand_size + done + + zpool destroy "$pool" +done + +log_pass "raidz expansion test succeeded." \ No newline at end of file