From 84adc455ca2f674888125b05d19ec310d1d5a279 Mon Sep 17 00:00:00 2001 From: Gvozden Neskovic Date: Sun, 20 Mar 2016 21:39:48 +0100 Subject: [PATCH] RAIDZ ABD changes new iterator functions for raidz gen and rec (max 6 mapped pages at once) abd iter now handles irq_save/restore if needed make abd iterator precise: iter.length always shows real value (0 when exhausted) pass size on abd_get_offset --- include/sys/abd.h | 24 ++-- module/zfs/abd.c | 240 +++++++++++++++++++++++++++++++++++----- module/zfs/vdev_raidz.c | 18 ++- module/zfs/zfs_fm.c | 4 +- module/zfs/zio.c | 17 +-- 5 files changed, 253 insertions(+), 50 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 10739c52de61..6c6cc154795c 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -56,13 +56,13 @@ typedef struct arc_buf_data { }; } abd_t; -#define ABD_F_SCATTER (0) /* abd is scatter */ -#define ABD_F_LINEAR (1) /* abd is linear */ -#define ABD_F_OWNER (1<<1) /* abd owns the buffer */ -#define ABD_F_HIGHMEM (1<<2) /* abd uses highmem */ +#define ABD_F_SCATTER (1U << 0) /* abd is scatter */ +#define ABD_F_LINEAR (1U << 1) /* abd is linear */ +#define ABD_F_OWNER (1U << 2) /* abd owns the buffer */ +#define ABD_F_HIGHMEM (1U << 3) /* abd uses highmem */ -#define ABD_IS_SCATTER(abd) (!((abd)->abd_flags & ABD_F_LINEAR)) -#define ABD_IS_LINEAR(abd) (!ABD_IS_SCATTER(abd)) +#define ABD_IS_SCATTER(abd) (!!((abd)->abd_flags & ABD_F_SCATTER)) +#define ABD_IS_LINEAR(abd) (!!((abd)->abd_flags & ABD_F_LINEAR)) #define ASSERT_ABD_SCATTER(abd) ASSERT(ABD_IS_SCATTER(abd)) #define ASSERT_ABD_LINEAR(abd) ASSERT(ABD_IS_LINEAR(abd)) @@ -89,7 +89,7 @@ abd_t *_abd_alloc_scatter(size_t, int); #define abd_alloc_meta_scatter(s) _abd_alloc_scatter(s, 0) abd_t *abd_alloc_linear(size_t); void abd_free(abd_t *, size_t); -abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_offset(abd_t *, size_t, size_t); abd_t *abd_get_from_buf(void *, size_t); void abd_put(abd_t *); @@ -111,6 +111,16 @@ int abd_cmp(abd_t *, abd_t *, size_t); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); void *abd_buf_segment(abd_t *, size_t, size_t); + +/* + * ABD operations for RAIDZ vdev + */ +void abd_raidz_gen_iterate(abd_t **, abd_t *, ssize_t, ssize_t, const unsigned, + void (*)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **, abd_t **, ssize_t, const unsigned, + void (*)(void **, const size_t, void **, const unsigned *), + const unsigned *); + /* * abd_array_off - returns an object in an array contained in @abd * diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 53e1e1f52eb4..3e613e6a7dd5 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -64,6 +64,8 @@ struct page; #define virt_to_page(addr) \ ((struct page *)(addr)) +#define PageHighMem(p) (B_FALSE) + typedef unsigned int gfp_t; /* * scatterlist @@ -150,8 +152,10 @@ struct abd_miter { }; int nents; /* num of sg entries */ int rw; /* r/w access, whether to flush cache */ + int size_left; /* size left to be accessed */ #ifndef HAVE_1ARG_KMAP_ATOMIC int km_type; /* KM_USER0 or KM_USER1 */ + unsigned long irq_flags; /* save irq if km_type > KM_USER1 */ #endif }; @@ -177,17 +181,18 @@ abd_miter_init_km(struct abd_miter *aiter, abd_t *abd, int rw, int km) } else { aiter->is_linear = 0; aiter->sg = abd->abd_sgl; - aiter->length = aiter->sg->length - abd->abd_offset; + aiter->length = MIN(aiter->sg->length - abd->abd_offset, + abd->abd_size); } aiter->offset = abd->abd_offset; aiter->nents = abd->abd_nents; aiter->rw 
= rw; + aiter->size_left = abd->abd_size; #ifndef HAVE_1ARG_KMAP_ATOMIC aiter->km_type = km; #endif } - #define abd_miter_init(a, abd, rw) abd_miter_init_km(a, abd, rw, 0) #define abd_miter_init2(a, aabd, arw, b, babd, brw) \ do { \ @@ -217,20 +222,32 @@ abd_miter_map_x(struct abd_miter *aiter, int atomic) if (!aiter->nents) return; + if (!aiter->length) + return; if (aiter->is_linear) { paddr = aiter->buf; } else { - ASSERT(aiter->length == aiter->sg->length - aiter->offset); - - if (atomic) + if (atomic) { +#if !defined(HAVE_1ARG_KMAP_ATOMIC) + /* + * Disable irqs if using slot above KM_USER1 and + * the page is HighMem + */ + if ((aiter->km_type > 1) && + PageHighMem(sg_page(aiter->sg))) { + local_irq_save(aiter->irq_flags); + } +#endif paddr = zfs_kmap_atomic(sg_page(aiter->sg), - (aiter->km_type == 0 ? KM_USER0 : - (aiter->km_type == 1 ? KM_USER1 : KM_BIO_SRC_IRQ))); - else + KM_USER0 + aiter->km_type); + } else { paddr = kmap(sg_page(aiter->sg)); + } + ASSERT(paddr != NULL); } aiter->addr = paddr + aiter->offset; + VERIFY(aiter->addr); } /* @@ -246,6 +263,8 @@ abd_miter_unmap_x(struct abd_miter *aiter, int atomic) if (!aiter->nents) return; + if (!aiter->length) + return; ASSERT(aiter->addr); @@ -256,9 +275,14 @@ abd_miter_unmap_x(struct abd_miter *aiter, int atomic) if (atomic) { if (aiter->rw == ABD_MITER_W) flush_kernel_dcache_page(sg_page(aiter->sg)); - zfs_kunmap_atomic(paddr, - (aiter->km_type == 0 ? KM_USER0 : - (aiter->km_type == 1 ? KM_USER1 : KM_BIO_SRC_IRQ))); + zfs_kunmap_atomic(paddr, KM_USER0 + aiter->km_type); + +#if !defined(HAVE_1ARG_KMAP_ATOMIC) + if ((aiter->km_type > 1) && + PageHighMem(sg_page(aiter->sg))) { + local_irq_restore(aiter->irq_flags); + } +#endif } else { kunmap(sg_page(aiter->sg)); } @@ -312,30 +336,38 @@ static int abd_miter_advance(struct abd_miter *aiter, int offset) { ASSERT(!aiter->addr); + ASSERT3S(offset, >=, 0); if (!aiter->nents) return (0); + aiter->size_left = MAX(aiter->size_left - offset, 0); + + /* Exhausted if size_left drops to zero */ + if (!aiter->size_left) { + aiter->length = 0; + aiter->nents = 0; + return (0); + } + aiter->offset += offset; + if (aiter->is_linear) { - aiter->length -= offset; - if (aiter->length <= 0) { - aiter->nents--; - aiter->length = 0; - return (0); - } + aiter->length = aiter->size_left; } else { while (aiter->offset >= aiter->sg->length) { aiter->offset -= aiter->sg->length; aiter->nents--; aiter->sg = sg_next(aiter->sg); - if (!aiter->nents) { - aiter->length = 0; - return (0); - } + ASSERT3S(aiter->nents, >, 0); } - aiter->length = aiter->sg->length - aiter->offset; + ASSERT3S(aiter->offset, >=, 0); + + aiter->length = MIN(aiter->sg->length - aiter->offset, + aiter->size_left); } + ASSERT3S(aiter->length, >, 0); + return (1); } @@ -496,7 +528,6 @@ abd_iterate_func3(abd_t *abd0, abd_t *abd1, abd_t *abd2, size_t size, size_t len; int stop; struct abd_miter aiter0, aiter1, aiter2; - unsigned long flags; ABD_CHECK(abd0); ABD_CHECK(abd1); @@ -510,8 +541,6 @@ abd_iterate_func3(abd_t *abd0, abd_t *abd1, abd_t *abd2, size_t size, &aiter1, abd1, ABD_MITER_W, &aiter2, abd2, ABD_MITER_W); - /* We are using KM_BIO_SRC_IRQ so we need to disable irq */ - local_irq_save(flags); while (size > 0) { len = MIN(aiter0.length, size); len = MIN(aiter1.length, len); @@ -539,7 +568,161 @@ abd_iterate_func3(abd_t *abd0, abd_t *abd1, abd_t *abd2, size_t size, abd_miter_advance(&aiter1, len); abd_miter_advance(&aiter2, len); } - local_irq_restore(flags); +} + +/* + * Iterate over code ABDs and a data ABD and call 
@func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen must behave identically whether it is handed
+ * linear or scatter memory
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+	ssize_t csize, ssize_t dsize, const unsigned parity,
+	void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+	int i;
+	ssize_t len, dlen;
+	struct abd_miter caiters[3];
+	struct abd_miter daiter;
+	void *caddrs[3];
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++) {
+		abd_miter_init_km(&caiters[i], cabds[i], ABD_MITER_W, i);
+	}
+	if (dabd)
+		abd_miter_init_km(&daiter, dabd, ABD_MITER_R, parity);
+
+	while (csize > 0) {
+		len = csize;
+		switch (parity) {
+		case 3:
+			len = MIN(caiters[2].length, len);
+		case 2:
+			len = MIN(caiters[1].length, len);
+		case 1:
+			len = MIN(caiters[0].length, len);
+		}
+
+		if (dabd && (dsize > 0)) {
+			/* this needs precise iter.length */
+			len = MIN(daiter.length, len);
+			dlen = len;
+		} else {
+			dlen = 0;
+		}
+
+		/* must make progress */
+		ASSERT3S(len, >, 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 512 (raidz).
+		 */
+		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+		if (dabd && (dsize > 0)) {
+			abd_miter_map_atomic(&daiter);
+		}
+
+		for (i = 0; i < parity; i++) {
+			abd_miter_map_atomic(&caiters[i]);
+			caddrs[i] = caiters[i].addr;
+		}
+
+		func_raidz_gen(caddrs, (dlen > 0) ? daiter.addr : NULL,
+		    len, dlen);
+
+		for (i = parity-1; i >= 0; i--) {
+			abd_miter_unmap_atomic(&caiters[i]);
+			abd_miter_advance(&caiters[i], len);
+		}
+
+		if (dabd && (dsize > 0)) {
+			abd_miter_unmap_atomic(&daiter);
+			abd_miter_advance(&daiter, dlen);
+			dsize -= dlen;
+		}
+
+		csize -= len;
+
+		ASSERT3S(dsize, >=, 0);
+		ASSERT3S(csize, >=, 0);
+	}
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. The function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in the target columns; it
+ * reconstructs the data and overwrites the target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+	ssize_t tsize, const unsigned parity,
+	void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+	const unsigned *mul),
+	const unsigned *mul)
+{
+	int i;
+	ssize_t len;
+	struct abd_miter citers[3];
+	struct abd_miter xiters[3];
+	void *caddrs[3], *xaddrs[3];
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++) {
+		abd_miter_init_km(&citers[i], cabds[i], ABD_MITER_R, 2*i);
+		abd_miter_init_km(&xiters[i], tabds[i], ABD_MITER_W, 2*i+1);
+	}
+
+	while (tsize > 0) {
+		len = tsize;
+		switch (parity) {
+		case 3:
+			len = MIN(xiters[2].length, len);
+			len = MIN(citers[2].length, len);
+		case 2:
+			len = MIN(xiters[1].length, len);
+			len = MIN(citers[1].length, len);
+		case 1:
+			len = MIN(xiters[0].length, len);
+			len = MIN(citers[0].length, len);
+		}
+		/* must make progress */
+		ASSERT3S(len, >, 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 512 (raidz).
+ */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + for (i = 0; i < parity; i++) { + abd_miter_map_atomic(&citers[i]); + abd_miter_map_atomic(&xiters[i]); + caddrs[i] = citers[i].addr; + xaddrs[i] = xiters[i].addr; + } + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_miter_unmap_atomic(&xiters[i]); + abd_miter_unmap_atomic(&citers[i]); + abd_miter_advance(&xiters[i], len); + abd_miter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } } /* @@ -1114,17 +1297,18 @@ static kmem_cache_t *abd_struct_cache = NULL; * not be freed before any of its derived ABD. */ abd_t * -abd_get_offset(abd_t *sabd, size_t off) +abd_get_offset(abd_t *sabd, size_t size, size_t off) { abd_t *abd; ABD_CHECK(sabd); ASSERT(off <= sabd->abd_size); + ASSERT3S(sabd->abd_size, >=, size + off); abd = kmem_cache_alloc(abd_struct_cache, KM_PUSHPAGE); abd_set_magic(abd); - abd->abd_size = sabd->abd_size - off; + abd->abd_size = size; abd->abd_flags = sabd->abd_flags & ~ABD_F_OWNER; if (ABD_IS_LINEAR(sabd)) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 9c08d3044652..a4b48679e863 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -344,7 +344,8 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, abd_t *good_data) for (; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_data); rm->rm_col[x].rc_data = - abd_get_offset(good_data, offset); + abd_get_offset(good_data, + rm->rm_col[x].rc_size, offset); offset += rm->rm_col[x].rc_size; } @@ -361,7 +362,8 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, abd_t *good_data) for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_data); rm->rm_col[x].rc_data = abd_get_offset( - rm->rm_datacopy, offset); + rm->rm_datacopy, rm->rm_col[x].rc_size, + offset); offset += rm->rm_col[x].rc_size; } } @@ -373,7 +375,8 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, abd_t *good_data) offset = 0; for (x = rm->rm_firstdatacol; x < c; x++) offset += rm->rm_col[x].rc_size; - good = abd_get_offset(good_data, offset); + good = abd_get_offset(good_data, rm->rm_col[x].rc_size, + offset); need_put = 1; } @@ -430,7 +433,8 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_datacopy, offset); + abd_t *tmp = abd_get_offset(rm->rm_datacopy, col->rc_size, + offset); abd_copy(tmp, col->rc_data, col->rc_size); abd_put(col->rc_data); @@ -555,11 +559,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, abd_alloc_scatter(rm->rm_col[c].rc_size); } - rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, 0); + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, + rm->rm_col[c].rc_size, 0); off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, off); + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, + rm->rm_col[c].rc_size, off); off += rm->rm_col[c].rc_size; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 5a09bb8788e7..07426a199c03 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -684,8 +684,8 @@ build_histo(zfs_ecksum_info_t *eip, abd_t *goodbuf, abd_t *badbuf, size_t len = (end - start) * sizeof (uint64_t); /* abd_iterate_func2 don't takes offset, so do get_offset */ - g = abd_get_offset(goodbuf, start * sizeof (uint64_t)); - b = abd_get_offset(badbuf, start * sizeof (uint64_t)); + g = 
abd_get_offset(goodbuf, len, start * sizeof (uint64_t)); + b = abd_get_offset(badbuf, len, start * sizeof (uint64_t)); bh.range = range; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7353f3c3f5f9..25e59ff88bf8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1765,7 +1765,8 @@ zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + return (zio_read(pio, pio->io_spa, bp, + abd_get_offset(data, BP_GET_PSIZE(bp), offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); @@ -1794,7 +1795,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { - abd_t *buf = abd_get_offset(data, offset); + abd_t *buf = abd_get_offset(data, BP_GET_PSIZE(bp), + offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); @@ -1809,9 +1811,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - abd_get_offset(data, offset), BP_GET_PSIZE(bp), - zio_gang_issue_func_done, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + abd_get_offset(data, BP_GET_PSIZE(bp), offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); } return (zio); @@ -2109,8 +2112,8 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - abd_get_offset(pio->io_data, pio->io_size - resid), lsize, - &zp, zio_write_gang_member_ready, NULL, + abd_get_offset(pio->io_data, lsize, pio->io_size - resid), + lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); }
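
Reviewer notes follow; the sketches below are illustrative and not part of the diff.

On the abd_get_offset() signature change: the size of the derived ABD is now passed explicitly, so the function can assert size + off <= sabd->abd_size and the new precise iterators know exactly where the child ends instead of inheriting the parent's remaining size. A minimal caller sketch (split_columns, col_size and ncols are hypothetical names):

	static void
	split_columns(abd_t *parent, abd_t **cols, size_t col_size, int ncols)
	{
		size_t off = 0;
		int c;

		for (c = 0; c < ncols; c++) {
			/* new argument order: (source, child size, offset) */
			cols[c] = abd_get_offset(parent, col_size, off);
			off += col_size;
		}
		/* each child needs abd_put() before the parent is freed */
	}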
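On the abd_raidz_gen_iterate() contract: the sketch below shows a single-parity (P, plain XOR) generation callback with the expected signature. It is a simplified scalar illustration; the real callbacks belong to the raidz math code and are not part of this diff. Note that, per the iterator above, dsize is either 0 or equal to csize on each call, and len is asserted to be a multiple of 512:

	/*
	 * Illustrative func_raidz_gen callback: XOR the mapped data
	 * segment into the mapped P parity segment. c[] holds the
	 * mapped code segments, dc the mapped data segment (NULL when
	 * dsize == 0).
	 */
	static void
	raidz_gen_p_sketch(void **c, const void *dc, size_t csize,
	    size_t dsize)
	{
		uint64_t *p = c[0];
		const uint64_t *d = dc;
		size_t i;

		(void) csize;
		for (i = 0; i < dsize / sizeof (uint64_t); i++)
			p[i] ^= d[i];
	}

A hypothetical caller (parity_abd and data_abd are made-up names) would then drive one data column at a time:

	abd_raidz_gen_iterate(&parity_abd, data_abd, csize, dsize, 1,
	    raidz_gen_p_sketch);

The iterator takes care of the atomic mappings, the irq protection and the segment alignment; the callback only ever sees plain pointers.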
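On the "precise iterator" change: the userspace model below captures only the new invariant. size_left bounds length at every step, so length reaches exactly 0 once abd_size bytes are consumed, even when the last scatterlist entry physically extends further; loops can use iter.length == 0 as the sole exhaustion test. All names are illustrative, and the model assumes equally sized segments where the kernel code walks real sg entries:

	#include <stddef.h>

	#define	SEG	4096	/* model segment (page) size */

	struct miter_model {
		size_t off;		/* offset in the current segment */
		size_t size_left;	/* ABD bytes not yet consumed */
		size_t length;		/* mappable now; 0 == exhausted */
	};

	/* Mirrors the reworked abd_miter_advance(); 0 means exhausted. */
	static int
	model_advance(struct miter_model *m, size_t n)
	{
		m->size_left = (n >= m->size_left) ? 0 : m->size_left - n;
		if (m->size_left == 0) {
			m->length = 0;
			return (0);
		}
		m->off += n;
		while (m->off >= SEG)	/* hop across segment boundaries */
			m->off -= SEG;
		m->length = SEG - m->off;
		if (m->length > m->size_left)
			m->length = m->size_left;
		return (1);
	}

This is what the "this needs precise iter.length" comment in abd_raidz_gen_iterate() relies on: length never overstates what is left, so MIN(daiter.length, len) is a safe step bound.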
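On the irq save/restore: with the legacy two-argument kmap_atomic() (the !HAVE_1ARG_KMAP_ATOMIC branches), atomic-kmap slots are a fixed per-CPU resource, and the slots this patch uses above KM_USER1 can also be claimed from interrupt context. An interrupt arriving between map and unmap could reuse the slot and corrupt the mapping, hence the local_irq_save()/local_irq_restore() pair for highmem pages mapped with km_type > 1. Reduced to a pattern (a sketch assuming the legacy API; error handling omitted):

	unsigned long flags;
	void *va;

	if (km_type > 1 && PageHighMem(page))
		local_irq_save(flags);	/* slot shared with irq users */
	va = kmap_atomic(page, KM_USER0 + km_type);
	/* ... access the mapped page ... */
	kunmap_atomic(va, KM_USER0 + km_type);
	if (km_type > 1 && PageHighMem(page))
		local_irq_restore(flags);

Lowmem pages are excluded because kmap_atomic() on them reduces to page_address() and consumes no slot, which is why PageHighMem() is part of the condition. The parity limit of 3 keeps the worst case at 6 concurrently mapped pages (3 code + 3 target columns in abd_raidz_rec_iterate()), matching the "max 6 mapped pages at once" note in the commit message.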