diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 25abe99e0749..e7df0f0019fe 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -35,6 +35,7 @@ #include <sys/vdev_impl.h> #include <sys/vdev_draid.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/abd.h> #include <sys/fs/zfs.h> @@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void) */ typedef struct mirror_child { vdev_t *mc_vd; + abd_t *mc_abd; uint64_t mc_offset; int mc_error; int mc_load; @@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio) { mirror_child_t *mc = zio->io_private; + /* See scrubbing read comment in vdev_mirror_io_start() */ + if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) + mc->mc_abd = zio->io_abd; + mc->mc_error = zio->io_error; mc->mc_tried = 1; mc->mc_skipped = 0; @@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio) } if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_bp != NULL && - (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* - * For scrubbing reads (if we can verify the - * checksum here, as indicated by io_bp being - * non-NULL) we need to allocate a read buffer for - * each child and issue reads to all children. If - * any child succeeds, it will copy its data into - * zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads we need to allocate a buffer + * for each child and issue reads to all children. + * If we can verify the checksum here, as indicated + * by io_bp being non-NULL, the data will be copied + * into zio->io_data in vdev_mirror_scrub_done(). + * If not, then vdev_mirror_io_done() will compare + * all of the read buffers and return a checksum + * error if they aren't all identical. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio) abd_alloc_sametype(zio->io_abd, zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_mirror_scrub_done, mc)); + zio->io_bp ? vdev_mirror_scrub_done : + vdev_mirror_child_done, mc)); } zio_execute(zio); return; @@ -731,6 +739,7 @@ vdev_mirror_io_done(zio_t *zio) int c; int good_copies = 0; int unexpected_errors = 0; + int last_good_copy = -1; if (mm == NULL) return; @@ -742,6 +751,7 @@ vdev_mirror_io_done(zio_t *zio) if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { + last_good_copy = c; good_copies++; } } @@ -755,7 +765,6 @@ vdev_mirror_io_done(zio_t *zio) * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ - /* XXPOLICY */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. @@ -782,7 +791,6 @@ vdev_mirror_io_done(zio_t *zio) /* * If we don't have a good copy yet, keep trying other children. */ - /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; @@ -794,7 +802,59 @@ vdev_mirror_io_done(zio_t *zio) return; } - /* XXPOLICY */ + /* + * If we're scrubbing but don't have a BP available (because this + * vdev is under a raidz or draid vdev) then the best we can do is + * compare all of the copies read. If they're not identical then + * return a checksum error and the most likely correct data. The + * raidz code will issue a repair I/O if possible. + */ + if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) { + abd_t *last_good_abd; + + ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops || + zio->io_vd->vdev_ops == &vdev_spare_ops); + + if (good_copies > 1) { + last_good_abd = mm->mm_child[last_good_copy].mc_abd; + abd_t *best_abd = NULL; + + for (c = 0; c < last_good_copy; c++) { + mc = &mm->mm_child[c]; + + if (mc->mc_error || !mc->mc_tried) + continue; + + if (abd_cmp(mc->mc_abd, last_good_abd) != 0) + zio->io_error = SET_ERROR(ECKSUM); + + /* + * The distributed spare is always prefered + * by vdev_mirror_child_select() so it's + * considered to be the best candidate. + */ + if (best_abd == NULL && + mc->mc_vd->vdev_ops == + &vdev_draid_spare_ops) { + best_abd = mc->mc_abd; + } + } + + abd_copy(zio->io_abd, best_abd ? best_abd : + last_good_abd, zio->io_size); + + } else if (good_copies == 1) { + last_good_abd = mm->mm_child[last_good_copy].mc_abd; + abd_copy(zio->io_abd, last_good_abd, zio->io_size); + } + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + abd_free(mc->mc_abd); + mc->mc_abd = NULL; + } + } + if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index ae0777b3dcc1..bc343c6254b7 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1885,6 +1885,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } + + if (rc->rc_force_repair) + unexpected_errors++; } /* @@ -2249,9 +2252,20 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + /* + * If scrubbing and a replacing/sparing child vdev determined + * that not all of its children have an identical copy of the + * data, then clear the error so the column is treated like + * any other read and force a repair to correct the damage. + */ + if (rc->rc_error == ECKSUM) { + ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + rc->rc_force_repair = 1; + rc->rc_error = 0; + } + if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 89ee0d3cb7b6..a4ec27a368ac 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -763,7 +763,8 @@ tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2', - 'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1', + 'redundancy_draid3', 'redundancy_draid_damaged1', + 'redundancy_draid_damaged2', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ddb9bb7eed1d..bf7cf22b61fc 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -226,9 +226,6 @@ maybe = { 'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason], 'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946], 'projectquota/setup': ['SKIP', exec_reason], - 'redundancy/redundancy_004_neg': ['FAIL', 7290], - 'redundancy/redundancy_draid_spare1': ['FAIL', known_reason], - 'redundancy/redundancy_draid_spare3': ['FAIL', known_reason], 'removal/removal_condense_export': ['FAIL', known_reason], 'reservation/reservation_008_pos': ['FAIL', 7741], 'reservation/reservation_018_pos': ['FAIL', 5642], diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e65a8bba2c2c..4c5b1121293e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1632,7 +1632,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ - functional/redundancy/redundancy_draid_damaged.ksh \ + functional/redundancy/redundancy_draid_damaged1.ksh \ + functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh similarity index 84% rename from tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh rename to tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh index 9b3be9f4e3f7..da2d58eef6f3 100755 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh @@ -85,28 +85,13 @@ function test_sequential_resilver # <pool> <parity> <dir> for (( i=0; i<$nparity; i=i+1 )); do spare=draid${nparity}-0-$i - zpool status $pool - zpool replace -fsw $pool $dir/dev-$i $spare - zpool status $pool + log_must zpool replace -fsw $pool $dir/dev-$i $spare done log_must zpool scrub -w $pool + log_must zpool status $pool - # When only a single child was overwritten the sequential resilver - # can fully repair the damange from parity and the scrub will have - # nothing to repair. When multiple children are silently damaged - # the sequential resilver will calculate the wrong data since only - # the parity information is used and it cannot be verified with - # the checksum. However, since only the resilvering devices are - # written to with the bad data a subsequent scrub will be able to - # fully repair the pool. - # - if [[ $nparity == 1 ]]; then - log_must check_pool_status $pool "scan" "repaired 0B" - else - log_mustnot check_pool_status $pool "scan" "repaired 0B" - fi - + log_mustnot check_pool_status $pool "scan" "repaired 0B" log_must check_pool_status $pool "errors" "No known data errors" log_must check_pool_status $pool "scan" "with 0 errors" } diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh new file mode 100755 index 000000000000..8e06db9bad99 --- /dev/null +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh @@ -0,0 +1,157 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# When sequentially resilvering a dRAID pool to a distributed spare +# silent damage to an online vdev in a replacing or spare mirror vdev +# is not expected to be repaired. Not only does the rebuild have no +# reason to suspect the silent damage but even if it did there's no +# checksum available to determine the correct copy and make the repair. +# However, the subsequent scrub should detect and repair any damage. +# +# STRATEGY: +# 1. Create block device files for the test draid pool +# 2. For each parity value [1..3] +# a. Create a draid pool +# b. Fill it with some directories/files +# c. Systematically damage and replace three devices by: +# - Overwrite the device +# - Replace the damaged vdev with a distributed spare +# - Scrub the pool and verify repair IO is issued +# d. Detach the distributed spares +# e. Scrub the pool and verify there was nothing to repair +# f. Destroy the draid pool +# + +typeset -r devs=7 +typeset -r dev_size_mb=512 +typeset -a disks + +prefetch_disable=$(get_tunable PREFETCH_DISABLE) +rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED) + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + rm -f "$TEST_BASE_DIR/dev-$i" + done + + set_tunable32 PREFETCH_DISABLE $prefetch_disable + set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled +} + +log_onexit cleanup + +log_must set_tunable32 PREFETCH_DISABLE 1 +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +# Disk files which will be used by pool +for i in {0..$(($devs - 1))}; do + device=$TEST_BASE_DIR/dev-$i + log_must truncate -s ${dev_size_mb}M $device + disks[${#disks[*]}+1]=$device +done + +# Disk file which will be attached +log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs + +dir=$TEST_BASE_DIR + +for nparity in 1 2 3; do + raid=draid${nparity}:3s + + log_must zpool create -f -O compression=off -o cachefile=none \ + $TESTPOOL $raid ${disks[@]} + # log_must zfs set primarycache=metadata $TESTPOOL + + log_must zfs create $TESTPOOL/fs + log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R + + log_must zfs create -o compress=on $TESTPOOL/fs2 + log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R + + log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3 + log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R + + log_must zpool export $TESTPOOL + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + for nspare in 0 1 2; do + damaged=$dir/dev-${nspare} + spare=draid${nparity}-0-${nspare} + + log_must zpool export $TESTPOOL + log_must dd conv=notrunc if=/dev/zero of=$damaged \ + bs=1M seek=4 count=$(($dev_size_mb-4)) + log_must zpool import -o cachefile=none -d $dir $TESTPOOL + + log_must zpool replace -fsw $TESTPOOL $damaged $spare + + # Scrub the pool after the sequential resilver and verify + # that the silent damage was repaired by the scrub. + log_must zpool scrub -w $TESTPOOL + log_must zpool status $TESTPOOL + log_must check_pool_status $TESTPOOL "errors" \ + "No known data errors" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B" + done + + for nspare in 0 1 2; do + log_must check_vdev_state $TESTPOOL \ + spare-${nspare} "ONLINE" + log_must check_vdev_state $TESTPOOL \ + ${dir}/dev-${nspare} "ONLINE" + log_must check_vdev_state $TESTPOOL \ + draid${nparity}-0-${nspare} "ONLINE" + done + + # Detach the distributed spares and scrub the pool again to + # verify no damage remained on the originally corrupted vdevs. + for nspare in 0 1 2; do + log_must zpool detach $TESTPOOL draid${nparity}-0-${nspare} + done + + log_must zpool clear $TESTPOOL + log_must zpool scrub -w $TESTPOOL + log_must zpool status $TESTPOOL + + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + + log_must zpool destroy "$TESTPOOL" +done + +log_pass "draid damaged device scrub test succeeded."