Scrub mirror children without BPs - 2.1.6 backport #13642

Merged
84 changes: 72 additions & 12 deletions module/zfs/vdev_mirror.c
@@ -35,6 +35,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>

@@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
*/
typedef struct mirror_child {
vdev_t *mc_vd;
abd_t *mc_abd;
uint64_t mc_offset;
int mc_error;
int mc_load;
@@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio)
{
mirror_child_t *mc = zio->io_private;

/* See scrubbing read comment in vdev_mirror_io_start() */
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
mc->mc_abd = zio->io_abd;

mc->mc_error = zio->io_error;
mc->mc_tried = 1;
mc->mc_skipped = 0;
@@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio)
}

if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_bp != NULL &&
(zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
* For scrubbing reads (if we can verify the
* checksum here, as indicated by io_bp being
* non-NULL) we need to allocate a read buffer for
* each child and issue reads to all children. If
* any child succeeds, it will copy its data into
* zio->io_data in vdev_mirror_scrub_done.
* For scrubbing reads we need to allocate a buffer
* for each child and issue reads to all children.
* If we can verify the checksum here, as indicated
* by io_bp being non-NULL, the data will be copied
* into zio->io_data in vdev_mirror_scrub_done().
* If not, then vdev_mirror_io_done() will compare
* all of the read buffers and return a checksum
* error if they aren't all identical.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio)
abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
zio->io_bp ? vdev_mirror_scrub_done :
vdev_mirror_child_done, mc));
}
zio_execute(zio);
return;
@@ -731,6 +739,7 @@ vdev_mirror_io_done(zio_t *zio)
int c;
int good_copies = 0;
int unexpected_errors = 0;
int last_good_copy = -1;

if (mm == NULL)
return;
@@ -742,6 +751,7 @@
if (!mc->mc_skipped)
unexpected_errors++;
} else if (mc->mc_tried) {
last_good_copy = c;
good_copies++;
}
}
@@ -755,7 +765,6 @@
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
/* XXPOLICY */
if (good_copies != mm->mm_children) {
/*
* Always require at least one good copy.
@@ -782,7 +791,6 @@
/*
* If we don't have a good copy yet, keep trying other children.
*/
/* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
@@ -794,7 +802,59 @@
return;
}

/* XXPOLICY */
/*
* If we're scrubbing but don't have a BP available (because this
* vdev is under a raidz or draid vdev) then the best we can do is
* compare all of the copies read. If they're not identical then
* return a checksum error and the most likely correct data. The
* raidz code will issue a repair I/O if possible.
*/
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
abd_t *last_good_abd;

ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
zio->io_vd->vdev_ops == &vdev_spare_ops);

if (good_copies > 1) {
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
abd_t *best_abd = NULL;

for (c = 0; c < last_good_copy; c++) {
mc = &mm->mm_child[c];

if (mc->mc_error || !mc->mc_tried)
continue;

if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
zio->io_error = SET_ERROR(ECKSUM);

/*
* The distributed spare is always preferred
* by vdev_mirror_child_select() so it's
* considered to be the best candidate.
*/
if (best_abd == NULL &&
mc->mc_vd->vdev_ops ==
&vdev_draid_spare_ops) {
best_abd = mc->mc_abd;
}
}

abd_copy(zio->io_abd, best_abd ? best_abd :
last_good_abd, zio->io_size);

} else if (good_copies == 1) {
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
abd_copy(zio->io_abd, last_good_abd, zio->io_size);
}

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
abd_free(mc->mc_abd);
mc->mc_abd = NULL;
}
}

if (good_copies == 0) {
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
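To summarize the new vdev_mirror.c control flow for readers skimming the diff, here is a condensed sketch (abbreviated from the patch above, not a literal copy) of how vdev_mirror_io_done() now handles a scrubbing read that has no block pointer to verify against:

```c
/*
 * Condensed from the diff above: the BP-less scrub path in
 * vdev_mirror_io_done().  Locals (mm, mc, c, last_good_copy)
 * are as declared in the full function.
 */
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
	abd_t *last_good_abd = mm->mm_child[last_good_copy].mc_abd;

	/*
	 * With no checksum available, the best we can do is require
	 * that every child that read successfully returned identical
	 * data.  Any mismatch is reported as ECKSUM; the parent
	 * raidz/draid vdev decides how to repair it.
	 */
	for (c = 0; c < last_good_copy; c++) {
		mc = &mm->mm_child[c];
		if (mc->mc_error || !mc->mc_tried)
			continue;
		if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
			zio->io_error = SET_ERROR(ECKSUM);
	}

	/* Hand the most-trusted copy back to the parent I/O. */
	abd_copy(zio->io_abd, last_good_abd, zio->io_size);
}
```

The full patch additionally prefers a distributed spare's copy (best_abd) when one of the children is a dRAID spare, and frees each child's scratch buffer before returning.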
18 changes: 16 additions & 2 deletions module/zfs/vdev_raidz.c
@@ -1791,6 +1791,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}

if (rc->rc_force_repair)
unexpected_errors++;
}

/*
@@ -2155,9 +2158,20 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
/*
* If scrubbing and a replacing/sparing child vdev determined
* that not all of its children have an identical copy of the
* data, then clear the error so the column is treated like
* any other read and force a repair to correct the damage.
*/
if (rc->rc_error == ECKSUM) {
ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
rc->rc_force_repair = 1;
rc->rc_error = 0;
}

if (rc->rc_error) {
if (c < rr->rr_firstdatacol)
parity_errors++;
else
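On the raidz/draid side, the handshake is the mirror child reporting ECKSUM even though it has no BP of its own. A condensed sketch of the new handling (drawn from the diff above):

```c
/*
 * Condensed from the diff above: in
 * vdev_raidz_io_done_reconstruct_known_missing(), an ECKSUM from a
 * replacing/sparing mirror child means its copies disagreed during
 * a scrub.  Log the checksum error, remember to force a repair
 * write, and clear the error so the column reconstructs normally.
 */
if (rc->rc_error == ECKSUM) {
	ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
	vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
	rc->rc_force_repair = 1;	/* counted in unexpected_errors */
	rc->rc_error = 0;
}
```

Setting rc_force_repair is what makes vdev_raidz_io_done_verified() treat the column as damaged, so the scrub rewrites it even though the read itself succeeded.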
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
@@ -747,7 +747,8 @@ tags = ['functional', 'raidz']

[tests/functional/redundancy]
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
'redundancy_draid3', 'redundancy_draid_damaged1',
'redundancy_draid_damaged2', 'redundancy_draid_spare1',
'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
'redundancy_raidz3', 'redundancy_stripe']
2 changes: 0 additions & 2 deletions tests/test-runner/bin/zts-report.py.in
@@ -244,8 +244,6 @@ maybe = {
'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason],
'pool_checkpoint/checkpoint_discard_busy': ['FAIL', '11946'],
'projectquota/setup': ['SKIP', exec_reason],
'redundancy/redundancy_004_neg': ['FAIL', '7290'],
'redundancy/redundancy_draid_spare3': ['SKIP', known_reason],
'removal/removal_condense_export': ['FAIL', known_reason],
'reservation/reservation_008_pos': ['FAIL', '7741'],
'reservation/reservation_018_pos': ['FAIL', '5642'],
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/redundancy/Makefile.am
@@ -6,7 +6,8 @@ dist_pkgdata_SCRIPTS = \
redundancy_draid1.ksh \
redundancy_draid2.ksh \
redundancy_draid3.ksh \
redundancy_draid_damaged.ksh \
redundancy_draid_damaged1.ksh \
redundancy_draid_damaged2.ksh \
redundancy_draid_spare1.ksh \
redundancy_draid_spare2.ksh \
redundancy_draid_spare3.ksh \
tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh (renamed from redundancy_draid_damaged.ksh)
@@ -89,22 +89,9 @@ function test_sequential_resilver # <pool> <parity> <dir>
done

log_must zpool scrub -w $pool
log_must zpool status $pool

# When only a single child was overwritten the sequential resilver
# can fully repair the damage from parity and the scrub will have
# nothing to repair. When multiple children are silently damaged
# the sequential resilver will calculate the wrong data since only
# the parity information is used and it cannot be verified with
# the checksum. However, since only the resilvering devices are
# written to with the bad data a subsequent scrub will be able to
# fully repair the pool.
#
if [[ $nparity == 1 ]]; then
log_must check_pool_status $pool "scan" "repaired 0B"
else
log_mustnot check_pool_status $pool "scan" "repaired 0B"
fi

log_mustnot check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
}
157 changes: 157 additions & 0 deletions tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
@@ -0,0 +1,157 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib

#
# DESCRIPTION:
# When sequentially resilvering a dRAID pool to a distributed spare
# silent damage to an online vdev in a replacing or spare mirror vdev
# is not expected to be repaired. Not only does the rebuild have no
# reason to suspect the silent damage but even if it did there's no
# checksum available to determine the correct copy and make the repair.
# However, the subsequent scrub should detect and repair any damage.
#
# STRATEGY:
# 1. Create block device files for the test draid pool
# 2. For each parity value [1..3]
# a. Create a draid pool
# b. Fill it with some directories/files
# c. Systematically damage and replace three devices by:
# - Overwrite the device
# - Replace the damaged vdev with a distributed spare
# - Scrub the pool and verify repair IO is issued
# d. Detach the distributed spares
# e. Scrub the pool and verify there was nothing to repair
# f. Destroy the draid pool
#

typeset -r devs=7
typeset -r dev_size_mb=512
typeset -a disks

prefetch_disable=$(get_tunable PREFETCH_DISABLE)
rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)

function cleanup
{
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"

for i in {0..$devs}; do
rm -f "$TEST_BASE_DIR/dev-$i"
done

set_tunable32 PREFETCH_DISABLE $prefetch_disable
set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
}

log_onexit cleanup

log_must set_tunable32 PREFETCH_DISABLE 1
log_must set_tunable32 REBUILD_SCRUB_ENABLED 0

# Disk files which will be used by pool
for i in {0..$(($devs - 1))}; do
device=$TEST_BASE_DIR/dev-$i
log_must truncate -s ${dev_size_mb}M $device
disks[${#disks[*]}+1]=$device
done

# Disk file which will be attached
log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs

dir=$TEST_BASE_DIR

for nparity in 1 2 3; do
raid=draid${nparity}:3s

log_must zpool create -f -O compression=off -o cachefile=none \
$TESTPOOL $raid ${disks[@]}
# log_must zfs set primarycache=metadata $TESTPOOL

log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 1 256 10 1024 R

log_must zfs create -o compress=on $TESTPOOL/fs2
log_must fill_fs /$TESTPOOL/fs2 1 256 10 1024 R

log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
log_must fill_fs /$TESTPOOL/fs3 1 256 10 1024 R

log_must zpool export $TESTPOOL
log_must zpool import -o cachefile=none -d $dir $TESTPOOL

log_must check_pool_status $TESTPOOL "errors" "No known data errors"

for nspare in 0 1 2; do
damaged=$dir/dev-${nspare}
spare=draid${nparity}-0-${nspare}

log_must zpool export $TESTPOOL
log_must dd conv=notrunc if=/dev/zero of=$damaged \
bs=1M seek=4 count=$(($dev_size_mb-4))
log_must zpool import -o cachefile=none -d $dir $TESTPOOL

log_must zpool replace -fsw $TESTPOOL $damaged $spare

# Scrub the pool after the sequential resilver and verify
# that the silent damage was repaired by the scrub.
log_must zpool scrub -w $TESTPOOL
log_must zpool status $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" \
"No known data errors"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"
done

for nspare in 0 1 2; do
log_must check_vdev_state $TESTPOOL \
spare-${nspare} "ONLINE"
log_must check_vdev_state $TESTPOOL \
${dir}/dev-${nspare} "ONLINE"
log_must check_vdev_state $TESTPOOL \
draid${nparity}-0-${nspare} "ONLINE"
done

# Detach the distributed spares and scrub the pool again to
# verify no damage remained on the originally corrupted vdevs.
for nspare in 0 1 2; do
log_must zpool detach $TESTPOOL draid${nparity}-0-${nspare}
done

log_must zpool clear $TESTPOOL
log_must zpool scrub -w $TESTPOOL
log_must zpool status $TESTPOOL

log_must check_pool_status $TESTPOOL "errors" "No known data errors"
log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
log_must check_pool_status $TESTPOOL "scan" "repaired 0B"

log_must zpool destroy "$TESTPOOL"
done

log_pass "draid damaged device scrub test succeeded."