Skip to content

Commit

Permalink
Scrub mirror children without BPs
Browse files Browse the repository at this point in the history
When scrubbing a raidz/draid pool, which contains a replacing or
sparing mirror with multiple online children, only one child will
be read.  This is not normally a serious concern because the DTL
records are used to determine where a good copy of the data is.
As long as the data can be read from one child the mirror vdev
will use it to repair gaps in any of its children.  Furthermore,
even if the data which was read is corrupt the raidz code will
detect this and issue its own repair I/O to correct the damage
in the mirror vdev.

However, in the scenario where the DTL is wrong due to silent
data corruption (say due to overwriting one child) and the scrub
happens to read from a child with good data, then the other damaged
mirror child will not be detected nor repaired.

While this is possible for both raidz and draid vdevs, it's most
pronounced when using draid.  This is because by default the zed
will sequentially rebuild a draid pool to a distributed spare,
and the distributed spare half of the mirror is always preferred
since it delivers better performance.  This means the damaged
half of the mirror will go undetected even after scrubbing.

For system administrations this behavior is non-intuitive and in
a worst case scenario could result in the only good copy of the
data being unknowingly detached from the mirror.

This change resolves the issue by reading all replacing/sparing
mirror children when scrubbing.  When the BP isn't available for
verification, then compare the data buffers from each child.  They
must all be identical, if not there's silent damage and an error
is returned to prompt the top-level vdev to issue a repair I/O to
rewrite the data on all of the mirror children.  Since we can't
tell which child was wrong a checksum error is logged against the
replacing or sparing mirror vdev.

Reviewed-by: Mark Maybee <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes openzfs#13555
  • Loading branch information
behlendorf authored and andrewc12 committed Sep 23, 2022
1 parent 2c475a4 commit 6347906
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 37 deletions.
84 changes: 72 additions & 12 deletions module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>

Expand Down Expand Up @@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
*/
typedef struct mirror_child {
vdev_t *mc_vd;
abd_t *mc_abd;
uint64_t mc_offset;
int mc_error;
int mc_load;
Expand Down Expand Up @@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio)
{
mirror_child_t *mc = zio->io_private;

/* See scrubbing read comment in vdev_mirror_io_start() */
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
mc->mc_abd = zio->io_abd;

mc->mc_error = zio->io_error;
mc->mc_tried = 1;
mc->mc_skipped = 0;
Expand Down Expand Up @@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio)
}

if (zio->io_type == ZIO_TYPE_READ) {
if (zio->io_bp != NULL &&
(zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
* For scrubbing reads (if we can verify the
* checksum here, as indicated by io_bp being
* non-NULL) we need to allocate a read buffer for
* each child and issue reads to all children. If
* any child succeeds, it will copy its data into
* zio->io_data in vdev_mirror_scrub_done.
* For scrubbing reads we need to allocate a buffer
* for each child and issue reads to all children.
* If we can verify the checksum here, as indicated
* by io_bp being non-NULL, the data will be copied
* into zio->io_data in vdev_mirror_scrub_done().
* If not, then vdev_mirror_io_done() will compare
* all of the read buffers and return a checksum
* error if they aren't all identical.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
Expand All @@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio)
abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
zio->io_bp ? vdev_mirror_scrub_done :
vdev_mirror_child_done, mc));
}
zio_execute(zio);
return;
Expand Down Expand Up @@ -731,6 +739,7 @@ vdev_mirror_io_done(zio_t *zio)
int c;
int good_copies = 0;
int unexpected_errors = 0;
int last_good_copy = -1;

if (mm == NULL)
return;
Expand All @@ -742,6 +751,7 @@ vdev_mirror_io_done(zio_t *zio)
if (!mc->mc_skipped)
unexpected_errors++;
} else if (mc->mc_tried) {
last_good_copy = c;
good_copies++;
}
}
Expand All @@ -755,7 +765,6 @@ vdev_mirror_io_done(zio_t *zio)
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
/* XXPOLICY */
if (good_copies != mm->mm_children) {
/*
* Always require at least one good copy.
Expand All @@ -782,7 +791,6 @@ vdev_mirror_io_done(zio_t *zio)
/*
* If we don't have a good copy yet, keep trying other children.
*/
/* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
Expand All @@ -794,7 +802,59 @@ vdev_mirror_io_done(zio_t *zio)
return;
}

/* XXPOLICY */
/*
* If we're scrubbing but don't have a BP available (because this
* vdev is under a raidz or draid vdev) then the best we can do is
* compare all of the copies read. If they're not identical then
* return a checksum error and the most likely correct data. The
* raidz code will issue a repair I/O if possible.
*/
if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
abd_t *last_good_abd;

ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
zio->io_vd->vdev_ops == &vdev_spare_ops);

if (good_copies > 1) {
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
abd_t *best_abd = NULL;

for (c = 0; c < last_good_copy; c++) {
mc = &mm->mm_child[c];

if (mc->mc_error || !mc->mc_tried)
continue;

if (abd_cmp(mc->mc_abd, last_good_abd) != 0)
zio->io_error = SET_ERROR(ECKSUM);

/*
* The distributed spare is always prefered
* by vdev_mirror_child_select() so it's
* considered to be the best candidate.
*/
if (best_abd == NULL &&
mc->mc_vd->vdev_ops ==
&vdev_draid_spare_ops) {
best_abd = mc->mc_abd;
}
}

abd_copy(zio->io_abd, best_abd ? best_abd :
last_good_abd, zio->io_size);

} else if (good_copies == 1) {
last_good_abd = mm->mm_child[last_good_copy].mc_abd;
abd_copy(zio->io_abd, last_good_abd, zio->io_size);
}

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
abd_free(mc->mc_abd);
mc->mc_abd = NULL;
}
}

if (good_copies == 0) {
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
Expand Down
18 changes: 16 additions & 2 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -1885,6 +1885,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}

if (rc->rc_force_repair)
unexpected_errors++;
}

/*
Expand Down Expand Up @@ -2249,9 +2252,20 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];

if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
/*
* If scrubbing and a replacing/sparing child vdev determined
* that not all of its children have an identical copy of the
* data, then clear the error so the column is treated like
* any other read and force a repair to correct the damage.
*/
if (rc->rc_error == ECKSUM) {
ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
rc->rc_force_repair = 1;
rc->rc_error = 0;
}

if (rc->rc_error) {
if (c < rr->rr_firstdatacol)
parity_errors++;
else
Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,8 @@ tags = ['functional', 'raidz']

[tests/functional/redundancy]
tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
'redundancy_draid3', 'redundancy_draid_damaged1',
'redundancy_draid_damaged2', 'redundancy_draid_spare1',
'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
'redundancy_raidz3', 'redundancy_stripe']
Expand Down
3 changes: 0 additions & 3 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,6 @@ maybe = {
'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason],
'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946],
'projectquota/setup': ['SKIP', exec_reason],
'redundancy/redundancy_004_neg': ['FAIL', 7290],
'redundancy/redundancy_draid_spare1': ['FAIL', known_reason],
'redundancy/redundancy_draid_spare3': ['FAIL', known_reason],
'removal/removal_condense_export': ['FAIL', known_reason],
'reservation/reservation_008_pos': ['FAIL', 7741],
'reservation/reservation_018_pos': ['FAIL', 5642],
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1632,7 +1632,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/redundancy/redundancy_draid1.ksh \
functional/redundancy/redundancy_draid2.ksh \
functional/redundancy/redundancy_draid3.ksh \
functional/redundancy/redundancy_draid_damaged.ksh \
functional/redundancy/redundancy_draid_damaged1.ksh \
functional/redundancy/redundancy_draid_damaged2.ksh \
functional/redundancy/redundancy_draid.ksh \
functional/redundancy/redundancy_draid_spare1.ksh \
functional/redundancy/redundancy_draid_spare2.ksh \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,28 +85,13 @@ function test_sequential_resilver # <pool> <parity> <dir>

for (( i=0; i<$nparity; i=i+1 )); do
spare=draid${nparity}-0-$i
zpool status $pool
zpool replace -fsw $pool $dir/dev-$i $spare
zpool status $pool
log_must zpool replace -fsw $pool $dir/dev-$i $spare
done

log_must zpool scrub -w $pool
log_must zpool status $pool

# When only a single child was overwritten the sequential resilver
# can fully repair the damange from parity and the scrub will have
# nothing to repair. When multiple children are silently damaged
# the sequential resilver will calculate the wrong data since only
# the parity information is used and it cannot be verified with
# the checksum. However, since only the resilvering devices are
# written to with the bad data a subsequent scrub will be able to
# fully repair the pool.
#
if [[ $nparity == 1 ]]; then
log_must check_pool_status $pool "scan" "repaired 0B"
else
log_mustnot check_pool_status $pool "scan" "repaired 0B"
fi

log_mustnot check_pool_status $pool "scan" "repaired 0B"
log_must check_pool_status $pool "errors" "No known data errors"
log_must check_pool_status $pool "scan" "with 0 errors"
}
Expand Down
Loading

0 comments on commit 6347906

Please sign in to comment.