From 75cda28e8d163bf233cc8812209844d6aaf25d47 Mon Sep 17 00:00:00 2001
From: "Brian D. Behlendorf" <behlendorf1@llnl.gov>
Date: Fri, 10 Jun 2022 12:46:06 -0700
Subject: [PATCH] Scrub mirror children without BPs

When scrubbing a raidz/draid pool which contains a replacing or
sparing mirror with multiple online children, only one child will
be read.  This is not normally a serious concern because the DTL
records are used to determine where a good copy of the data is.
As long as the data can be read from one child, the mirror vdev
will use it to repair gaps in any of its children.  Furthermore,
even if the data which was read is corrupt, the raidz code will
detect this and issue its own repair I/O to correct the damage
in the mirror vdev.

However, in the contrived scenario where the DTL is wrong because
of silent data corruption (say due to overwriting one child) and
the scrub happens to read from the good online child, then any
damaged mirror child will not be detected and repaired.

While this is possible for both raidz and draid vdevs, it's most
pronounced when using a draid distributed spare because the faster
distributed spare is always the selected child.  This means it's
possible to scrub a pool in this state multiple times and never
get an indication that one of the mirror children is incomplete.
For system administrators this behavior is non-intuitive and in
a worst case scenario could result in the only good copy of the
data being detached from the mirror.

This change resolves the issue by always reading all mirror children
when scrubbing.  When the BP isn't available to use for verification,
the data buffers from each child are compared instead.  They must all
be identical; if not, there's silent damage and an error is returned
to prompt the top-level vdev to issue a repair I/O to rewrite the
data on all of the mirror children.  Since we can't tell which child
was wrong, a checksum error is logged against the replacing or
sparing mirror vdev.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
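As a rough, self-contained illustration of the no-BP verification
described above, the user-space sketch below compares every
successfully read mirror copy against the last good copy and reports
a mismatch, the same outcome that prompts the repair I/O in the patch.
It is only a model, not the kernel code from this change:
compare_mirror_copies and the example buffers are hypothetical names,
and plain memcmp() stands in for the kernel's abd_cmp().

#include <stdio.h>
#include <string.h>

/*
 * Compare every successfully read child buffer against the last good
 * copy.  Returns 0 when all copies match, or -1 when at least one
 * differs, which is the "no good copies, prompt a repair I/O" outcome.
 */
static int
compare_mirror_copies(const unsigned char *bufs[], const int tried[],
    int nchildren, size_t size)
{
	int last_good = -1;

	/* The highest-indexed copy that was actually read. */
	for (int c = 0; c < nchildren; c++) {
		if (tried[c])
			last_good = c;
	}
	if (last_good < 0)
		return (-1);	/* nothing was read at all */

	/* Any mismatch means silent damage in at least one child. */
	for (int c = 0; c < last_good; c++) {
		if (tried[c] && memcmp(bufs[c], bufs[last_good], size) != 0)
			return (-1);
	}
	return (0);
}

int
main(void)
{
	unsigned char good[4] = { 1, 2, 3, 4 };
	unsigned char bad[4] = { 1, 2, 0, 4 };	/* silently corrupted copy */
	const unsigned char *bufs[] = { good, bad, good };
	int tried[] = { 1, 1, 1 };

	if (compare_mirror_copies(bufs, tried, 3, sizeof (good)) != 0)
		(void) printf("mismatch: log ECKSUM, repair all children\n");
	return (0);
}

Compiled and run as-is, this prints the mismatch message for the
deliberately corrupted middle copy.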
 module/zfs/vdev_mirror.c                      |  88 +++++++++--
 module/zfs/vdev_raidz.c                       |   7 +-
 tests/runfiles/common.run                     |   3 +-
 tests/test-runner/bin/zts-report.py.in        |   1 +
 tests/zfs-tests/tests/Makefile.am             |   3 +-
 ...aged.ksh => redundancy_draid_damaged1.ksh} |   6 +-
 .../redundancy/redundancy_draid_damaged2.ksh  | 138 ++++++++++++++++++
 7 files changed, 223 insertions(+), 23 deletions(-)
 rename tests/zfs-tests/tests/functional/redundancy/{redundancy_draid_damaged.ksh => redundancy_draid_damaged1.ksh} (97%)
 create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh

diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 25abe99e0749..32a21f0b161e 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -35,6 +35,7 @@
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
@@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
  */
 typedef struct mirror_child {
 	vdev_t		*mc_vd;
+	abd_t		*mc_abd;
 	uint64_t	mc_offset;
 	int		mc_error;
 	int		mc_load;
@@ -434,6 +436,10 @@ vdev_mirror_child_done(zio_t *zio)
 {
 	mirror_child_t *mc = zio->io_private;
 
+	/* See scrubbing read comment in vdev_mirror_io_start() */
+	if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL)
+		mc->mc_abd = zio->io_abd;
+
 	mc->mc_error = zio->io_error;
 	mc->mc_tried = 1;
 	mc->mc_skipped = 0;
@@ -637,15 +643,16 @@ vdev_mirror_io_start(zio_t *zio)
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if (zio->io_bp != NULL &&
-		    (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
-			 * For scrubbing reads (if we can verify the
-			 * checksum here, as indicated by io_bp being
-			 * non-NULL) we need to allocate a read buffer for
-			 * each child and issue reads to all children. If
-			 * any child succeeds, it will copy its data into
-			 * zio->io_data in vdev_mirror_scrub_done.
+			 * For scrubbing reads we need to allocate a read
+			 * buffer for each child and issue reads to all
+			 * children.  If we can verify the checksum here, as
+			 * indicated by io_bp being non-NULL, the data will be
+			 * copied into zio->io_data in vdev_mirror_scrub_done.
+			 * If not, then vdev_mirror_io_done will compare all
+			 * of the read buffers and return an error if they
+			 * don't all match.
 			 */
 			for (c = 0; c < mm->mm_children; c++) {
 				mc = &mm->mm_child[c];
@@ -663,7 +670,8 @@ vdev_mirror_io_start(zio_t *zio)
 				    abd_alloc_sametype(zio->io_abd,
 				    zio->io_size), zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
-				    vdev_mirror_scrub_done, mc));
+				    zio->io_bp ? vdev_mirror_scrub_done :
+				    vdev_mirror_child_done, mc));
 			}
 			zio_execute(zio);
 			return;
@@ -723,6 +731,21 @@ vdev_mirror_worst_error(mirror_map_t *mm)
 	return (error[0] ? error[0] : error[1]);
 }
 
+static void
+vdev_mirror_checksum_error(zio_t *zio, vdev_t *vd)
+{
+	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd != NULL) {
+		zio_bad_cksum_t zbc = { 0 };
+
+		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
+		    &zio->io_bookmark, zio, zio->io_offset, zio->io_size,
+		    zio->io_abd, NULL, &zbc);
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_checksum_errors++;
+		mutex_exit(&vd->vdev_stat_lock);
+	}
+}
+
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
@@ -731,6 +754,7 @@ vdev_mirror_io_done(zio_t *zio)
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
+	int last_good_copy = -1;
 
 	if (mm == NULL)
 		return;
@@ -742,6 +766,7 @@ vdev_mirror_io_done(zio_t *zio)
 			if (!mc->mc_skipped)
 				unexpected_errors++;
 		} else if (mc->mc_tried) {
+			last_good_copy = c;
 			good_copies++;
 		}
 	}
@@ -755,7 +780,6 @@ vdev_mirror_io_done(zio_t *zio)
 	 * no non-degraded top-level vdevs left, and not update DTLs
 	 * if we intend to reallocate.
 	 */
-	/* XXPOLICY */
 	if (good_copies != mm->mm_children) {
 		/*
 		 * Always require at least one good copy.
@@ -782,7 +806,6 @@ vdev_mirror_io_done(zio_t *zio)
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */
-	/* XXPOLICY */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 		ASSERT(c >= 0 && c < mm->mm_children);
 		mc = &mm->mm_child[c];
@@ -794,7 +817,48 @@ vdev_mirror_io_done(zio_t *zio)
 		return;
 	}
 
-	/* XXPOLICY */
+	/*
+	 * If we're scrubbing but don't have a BP available (because this
+	 * vdev is under a raidz or draid vdev) then the best we can do is
+	 * compare all of the copies read.  If they're not identical then
+	 * at least one copy must be damaged and a repair should be attempted.
+	 * Report there are no good copies and log a checksum error against
+	 * the replacing-1 or sparing-1 mirror vdev.
+	 */
+	if (zio->io_flags & ZIO_FLAG_SCRUB && zio->io_bp == NULL) {
+		abd_t *last_good_abd = NULL;
+
+		if (good_copies > 1) {
+			last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+
+			for (c = 0; c < last_good_copy; c++) {
+				mc = &mm->mm_child[c];
+
+				if (mc->mc_error) {
+					continue;
+				} else if (mc->mc_tried &&
+				    abd_cmp(mc->mc_abd, last_good_abd) != 0) {
+					good_copies = 0;
+					mc->mc_error = SET_ERROR(ECKSUM);
+					vdev_mirror_checksum_error(zio,
+					    zio->io_vd);
+					break;
+				}
+			}
+		} else if (good_copies == 1) {
+			last_good_abd = mm->mm_child[last_good_copy].mc_abd;
+		}
+
+		if (good_copies > 0 && last_good_abd != NULL)
+			abd_copy(zio->io_abd, last_good_abd, zio->io_size);
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			abd_free(mc->mc_abd);
+			mc->mc_abd = NULL;
+		}
+	}
+
 	if (good_copies == 0) {
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT(zio->io_error != 0);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index ae0777b3dcc1..86d7e86d789f 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -2210,11 +2210,8 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
-		if (rc->rc_error) {
-			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
-
+		if (rc->rc_error)
 			total_errors++;
-		}
 	}
 
 	/*
@@ -2250,8 +2247,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
-			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
-
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 89ee0d3cb7b6..a4ec27a368ac 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -763,7 +763,8 @@ tags = ['functional', 'raidz']
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
-    'redundancy_draid3', 'redundancy_draid_damaged', 'redundancy_draid_spare1',
+    'redundancy_draid3', 'redundancy_draid_damaged1',
+    'redundancy_draid_damaged2', 'redundancy_draid_spare1',
     'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
     'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
     'redundancy_raidz3', 'redundancy_stripe']
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index ddb9bb7eed1d..673532c3edd1 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -227,6 +227,7 @@ maybe = {
     'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946],
     'projectquota/setup': ['SKIP', exec_reason],
     'redundancy/redundancy_004_neg': ['FAIL', 7290],
+    'redundancy/redundancy_draid_damaged2': ['FAIL', known_reason],
     'redundancy/redundancy_draid_spare1': ['FAIL', known_reason],
     'redundancy/redundancy_draid_spare3': ['FAIL', known_reason],
     'removal/removal_condense_export': ['FAIL', known_reason],
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index e65a8bba2c2c..4c5b1121293e 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1632,7 +1632,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/redundancy/redundancy_draid1.ksh \
 	functional/redundancy/redundancy_draid2.ksh \
 	functional/redundancy/redundancy_draid3.ksh \
-	functional/redundancy/redundancy_draid_damaged.ksh \
+	functional/redundancy/redundancy_draid_damaged1.ksh \
+	functional/redundancy/redundancy_draid_damaged2.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
similarity index 97%
rename from tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh
rename to tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
index 9b3be9f4e3f7..66e4c2ebbda8 100755
--- a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged.ksh
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged1.ksh
@@ -85,9 +85,9 @@ function test_sequential_resilver #
 
 	for (( i=0; i<$nparity; i=i+1 )); do
 		spare=draid${nparity}-0-$i
-		zpool status $pool
-		zpool replace -fsw $pool $dir/dev-$i $spare
-		zpool status $pool
+		log_must zpool status $pool
+		log_must zpool replace -fsw $pool $dir/dev-$i $spare
+		log_must zpool status $pool
 	done
 
 	log_must zpool scrub -w $pool
diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
new file mode 100755
index 000000000000..4ba12195e841
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_damaged2.ksh
@@ -0,0 +1,138 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#

+#
+# Copyright (c) 2022 by Lawrence Livermore National Security, LLC.
+#

+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib

+#
+# DESCRIPTION:
+# When sequentially resilvering a dRAID pool to a distributed spare,
+# silent damage to an online vdev in a replacing or spare mirror vdev
+# is not expected to be repaired.  Not only does the rebuild have no
+# reason to suspect the silent damage, but even if it did there's no
+# checksum available to determine the correct copy and make the repair.
+# However, the subsequent scrub should detect and repair any damage.
+#
+# STRATEGY:
+# 1. Create block device files for the test draid pool
+# 2. Create a single parity draid pool
+# 3. Fill it with some directories/files
+# 4. Overwrite one of the devices
+# 5. Replace the damaged vdev with a distributed spare and wait
+#    for the sequential resilver and scrub to complete.
+# 6. Verify the scrub issued repair IO
+# 7. Detach the distributed spare
+# 8. Scrub the pool again and verify all damage was corrected
+# 9. Destroy the draid pool
+#

+typeset -r devs=7
+typeset -r dev_size_mb=512

+typeset -a disks

+prefetch_disable=$(get_tunable PREFETCH_DISABLE)
+rebuild_scrub_enabled=$(get_tunable REBUILD_SCRUB_ENABLED)

+function cleanup
+{
+	poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"

+	for i in {0..$devs}; do
+		rm -f "$TEST_BASE_DIR/dev-$i"
+	done

+	set_tunable32 PREFETCH_DISABLE $prefetch_disable
+	set_tunable32 REBUILD_SCRUB_ENABLED $rebuild_scrub_enabled
+}

+log_onexit cleanup

+log_must set_tunable32 PREFETCH_DISABLE 1
+log_must set_tunable32 REBUILD_SCRUB_ENABLED 0

+# Disk files which will be used by pool
+for i in {0..$(($devs - 1))}; do
+	device=$TEST_BASE_DIR/dev-$i
+	log_must truncate -s ${dev_size_mb}M $device
+	disks[${#disks[*]}+1]=$device
+done

+# Disk file which will be attached
+log_must truncate -s 512M $TEST_BASE_DIR/dev-$devs

+raid=draid:1s
+dir=$TEST_BASE_DIR
+spare=draid1-0-0
+damaged=$dir/dev-0

+log_must zpool create -O compression=off -f -o cachefile=none $TESTPOOL $raid ${disks[@]}
+log_must zfs set primarycache=metadata $TESTPOOL

+log_must zfs create $TESTPOOL/fs
+log_must fill_fs /$TESTPOOL/fs 1 512 100 1024 R

+log_must zfs create -o compress=on $TESTPOOL/fs2
+log_must fill_fs /$TESTPOOL/fs2 1 512 100 1024 R

+log_must zfs create -o compress=on -o recordsize=8k $TESTPOOL/fs3
+log_must fill_fs /$TESTPOOL/fs3 1 512 100 1024 R

+log_must zpool export $TESTPOOL
+log_must zpool import -o cachefile=none -d $dir $TESTPOOL

+log_must check_pool_status $TESTPOOL "errors" "No known data errors"

+log_must zpool export $TESTPOOL
+log_must dd conv=notrunc if=/dev/zero of=$damaged \
+    bs=1M seek=4 count=$(($dev_size_mb-4))
+log_must zpool import -o cachefile=none -d $dir $TESTPOOL

+log_must zpool replace -fsw $TESTPOOL $damaged $spare
+log_must zpool status $TESTPOOL

+# Scrub the pool after the sequential resilver and verify
+# that the silent damage was repaired by the scrub.
+log_must zpool scrub -w $TESTPOOL
+log_must zpool status $TESTPOOL
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_mustnot check_pool_status $TESTPOOL "scan" "repaired 0B"

+# Detach the distributed spare and scrub the pool again to
+# verify no damage remained on the originally corrupted vdev.
+log_must zpool detach $TESTPOOL $spare
+log_must zpool clear $TESTPOOL
+log_must zpool scrub -w $TESTPOOL
+log_must zpool status $TESTPOOL

+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must check_pool_status $TESTPOOL "scan" "repaired 0B"

+log_must zpool destroy "$TESTPOOL"

+log_pass "draid damaged device scrub test succeeded."
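A note on the damage model used by the test above: the dd invocation
starts at a 4 MiB offset (seek=4) because the first 4 MiB of a leaf
vdev hold the two front labels plus the boot block region, so skipping
them destroys the pool data while leaving the device labeled and
importable. The user-space sketch below models the same pattern; it is
illustrative only, and silently_damage is a hypothetical name, not a
helper from the test suite.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	MIB		(1024 * 1024)
#define	LABEL_SKIP	(4 * MIB)	/* two front labels + boot region */

/*
 * Zero a vdev file from the 4 MiB mark onward, leaving the front
 * labels intact so the pool still imports; equivalent in spirit to
 * dd conv=notrunc if=/dev/zero bs=1M seek=4.  Assumes the file size
 * is a multiple of 1 MiB.
 */
static int
silently_damage(const char *path, long long size)
{
	static char buf[MIB];	/* static, so zero-filled */
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return (-1);
	if (lseek(fd, LABEL_SKIP, SEEK_SET) < 0) {
		(void) close(fd);
		return (-1);
	}
	for (long long off = LABEL_SKIP; off < size; off += MIB) {
		if (write(fd, buf, sizeof (buf)) != (ssize_t)sizeof (buf)) {
			(void) close(fd);
			return (-1);
		}
	}
	return (close(fd));
}

int
main(int argc, char **argv)
{
	if (argc != 3) {
		(void) fprintf(stderr, "usage: %s <dev-file> <size-bytes>\n",
		    argv[0]);
		return (1);
	}
	return (silently_damage(argv[1], atoll(argv[2])) == 0 ? 0 : 1);
}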