From 141368a4b6da039cefb745f6219d15b71e38326c Mon Sep 17 00:00:00 2001
From: Don Brady
Date: Tue, 1 Oct 2024 10:12:11 -0600
Subject: [PATCH] Restrict raidz faulted vdev count

Specifically, a child in a replacing vdev won't count when assessing
the dtl during a vdev_fault()

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf
Reviewed-by: Tino Reichardt
Signed-off-by: Don Brady
Closes #16569
---
 module/zfs/vdev.c                             | 43 +++++++--
 tests/runfiles/linux.run                      |  3 +-
 tests/zfs-tests/tests/Makefile.am             |  1 +
 .../tests/functional/fault/fault_limits.ksh   | 96 +++++++++++++++++++
 4 files changed, 132 insertions(+), 11 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/fault/fault_limits.ksh

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 9305bd894d6f..860572336f69 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3149,9 +3149,9 @@ vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
  * Reassess DTLs after a config change or scrub completion. If txg == 0 no
  * write operations will be issued to the pool.
  */
-void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
-    boolean_t scrub_done, boolean_t rebuild_done)
+static void
+vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+    boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
 {
         spa_t *spa = vd->vdev_spa;
         avl_tree_t reftree;
@@ -3160,8 +3160,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
         ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
         for (int c = 0; c < vd->vdev_children; c++)
-                vdev_dtl_reassess(vd->vdev_child[c], txg,
-                    scrub_txg, scrub_done, rebuild_done);
+                vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
+                    scrub_txg, scrub_done, rebuild_done, faulting);
 
         if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
                 return;
@@ -3255,11 +3255,21 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
                 if (scrub_done)
                         range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
                 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
-                if (!vdev_readable(vd))
+
+                /*
+                 * For the faulting case, treat members of a replacing vdev
+                 * as if they are not available. It's more likely than not that
+                 * a vdev in a replacing vdev could encounter read errors so
+                 * treat it as not being able to contribute.
+                 */
+                if (!vdev_readable(vd) ||
+                    (faulting && vd->vdev_parent != NULL &&
+                    vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
                         range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
-                else
+                } else {
                         range_tree_walk(vd->vdev_dtl[DTL_MISSING],
                             range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+                }
 
                 /*
                  * If the vdev was resilvering or rebuilding and no longer
@@ -3321,6 +3331,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
         }
 }
 
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+    boolean_t scrub_done, boolean_t rebuild_done)
+{
+        return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
+            rebuild_done, B_FALSE));
+}
+
 /*
  * Iterate over all the vdevs except spare, and post kobj events
  */
@@ -3548,7 +3566,11 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 }
 
 /*
- * Determine whether the specified vdev can be offlined/detached/removed
+ * Determine whether the specified vdev can be
+ * - offlined
+ * - detached
+ * - removed
+ * - faulted
  * without losing data.
 */
 boolean_t
@@ -3558,6 +3580,7 @@ vdev_dtl_required(vdev_t *vd)
         vdev_t *tvd = vd->vdev_top;
         uint8_t cant_read = vd->vdev_cant_read;
         boolean_t required;
+        boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED;
 
         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
@@ -3570,10 +3593,10 @@ vdev_dtl_required(vdev_t *vd)
          * If not, we can safely offline/detach/remove the device.
          */
         vd->vdev_cant_read = B_TRUE;
-        vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+        vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
         required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
         vd->vdev_cant_read = cant_read;
-        vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+        vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
 
         if (!required && zio_injection_enabled) {
                 required = !!zio_handle_device_injection(vd, NULL,
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index d791ce57c3fe..4e262affb788 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -125,7 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
     'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
     'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
     'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
-    'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s']
+    'fault_limits', 'scrub_after_resilver', 'suspend_resume_single',
+    'zpool_status_-s']
 tags = ['functional', 'fault']
 
 [tests/functional/features/large_dnode:Linux]
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 994fba173368..da6dc235cf57 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1525,6 +1525,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
         functional/fault/cleanup.ksh \
         functional/fault/decompress_fault.ksh \
         functional/fault/decrypt_fault.ksh \
+        functional/fault/fault_limits.ksh \
         functional/fault/scrub_after_resilver.ksh \
         functional/fault/suspend_resume_single.ksh \
         functional/fault/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/fault/fault_limits.ksh b/tests/zfs-tests/tests/functional/fault/fault_limits.ksh
new file mode 100755
index 000000000000..2f62a15c392b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/fault/fault_limits.ksh
@@ -0,0 +1,96 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2024 by Klara, Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/fault/fault.cfg
+
+#
+# DESCRIPTION: Verify that raidz children vdev fault count is restricted
+#
+# STRATEGY:
+# 1. Create a raidz2 or raidz3 pool and add some data to it
+# 2. Replace one of the child vdevs to create a replacing vdev
+# 3. While it is resilvering, attempt to fault disks
+# 4. Verify that fewer disks than the parity count were faulted while replacing
+#
+
+TESTPOOL="fault-test-pool"
+PARITY=$((RANDOM%(2) + 2))
+VDEV_CNT=$((4 + (2 * PARITY)))
+VDEV_SIZ=512M
+
+function cleanup
+{
+        poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL"
+
+        for i in {0..$((VDEV_CNT - 1))}; do
+                log_must rm -f "$TEST_BASE_DIR/dev-$i"
+        done
+}
+
+log_onexit cleanup
+log_assert "restricts raidz children vdev fault count"
+
+log_note "creating $VDEV_CNT vdevs for parity $PARITY test"
+typeset -a disks
+for i in {0..$((VDEV_CNT - 1))}; do
+        device=$TEST_BASE_DIR/dev-$i
+        log_must truncate -s $VDEV_SIZ $device
+        disks[${#disks[*]}+1]=$device
+done
+
+log_must zpool create -f ${TESTPOOL} raidz${PARITY} ${disks[1..$((VDEV_CNT - 1))]}
+
+# Add some data to the pool
+log_must zfs create $TESTPOOL/fs
+MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)"
+log_must fill_fs $MNTPOINT $PARITY 200 32768 1000 Z
+sync_pool $TESTPOOL
+
+# Replace the last child vdev to form a replacing vdev
+log_must zpool replace ${TESTPOOL} ${disks[$((VDEV_CNT - 1))]} ${disks[$VDEV_CNT]}
+# immediately offline the replacement disk to keep the replacing vdev around
+log_must zpool offline ${TESTPOOL} ${disks[$VDEV_CNT]}
+
+# Fault disks while a replacing vdev is still active
+for disk in ${disks[0..$PARITY]}; do
+        log_must zpool offline -tf ${TESTPOOL} $disk
+done
+
+zpool status $TESTPOOL
+
+# Count the faults that succeeded
+faults=0
+for disk in ${disks[0..$PARITY]}; do
+        state=$(zpool get -H -o value state ${TESTPOOL} ${disk})
+        if [ "$state" = "FAULTED" ] ; then
+                ((faults=faults+1))
+        fi
+done
+
+log_must test "$faults" -lt "$PARITY"
+log_must test "$faults" -gt 0
+
+log_pass "restricts raidz children vdev fault count"
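
Reviewer note: the sketch below is one way to exercise the change by hand;
it is illustrative only. The pool name "tank" and the /var/tmp file paths
are made up, and the runner invocation assumes the in-tree
scripts/zfs-tests.sh wrapper whose -t option runs a single test (flag
details can vary between releases). It also assumes a ksh- or
bash-compatible shell and mirrors the test's PARITY=2 case.

        # Run just the new test through the suite runner
        ./scripts/zfs-tests.sh -t functional/fault/fault_limits

        # Or walk the scenario manually on file-backed vdevs
        for i in 0 1 2 3 4 5 6 7; do
                truncate -s 512M /var/tmp/dev-$i
        done
        zpool create -f tank raidz2 /var/tmp/dev-[0-6]
        zpool replace tank /var/tmp/dev-6 /var/tmp/dev-7  # form a replacing vdev
        zpool offline tank /var/tmp/dev-7                 # keep it in place
        zpool offline -tf tank /var/tmp/dev-0             # expected: FAULTED
        zpool offline -tf tank /var/tmp/dev-1             # expected: backed off
        zpool status tank

With this patch, the second forced fault should leave the disk DEGRADED
rather than FAULTED: the child of the replacing vdev is already treated as
unreadable when vdev_dtl_required() reassesses DTLs with the new faulting
flag, so a second truly faulted disk would exceed what raidz2 parity can
absorb.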