From 6611d4609d50d61291286c4ff98a151d0046d4bc Mon Sep 17 00:00:00 2001 From: Don Brady Date: Mon, 14 Aug 2023 21:37:39 -0600 Subject: [PATCH] Detect BTX boot loader in the zfs reserved boot section Signed-off-by: Don Brady --- include/sys/vdev.h | 1 + lib/libzfs/libzfs_pool.c | 20 ++++- lib/libzpool/Makefile.am | 1 + module/Kbuild.in | 1 + module/os/freebsd/zfs/vdev_label_os.c | 59 +++++++++++++ module/os/linux/zfs/vdev_label_os.c | 44 ++++++++++ module/zfs/spa.c | 6 ++ module/zfs/vdev_raidz.c | 4 +- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/raidz/raidz_expand_007_neg.ksh | 86 +++++++++++++++++++ 11 files changed, 219 insertions(+), 6 deletions(-) create mode 100644 module/os/linux/zfs/vdev_label_os.c create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh diff --git a/include/sys/vdev.h b/include/sys/vdev.h index c2ad8c36dfcf..38f62b07dc59 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -209,6 +209,7 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); +extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 2ac017291b88..e5977a5bab65 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3520,8 +3520,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, break; case EBUSY: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "%s is busy"), new_disk); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), + new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; @@ -3561,9 +3561,23 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, "being replaced")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; + } else { + (void) zpool_standard_error(hdl, errno, errbuf); } - /* fall through */ + break; + case EADDRINUSE: + /* + * The boot reserved area is already being used (FreeBSD) + */ + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the reserved boot area needed for the expansion " + "is already being used by a boot loader")); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + break; default: (void) zpool_standard_error(hdl, errno, errbuf); } diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 58d7f07527aa..3c986a707d2d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -43,6 +43,7 @@ nodist_libzpool_la_SOURCES = \ module/os/linux/zfs/arc_os.c \ module/os/linux/zfs/trace.c \ module/os/linux/zfs/vdev_file.c \ + module/os/linux/zfs/vdev_label_os.c \ module/os/linux/zfs/zfs_debug.c \ module/os/linux/zfs/zfs_racct.c \ module/os/linux/zfs/zfs_znode.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index c132171592a8..6408fb106717 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -445,6 +445,7 @@ ZFS_OBJS_OS := \ trace.o \ vdev_disk.o \ vdev_file.o \ + vdev_label_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c index bc856b930235..338982ff6873 100644 --- a/module/os/freebsd/zfs/vdev_label_os.c +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -72,3 +72,62 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) abd_free(pad2); return (error); } + +static void +vdev_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Check if the reserved boot area is in-use. + * + * When booting FreeBSD with an MBR partition with ZFS, the zfsboot file + * (which understands the ZFS file system) is written to the ZFS BOOT + * reserve area (at offset 512K). We check for that here before attaching + * a disk to raidz which would then corrupt this boot data. + */ +int +vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) +{ + ASSERT(childvd->vdev_ops->vdev_op_leaf); + + size_t size = SPA_MINBLOCKSIZE; + abd_t *abd = abd_alloc_linear(size, B_FALSE); + + zio_t *pio = zio_root(spa, NULL, NULL, 0); + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to the offset + * to calculate the physical offset to write to. Passing in a negative + * offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, childvd, + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abd, size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, vdev_child_done, pio)); + zio_wait(pio); + + unsigned char *buf = abd_to_buf(abd); + + /* + * The BTX server has a special header at the begining. + * + * btx_hdr: .byte 0xeb # Machine ID + * .byte 0xe # Header size + * .ascii "BTX" # Magic + * .byte 0x1 # Major version + * .byte 0x2 # Minor version + * .byte BTX_FLAGS # Flags + */ + if (buf[0] == 0xeb && buf[1] == 0x0e && + buf[2] == 'B' && buf[3] == 'T' && buf[4] == 'X') { + abd_free(abd); + return (EBUSY); + } + + abd_free(abd); + return (0); +} diff --git a/module/os/linux/zfs/vdev_label_os.c b/module/os/linux/zfs/vdev_label_os.c new file mode 100644 index 000000000000..f0325d84b797 --- /dev/null +++ b/module/os/linux/zfs/vdev_label_os.c @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023 by iXsystems, Inc. + */ + +#include +#include +#include +#include +#include + +/* + * Check if the reserved boot area is in-use. + * + * Not aware of any external uses on Linux. + */ +int +vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) +{ + (void) spa; + (void) childvd; + + return (0); +} diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a8f5e9e2d55f..28c93714475e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7059,6 +7059,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, ENXIO)); } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } } } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index bf0e025cd1b1..1d41297e6443 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -207,7 +207,7 @@ * context. The design also allows for fast discovery of what data to copy. * * The VDEV metaslabs are processed, one at a time, to copy the block data to - * have it flow across all the disks. The metasab is disabled for allocations + * have it flow across all the disks. The metaslab is disabled for allocations * during the copy. As an optimization, we only copy the allocated data which * can be determined by looking at the metaslab range tree. During the copy we * must maintain the redundancy guarantees of the RAIDZ VDEV (e.g. parity count @@ -267,7 +267,7 @@ * * == Reflow Progress Updates == * After the initial scratch-based reflow, the expansion process works - * similarly to device removal. We create a new open context thread whichi + * similarly to device removal. We create a new open context thread which * reflows the data, and periodically kicks off sync tasks to update logical * state. In this case, state is the committed progress (offset of next data * to copy). We need to persist the completed offset on disk, so that if we diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e0fea3d68d32..57e82db7f1ee 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -772,7 +772,7 @@ tags = ['functional', 'redacted_send'] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos', 'raidz_expand_001_pos', 'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos', 'raidz_expand_004_pos', 'raidz_expand_005_pos', - 'raidz_expand_006_neg'] + 'raidz_expand_006_neg', 'raidz_expand_007_neg'] tags = ['functional', 'raidz'] timeout = 1200 diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9703a5cdb60d..582f3a60a229 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1669,6 +1669,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/raidz/raidz_expand_004_pos.ksh \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ + functional/raidz/raidz_expand_007_neg.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ diff --git a/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh b/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh new file mode 100755 index 000000000000..78294cb9e516 --- /dev/null +++ b/tests/zfs-tests/tests/functional/raidz/raidz_expand_007_neg.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Negative for FreeBSD Only +# +# Attempting to expand a RAIDZ should fail if the scratch area on the +# existing disks contains BTX Server binary (used to boot FreeBSD when +# using MBR partitions with ZFS). +# +# STRATEGY: +# 1. Create raidz pool +# 2. Add a BTX header to the reserved boot area +# 3. Attempt to attach a device to the raidz vdev +# 4. Verify that device attached failed +# 5. Destroy the raidz pool + +typeset -r devs=4 +typeset -r dev_size_mb=128 +typeset -a disks + +function cleanup +{ + log_pos zpool status "$TESTPOOL" + + poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$TEST_BASE_DIR/dev-$i" + done +} + +log_onexit cleanup + +for i in {0..$devs}; do + device=$TEST_BASE_DIR/dev-$i + # simulate active BTX Server data by inserting a BTX header + printf "\xeb\x0e%s\x01\x02\x80" "BTX" | dd of="$device" \ + bs=512 seek=1024 status=none + log_must truncate -s ${dev_size_mb}M "$device" + if [[ $i -ne $devs ]]; then + disks[${#disks[*]}+1]=$device + fi +done + +log_must zpool create -f -o cachefile=none "$TESTPOOL" raidz1 "${disks[@]}" + +if is_freebsd; then + # expecting attach to fail + log_mustnot_expect "the reserved boot area" zpool attach -f \ + "$TESTPOOL" raidz1-0 "$TEST_BASE_DIR/dev-$devs" + log_must zpool destroy "$TESTPOOL" + log_pass "raidz attach failed with in-use reserved boot area" +else + # expecting attach to pass everywhere else + log_must zpool attach -f "$TESTPOOL" raidz1-0 "$TEST_BASE_DIR/dev-$devs" + log_must zpool destroy "$TESTPOOL" + log_pass "raidz attach passed with in-use reserved boot area" +fi +