Fix ZED auto-replace for VDEVs using by-id paths #15363

Merged 2 commits on Oct 20, 2023
55 changes: 37 additions & 18 deletions cmd/zed/agents/zfs_mod.c
@@ -24,6 +24,7 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, 2017, Intel Corporation.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2023, Klara Inc.
*/

/*
@@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
uint64_t is_spare = 0;
const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
char rawpath[PATH_MAX], fullpath[PATH_MAX];
char devpath[PATH_MAX];
char pathbuf[PATH_MAX];
int ret;
int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
boolean_t is_sd = B_FALSE;
@@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
char **lines = NULL;
int lines_cnt = 0;

/*
* Get the persistent path, typically under the '/dev/disk/by-id' or
* '/dev/disk/by-vdev' directories. Note that this path can change
* when a vdev is replaced with a new disk.
*/
if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
return;

@@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
rawpath, strerror(errno));

(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
int err = zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);

zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
fullpath, libzfs_error_description(g_zfshdl));
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) "
"err %d, new state %d",
fullpath, libzfs_error_description(g_zfshdl), err,
err ? (int)newstate : 0);
return;
}

@@ -428,15 +436,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* to trigger a ZFS fault for the device (and any hot spare
* replacement).
*/
leafname = strrchr(devpath, '/') + 1;
leafname = strrchr(pathbuf, '/') + 1;

/*
* If this is a request to label a whole disk, then attempt to
* write out the label.
*/
if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
vdev, "autoreplace", &lines, &lines_cnt) != 0) {
zed_log_msg(LOG_INFO,
zed_log_msg(LOG_WARNING,
" zpool_prepare_and_label_disk: could not "
"label '%s' (%s)", leafname,
libzfs_error_description(g_zfshdl));
@@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
sizeof (device->pd_physpath));
list_insert_tail(&g_device_list, device);

zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)",
leafname, (u_longlong_t)guid);

return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
@@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
}
if (!found) {
/* unexpected partition slice encountered */
zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
fullpath);
zed_log_msg(LOG_WARNING, "labeled disk %s was "
"unexpected here", fullpath);
(void) zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);
return;
@@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
physpath, (u_longlong_t)guid);

(void) snprintf(devpath, sizeof (devpath), "%s%s",
DEV_BYID_PATH, new_devid);
/*
* Paths that begin with '/dev/disk/by-id/' will change and so
* they must be updated before calling zpool_vdev_attach().
*/
if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
DEV_BYID_PATH, new_devid);
zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' "
"replaced by '%s'", path, pathbuf);
path = pathbuf;
}
}

libzfs_free_str_array(lines, lines_cnt);
@@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* Wait for udev to verify the links exist, then auto-replace
* the leaf disk at same physical location.
*/
if (zpool_label_disk_wait(path, 3000) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
"disk %s is missing", path);
if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
"replacement disk, the expected disk partition link '%s' "
"is missing after waiting %u ms",
zpool_get_name(zhp), path, DISK_LABEL_WAIT);
nvlist_free(nvroot);
return;
}
@@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
B_TRUE, B_FALSE);
}

zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :
libzfs_error_description(g_zfshdl));

@@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
dp->dd_prop, path);
dp->dd_found = B_TRUE;

/* pass the new devid for use by replacing code */
/* pass the new devid for use by auto-replacing code */
if (dp->dd_new_devid != NULL) {
(void) nvlist_add_string(nvl, "new_devid",
dp->dd_new_devid);
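
The heart of the fix is in the hunks above: a vdev path under /dev/disk/by-id names the old disk's devid, so it goes stale once the disk is swapped, and ZED now rewrites it to the new devid before attempting the replace. A minimal standalone sketch of that rewrite follows; the helper name, the local DEV_BYID_PATH define, and the example devids are illustrative only, not part of the patch.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define	DEV_BYID_PATH	"/dev/disk/by-id/"

/* Sketch only: return 'path' unchanged unless it is a by-id name. */
static const char *
rewrite_byid_path(const char *path, const char *new_devid,
    char *buf, size_t buflen)
{
	/* Only by-id names go stale when the underlying disk is swapped. */
	if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) != 0)
		return (path);

	(void) snprintf(buf, buflen, "%s%s", DEV_BYID_PATH, new_devid);
	return (buf);
}

int
main(void)
{
	char buf[PATH_MAX];
	const char *p = rewrite_byid_path(
	    "/dev/disk/by-id/wwn-0x5000c500a1b2c3d4-part1",
	    "ata-EXAMPLE_SERIAL-part1", buf, sizeof (buf));

	(void) printf("%s\n", p);
	return (0);
}

Only by-id names are rewritten; by-vdev and by-path names identify the physical slot rather than the disk itself, so they remain valid across a swap and are left untouched.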
2 changes: 1 addition & 1 deletion include/libzutil.h
@@ -34,7 +34,7 @@ extern "C" {
#endif

/*
* Default wait time for a device name to be created.
* Default wait time in milliseconds for a device name to be created.
*/
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */

5 changes: 2 additions & 3 deletions lib/libzutil/os/linux/zutil_import_os.c
@@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
* Wait up to timeout_ms for udev to set up the device node. The device is
* considered ready when libudev determines it has been initialized, all of
* the device links have been verified to exist, and it has been allowed to
* settle. At this point the device the device can be accessed reliably.
* Depending on the complexity of the udev rules this process could take
* several seconds.
* settle. At this point the device can be accessed reliably. Depending on
* the complexity of the udev rules this process could take several seconds.
*/
int
zpool_label_disk_wait(const char *path, int timeout_ms)
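
The reworded comment above describes what zpool_label_disk_wait() waits for. As a rough illustration only, a POSIX-only polling loop along the same lines might look like the sketch below; the real function also consults libudev (when available) to confirm the device is initialized and its links have settled, so treat this as a simplified stand-in, not the library's implementation.

#include <errno.h>
#include <unistd.h>

/*
 * Sketch: poll for an expected device link, then allow a short settle
 * delay.  Returns 0 when the link appears, ENOENT on timeout.
 */
static int
wait_for_device_link(const char *path, int timeout_ms)
{
	int elapsed_ms = 0;

	while (elapsed_ms < timeout_ms) {
		if (access(path, F_OK) == 0) {
			/* Give udev a moment to finish creating links. */
			(void) usleep(50 * 1000);
			return (0);
		}
		(void) usleep(10 * 1000);	/* poll every 10 ms */
		elapsed_ms += 10;
	}
	return (ENOENT);
}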
8 changes: 4 additions & 4 deletions tests/runfiles/linux.run
@@ -115,10 +115,10 @@ tags = ['functional', 'fallocate']

[tests/functional/fault:Linux]
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
'zpool_status_-s']
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
'scrub_after_resilver', 'zpool_status_-s']
tags = ['functional', 'fault']

[tests/functional/features/large_dnode:Linux]
1 change: 1 addition & 0 deletions tests/test-runner/bin/zts-report.py.in
@@ -329,6 +329,7 @@ if os.environ.get('CI') == 'true':
'fault/auto_online_001_pos': ['SKIP', ci_reason],
'fault/auto_online_002_pos': ['SKIP', ci_reason],
'fault/auto_replace_001_pos': ['SKIP', ci_reason],
'fault/auto_replace_002_pos': ['SKIP', ci_reason],
'fault/auto_spare_ashift': ['SKIP', ci_reason],
'fault/auto_spare_shared': ['SKIP', ci_reason],
'procfs/pool_state': ['SKIP', ci_reason],
9 changes: 5 additions & 4 deletions tests/zfs-tests/include/commands.cfg
@@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
chattr
exportfs
fallocate
flock
free
getfattr
groupadd
groupdel
groupmod
hostid
logger
losetup
lsattr
lsblk
@@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
md5sum
mkswap
modprobe
mountpoint
mpstat
nsenter
parted
perf
setfattr
setpriv
sha256sum
udevadm
unshare
useradd
userdel
usermod
setpriv
mountpoint
flock
logger'
wipefs'

export ZFS_FILES='zdb
zfs
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
@@ -1426,6 +1426,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/fault/auto_online_001_pos.ksh \
functional/fault/auto_online_002_pos.ksh \
functional/fault/auto_replace_001_pos.ksh \
functional/fault/auto_replace_002_pos.ksh \
functional/fault/auto_spare_001_pos.ksh \
functional/fault/auto_spare_002_pos.ksh \
functional/fault/auto_spare_ashift.ksh \
41 changes: 36 additions & 5 deletions tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh
@@ -34,13 +34,14 @@
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
# This creates the keys ID_VDEV and ID_VDEV_PATH and sets phys_path="scsidebug".
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
# 3. Export a pool
# 3. Export the pool
# 4. Wipe and offline the scsi_debug disk
# 5. Import pool with missing disk
# 5. Import the pool with missing disk
# 6. Re-online the wiped scsi_debug disk
# 7. Verify the ZED detects the new unused disk and adds it back to the pool
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
# 8. Verify that the scsi_debug disk was re-partitioned
#
# Creates a raidz1 zpool using persistent disk path names
# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names
# (ie not /dev/sdc)
#
# Auto-replace is opt in, and matches by phys_path.
@@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3
log_must zpool set autoreplace=on $TESTPOOL

# Add some data to the pool
log_must mkfile $FSIZE /$TESTPOOL/data
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
log_must zpool export $TESTPOOL

# Record the partition UUID for later comparison
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}

#
# Wipe and offline the disk
#
# Note that it is not enough to zero the disk to expunge the partitions.
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
#
# Using partprobe is overkill and hdparm is not as common as wipefs. So
# we use wipefs which lets the kernel know the partition was removed
# from the device (i.e., calls BLKRRPART ioctl).
#
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
remove_disk $SD
block_device_wait

@@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60
# Validate auto-replace was successful
log_must check_state $TESTPOOL "" "ONLINE"

#
# Confirm the partition UUID changed so we know the new disk was relabeled
#
# Note: some older versions of udevadm don't support the "--property" option,
# so we'll skip this check when it is not supported
#
if [ ! -z "$part_uuid" ]; then
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
log_note new disk GPT uuid ${new_uuid}
[[ "$part_uuid" = "$new_uuid" ]] && \
log_fail "The new disk was not relabeled as expected"
fi

log_pass "Auto-replace test successful"