From fd0a16d7e66983c1cc794504c0ed5c176cdaa52e Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 20 Jun 2018 13:39:23 -0700
Subject: [PATCH] Add support for autoexpand property

While the autoexpand property may seem like a small feature it depends on a significant amount of system infrastructure. Enough of that infrastructure is now in place that, with a few modifications for Linux, it can be supported.

Auto-expand works as follows: when a block device is modified (re-sized, closed after being opened r/w, etc.) a change uevent is generated for udev. The ZED, which is monitoring udev events, passes the change event along to zfs_deliver_dle() if the disk or partition contains a zfs_member as identified by blkid.

From here the device is matched against all imported pool vdevs using the vdev_guid which was read from the label by blkid. If a match is found the ZED reopens the pool vdev. This re-opening is important because it allows the vdev to be briefly closed so the disk partition table can be re-read. Otherwise, it wouldn't be possible to report the maximum possible expansion size.

Finally, if the autoexpand property is set to on, a vdev expansion will be attempted. After performing some sanity checks on the disk to verify that it is safe to expand, the primary partition (-part1) will be expanded and the partition table updated. The partition is then re-opened (again) to detect the updated size, which allows the new capacity to be used.

In order to make all of the above possible the following changes were required:

* Updated the zpool_expand_001_pos and zpool_expand_003_pos tests. These tests now create a pool which is layered on a loopback, scsi_debug, and file vdev. This allows for testing of a non-partitioned block device (loopback), a partitioned block device (scsi_debug), and a file which does not receive udev change events. This provides better test coverage, and by removing the layering on ZFS volumes the issues surrounding layering one pool on another are avoided.

* zpool_find_vdev_by_physpath() was updated to accept a vdev guid. This allows for matching by guid rather than path, which is a more reliable way for the ZED to reference a vdev.

* Fixed zfs_zevent_wait() signal handling which could result in the ZED spinning when a signal was not handled.

* Removed the vdev_disk_rrpart() functionality, which can be abandoned in favor of the kernel-provided blkdev_reread_part() function.

* Added a rwlock which is held as a writer while a disk is being reopened. This is important to prevent errors from occurring for any configuration-related IOs which bypass the SCL_ZIO lock. The zpool_reopen_007_pos.ksh test case was added to verify IO errors are never observed when reopening. This is not expected to impact IO performance.

Additional fixes which aren't critical but were discovered and resolved in the course of developing this functionality:

* Added PHYS_PATH="/dev/zvol/dataset" to the vdev configuration for ZFS volumes. This is as good as a unique physical path; while the volumes are no longer used in the test cases for other reasons, this improvement was included.
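For context, the end-to-end behavior can be exercised by hand with a loopback device, much as the updated zpool_expand tests automate. This is only an illustrative sketch: the backing file path and pool name are placeholders, and it assumes the ZED is running so the change uevent is acted upon.

    # Build a pool on a loop device with autoexpand enabled.
    truncate -s 1G /var/tmp/vdev_lo
    dev=$(losetup -f)
    losetup $dev /var/tmp/vdev_lo
    zpool create -o autoexpand=on tank $dev

    # Grow the backing file and update the loop device capacity.  The
    # loop driver emits a change uevent, the ZED matches it to the pool
    # vdev by guid, reopens the vdev, and expands it.
    truncate -s 2G /var/tmp/vdev_lo
    losetup -c $dev

    # SIZE should reflect the added capacity once the event is handled.
    zpool list tank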
Signed-off-by: Brian Behlendorf Issue #120 Issue #2437 Issue #5771 Issue #7366 Issue #7582 --- cmd/zed/agents/zfs_mod.c | 85 ++++-- config/kernel-blkdev-get.m4 | 19 -- config/kernel-blkdev-reread-part.m4 | 21 ++ config/kernel-get-gendisk.m4 | 17 -- config/kernel.m4 | 3 +- include/linux/blkdev_compat.h | 14 + include/sys/vdev_disk.h | 1 + lib/libzfs/libzfs_import.c | 72 ++++- lib/libzfs/libzfs_pool.c | 14 +- module/zfs/fm.c | 30 +- module/zfs/vdev.c | 3 +- module/zfs/vdev_disk.c | 275 +++++++++--------- tests/runfiles/linux.run | 2 +- tests/test-runner/bin/zts-report.py | 10 +- tests/zfs-tests/include/blkdev.shlib | 5 +- .../cli_root/zpool_expand/setup.ksh | 9 + .../cli_root/zpool_expand/zpool_expand.cfg | 8 +- .../zpool_expand/zpool_expand_001_pos.ksh | 116 ++++---- .../zpool_expand/zpool_expand_002_pos.ksh | 37 ++- .../zpool_expand/zpool_expand_003_neg.ksh | 105 ++++--- .../zpool_expand/zpool_expand_004_pos.ksh | 4 +- .../zpool_expand/zpool_expand_005_pos.ksh | 6 +- .../cli_root/zpool_reopen/Makefile.am | 3 +- .../cli_root/zpool_reopen/cleanup.ksh | 2 +- .../zpool_reopen/zpool_reopen_007_pos.ksh | 67 +++++ 25 files changed, 566 insertions(+), 362 deletions(-) delete mode 100644 config/kernel-blkdev-get.m4 create mode 100644 config/kernel-blkdev-reread-part.m4 delete mode 100644 config/kernel-get-gendisk.m4 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 600d6527c0db..f914439f3e46 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -697,8 +697,8 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { char *devname = data; boolean_t avail_spare, l2cache; - vdev_state_t newstate; nvlist_t *tgt; + int error; zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); @@ -706,42 +706,58 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, &avail_spare, &l2cache, NULL)) != NULL) { char *path, fullpath[MAXPATHLEN]; - uint64_t wholedisk = 0ULL; + uint64_t wholedisk; - verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, - &path) == 0); - verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk) == 0); + error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); + if (error) { + zpool_close(zhp); + return (0); + } - (void) strlcpy(fullpath, path, sizeof (fullpath)); - if (wholedisk) { - char *spath = zfs_strip_partition(fullpath); - boolean_t scrub_restart = B_TRUE; + error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (error) + wholedisk = 0; - if (!spath) { - zed_log_msg(LOG_INFO, "%s: Can't alloc", - __func__); + if (wholedisk) { + path = strrchr(path, '/'); + if (path != NULL) { + path = zfs_strip_partition(path + 1); + if (path == NULL) { + zpool_close(zhp); + return (0); + } + } else { + zpool_close(zhp); return (0); } - (void) strlcpy(fullpath, spath, sizeof (fullpath)); - free(spath); + (void) strlcpy(fullpath, path, sizeof (fullpath)); + free(path); /* * We need to reopen the pool associated with this - * device so that the kernel can update the size - * of the expanded device. + * device so that the kernel can update the size of + * the expanded device. When expanding there is no + * need to restart the scrub from the * beginning. 
*/ + boolean_t scrub_restart = B_FALSE; (void) zpool_reopen_one(zhp, &scrub_restart); + } else { + (void) strlcpy(fullpath, path, sizeof (fullpath)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { - zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting " - "device '%s' to ONLINE state in pool '%s'", - fullpath, zpool_get_name(zhp)); - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) - (void) zpool_vdev_online(zhp, fullpath, 0, + vdev_state_t newstate; + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + error = zpool_vdev_online(zhp, fullpath, 0, &newstate); + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " + "setting device '%s' to ONLINE state " + "in pool '%s': %d", fullpath, + zpool_get_name(zhp), error); + } } zpool_close(zhp); return (1); @@ -751,23 +767,32 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) } /* - * This function handles the ESC_DEV_DLE event. + * This function handles the ESC_DEV_DLE device change event. Use the + * provided vdev guid when looking up a disk or partition, when the guid + * is not present assume the entire disk is owned by ZFS and append the + * expected -part1 partition information then lookup by physical path. */ static int zfs_deliver_dle(nvlist_t *nvl) { - char *devname; - - if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) { - zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath"); - return (-1); + char *devname, name[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(name, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { + strlcpy(name, devname, MAXPATHLEN); + zfs_append_partition(name, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } - if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) { + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " - "found", devname); + "found", name); return (1); } + return (0); } diff --git a/config/kernel-blkdev-get.m4 b/config/kernel-blkdev-get.m4 deleted file mode 100644 index e31d71770511..000000000000 --- a/config/kernel-blkdev-get.m4 +++ /dev/null @@ -1,19 +0,0 @@ -dnl # -dnl # 2.6.37 API change -dnl # Added 3rd argument for the active holder, previously this was -dnl # hardcoded to NULL. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_3ARG_BLKDEV_GET], [ - AC_MSG_CHECKING([whether blkdev_get() wants 3 args]) - ZFS_LINUX_TRY_COMPILE([ - #include - ],[ - struct block_device *bdev = NULL; - (void) blkdev_get(bdev, 0, NULL); - ],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_3ARG_BLKDEV_GET, 1, [blkdev_get() wants 3 args]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-blkdev-reread-part.m4 b/config/kernel-blkdev-reread-part.m4 new file mode 100644 index 000000000000..5664769a3091 --- /dev/null +++ b/config/kernel-blkdev-reread-part.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 4.1 API, exported blkdev_reread_part() symbol, backported to the +dnl # 3.10.0 CentOS 7.x enterprise kernels. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ + AC_MSG_CHECKING([whether blkdev_reread_part() is available]) + ZFS_LINUX_TRY_COMPILE([ + #include + ], [ + struct block_device *bdev = NULL; + int error; + + error = blkdev_reread_part(bdev); + ], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, + [blkdev_reread_part() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-get-gendisk.m4 b/config/kernel-get-gendisk.m4 deleted file mode 100644 index b0913770e43d..000000000000 --- a/config/kernel-get-gendisk.m4 +++ /dev/null @@ -1,17 +0,0 @@ -dnl # -dnl # 2.6.34 API change -dnl # Verify the get_gendisk() symbol is available. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_GET_GENDISK], - [AC_MSG_CHECKING([whether get_gendisk() is available]) - ZFS_LINUX_TRY_COMPILE_SYMBOL([ - #include - ], [ - get_gendisk(0, NULL); - ], [get_gendisk], [block/genhd.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GET_GENDISK, 1, [get_gendisk() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 8c2998204cde..7ae10c127460 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -44,8 +44,8 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID ZFS_AC_KERNEL_TYPE_FMODE_T - ZFS_AC_KERNEL_3ARG_BLKDEV_GET ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_BLKDEV_REREAD_PART ZFS_AC_KERNEL_OPEN_BDEV_EXCLUSIVE ZFS_AC_KERNEL_LOOKUP_BDEV ZFS_AC_KERNEL_INVALIDATE_BDEV_ARGS @@ -73,7 +73,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_BLK_QUEUE_HAVE_BLK_PLUG ZFS_AC_KERNEL_GET_DISK_AND_MODULE ZFS_AC_KERNEL_GET_DISK_RO - ZFS_AC_KERNEL_GET_GENDISK ZFS_AC_KERNEL_HAVE_BIO_SET_OP_ATTRS ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h index 88b0e48cda09..274552d5dc48 100644 --- a/include/linux/blkdev_compat.h +++ b/include/linux/blkdev_compat.h @@ -364,6 +364,20 @@ bio_set_bi_error(struct bio *bio, int error) #define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) #endif /* HAVE_BLKDEV_GET_BY_PATH | HAVE_OPEN_BDEV_EXCLUSIVE */ +/* + * 4.1 - x.y.z API, + * 3.10.0 CentOS 7.x API, + * blkdev_reread_part() + * + * For older kernels trigger a re-reading of the partition table by calling + * check_disk_change() which calls flush_disk() to invalidate the device. 
+ */ +#ifdef HAVE_BLKDEV_REREAD_PART +#define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) +#else +#define vdev_bdev_reread_part(bdev) check_disk_change(bdev) +#endif /* HAVE_BLKDEV_REREAD_PART */ + /* * 2.6.22 API change * The function invalidate_bdev() lost it's second argument because diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index b8a32b316882..908f5f32634f 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -47,6 +47,7 @@ typedef struct vdev_disk { ddi_devid_t vd_devid; char *vd_minor; struct block_device *vd_bdev; + krwlock_t vd_lock; } vdev_disk_t; #endif /* _KERNEL */ diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 7d2f0e903cce..d2c7d98f966d 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -145,6 +145,21 @@ zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) return (0); } + /* + * For volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + /* * NVME 'by-id' symlinks are similar to bus case */ @@ -187,26 +202,57 @@ int zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) { const char *physpath = NULL; + struct udev_list_entry *entry; /* - * Normal disks use ID_PATH for their physical path. Device mapper - * devices are virtual and don't have a physical path. For them we - * use ID_VDEV instead, which is setup via the /etc/vdev_id.conf file. - * ID_VDEV provides a persistent path to a virtual device. If you - * don't have vdev_id.conf setup, you cannot use multipath autoreplace. + * Normal disks use ID_PATH for their physical path. */ - if (!((physpath = udev_device_get_property_value(dev, "ID_PATH")) && - physpath[0])) { - if (!((physpath = - udev_device_get_property_value(dev, "ID_VDEV")) && - physpath[0])) { - return (ENODATA); + physpath = udev_device_get_property_value(dev, "ID_PATH"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * Device mapper devices are virtual and don't have a physical + * path. For them we use ID_VDEV instead, which is setup via the + * /etc/vdev_id.conf file. ID_VDEV provides a persistent path + * to a virtual device. If you don't have vdev_id.conf setup, + * you cannot use multipath autoreplace with device mapper. + */ + physpath = udev_device_get_property_value(dev, "ID_VDEV"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * For ZFS volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); } + entry = udev_list_entry_get_next(entry); } - (void) strlcpy(bufptr, physpath, buflen); + /* + * For all other devices fallback to using the by-uuid name. 
+ */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } - return (0); + return (ENODATA); } boolean_t diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 8f2eedec8003..d19ca77140ec 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2283,17 +2283,25 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, } /* - * Given a physical path (minus the "/devices" prefix), find the - * associated vdev. + * Given a physical path or guid, find the associated vdev. */ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; + uint64_t guid; + char *end; verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); - verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); + + guid = strtoull(ppath, &end, 0); + if (guid != 0 && *end == '\0') { + verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); + } else { + verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, + ppath) == 0); + } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 4986a3fa2350..df8309d8de39 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -671,19 +671,31 @@ zfs_zevent_wait(zfs_zevent_t *ze) int error = 0; mutex_enter(&zevent_lock); + zevent_waiters++; - if (zevent_flags & ZEVENT_SHUTDOWN) { - error = ESHUTDOWN; - goto out; - } + while (error == 0) { + if (zevent_flags & ZEVENT_SHUTDOWN) { + error = SET_ERROR(ESHUTDOWN); + break; + } - zevent_waiters++; - cv_wait_sig(&zevent_cv, &zevent_lock); - if (issig(JUSTLOOKING)) - error = EINTR; + error = cv_timedwait_sig(&zevent_cv, &zevent_lock, + ddi_get_lbolt() + hz); + if (signal_pending(current) || fatal_signal_pending(current)) { + error = SET_ERROR(EINTR); + break; + } else { + if (error == -1) { + error = 0; + continue; + } else { + error = 0; + break; + } + } + } zevent_waiters--; -out: mutex_exit(&zevent_lock); return (error); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index cf1bf2837f18..0a33560c7ac0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3216,7 +3216,8 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) - pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || + spa->spa_autoexpand); } vdev_reopen(tvd); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 996bab43c6ce..89c9f12a9bff 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -85,50 +85,51 @@ vdev_bdev_mode(int smode) } #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ -/* The capacity (in bytes) of a bdev that is available to be used by a vdev */ +/* + * Returns the usable capacity (in bytes) for the partition or disk. 
+ */ static uint64_t -bdev_capacity(struct block_device *bdev, boolean_t wholedisk) +bdev_capacity(struct block_device *bdev) { - struct hd_struct *part = bdev->bd_part; - uint64_t sectors = get_capacity(bdev->bd_disk); - /* If there are no paritions, return the entire device capacity */ - if (part == NULL) - return (sectors << SECTOR_BITS); + return (i_size_read(bdev->bd_inode)); +} - /* - * If there are partitions, decide if we are using a `wholedisk` - * layout (composed of part1 and part9) or just a single partition. - */ - if (wholedisk) { - /* Verify the expected device layout */ - ASSERT3P(bdev, !=, bdev->bd_contains); - /* - * Sectors used by the EFI partition (part9) as well as - * partion alignment. - */ - uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK + - PARTITION_END_ALIGNMENT; - - /* Space available to the vdev, i.e. the size of part1 */ - if (sectors <= used) - return (0); - uint64_t available = sectors - used; - return (available << SECTOR_BITS); +/* + * Returns the maximum expansion capacity of the block device, When the + * vdev has been created as a 'wholedisk' then expansion may be possible. + * Before any expansion is performed the partition layout is verified to + * confirm the original layout (-part1 and -part9). If everything checks + * out the primary partition will be resized and the reserved partition + * relocated to the new end of device as part of 'zpool online -e'. + */ +static uint64_t +bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) +{ + uint64_t psize; + int64_t available; + + if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + available = i_size_read(bdev->bd_contains->bd_inode) - + ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + + PARTITION_END_ALIGNMENT) << SECTOR_BITS); + if (available > 0) + psize = available; + else + psize = bdev_capacity(bdev); } else { - /* The partition capacity referenced by the block device */ - return (part->nr_sects << SECTOR_BITS); + psize = bdev_capacity(bdev); } + + return (psize); } static void vdev_disk_error(zio_t *zio) { -#ifdef ZFS_DEBUG - printk(KERN_WARNING "ZFS: zio error=%d type=%d offset=%llu size=%llu " + zfs_dbgmsg(KERN_WARNING "zio error=%d type=%d offset=%llu size=%llu " "flags=%x\n", zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); -#endif } /* @@ -200,109 +201,73 @@ vdev_elevator_switch(vdev_t *v, char *elevator) } } -/* - * Expanding a whole disk vdev involves invoking BLKRRPART on the - * whole disk device. This poses a problem, because BLKRRPART will - * return EBUSY if one of the disk's partitions is open. That's why - * we have to do it here, just before opening the data partition. - * Unfortunately, BLKRRPART works by dropping all partitions and - * recreating them, which means that for a short time window, all - * /dev/sdxN device files disappear (until udev recreates them). - * This means two things: - * - When we open the data partition just after a BLKRRPART, we - * can't do it using the normal device file path because of the - * obvious race condition with udev. Instead, we use reliable - * kernel APIs to get a handle to the new partition device from - * the whole disk device. - * - Because vdev_disk_open() initially needs to find the device - * using its path, multiple vdev_disk_open() invocations in - * short succession on the same disk with BLKRRPARTs in the - * middle have a high probability of failure (because of the - * race condition with udev). 
A typical situation where this - * might happen is when the zpool userspace tool does a - * TRYIMPORT immediately followed by an IMPORT. For this - * reason, we only invoke BLKRRPART in the module when strictly - * necessary (zpool online -e case), and rely on userspace to - * do it when possible. - */ -static struct block_device * -vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd) -{ -#if defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) - struct block_device *bdev, *result = ERR_PTR(-ENXIO); - struct gendisk *disk; - int error, partno; - - bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), zfs_vdev_holder); - if (IS_ERR(bdev)) - return (bdev); - - disk = get_gendisk(bdev->bd_dev, &partno); - vdev_bdev_close(bdev, vdev_bdev_mode(mode)); - - if (disk) { - bdev = bdget(disk_devt(disk)); - if (bdev) { - error = blkdev_get(bdev, vdev_bdev_mode(mode), vd); - if (error == 0) - error = ioctl_by_bdev(bdev, BLKRRPART, 0); - vdev_bdev_close(bdev, vdev_bdev_mode(mode)); - } - - bdev = bdget_disk(disk, partno); - if (bdev) { - error = blkdev_get(bdev, - vdev_bdev_mode(mode) | FMODE_EXCL, vd); - if (error == 0) - result = bdev; - } - put_disk(disk); - } - - return (result); -#else - return (ERR_PTR(-EOPNOTSUPP)); -#endif /* defined(HAVE_3ARG_BLKDEV_GET) && defined(HAVE_GET_GENDISK) */ -} - static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { - struct block_device *bdev = ERR_PTR(-ENXIO); + struct block_device *bdev; + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + int count = 0, block_size; + int bdev_retry_count = 50; vdev_disk_t *vd; - int count = 0, mode, block_size; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - vdev_dbgmsg(v, "vdev_disk_open: invalid " - "vdev_path '%s'", v->vdev_path); + vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. + * Reopen the device if it is currently open. When expanding a + * partition force re-scanning the partition table while closed + * in order to get an accurate updated block device size. Then + * since udev may need to recreate the device links increase the + * open retry count before reporting the device as unavailable. */ - if (v->vdev_tsd != NULL) { - ASSERT(v->vdev_reopening); - vd = v->vdev_tsd; - goto skip_open; - } + vd = v->vdev_tsd; + if (vd) { + char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; + boolean_t reread_part = B_FALSE; - vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); - if (vd == NULL) - return (SET_ERROR(ENOMEM)); + rw_enter(&vd->vd_lock, RW_WRITER); + bdev = vd->vd_bdev; + vd->vd_bdev = NULL; + + if (bdev) { + if (v->vdev_expanding && bdev != bdev->bd_contains) { + bdevname(bdev->bd_contains, disk_name + 5); + reread_part = B_TRUE; + } + + vdev_bdev_close(bdev, mode); + } + + if (reread_part) { + bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder); + if (!IS_ERR(bdev)) { + int error = vdev_bdev_reread_part(bdev); + vdev_bdev_close(bdev, mode); + if (error == 0) + bdev_retry_count = 100; + } + } + } else { + vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); + rw_enter(&vd->vd_lock, RW_WRITER); + } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path - * then drives may be recabled without an issue. 
If the provided + * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. This can be critical for more complicated * configurations where drives are located in specific physical - * locations to maximize the systems tolerence to component failure. + * locations to maximize the systems tolerance to component failure. + * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. @@ -317,15 +282,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. */ - mode = spa_mode(v->vdev_spa); - if (v->vdev_wholedisk && v->vdev_expanding) - bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); - - while (IS_ERR(bdev) && count < 50) { - bdev = vdev_bdev_open(v->vdev_path, - vdev_bdev_mode(mode), zfs_vdev_holder); + bdev = ERR_PTR(-ENXIO); + while (IS_ERR(bdev) && count < bdev_retry_count) { + bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { - msleep(10); + schedule_timeout(MSEC_TO_TICK(10)); count++; } else if (IS_ERR(bdev)) { break; @@ -333,16 +294,18 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, } if (IS_ERR(bdev)) { - dprintf("failed open v->vdev_path=%s, error=%d count=%d\n", - v->vdev_path, -PTR_ERR(bdev), count); - kmem_free(vd, sizeof (vdev_disk_t)); - return (SET_ERROR(-PTR_ERR(bdev))); + int error = -PTR_ERR(bdev); + vdev_dbgmsg(v, "open error=%d count=%d\n", error, count); + vd->vd_bdev = NULL; + v->vdev_tsd = vd; + rw_exit(&vd->vd_lock); + return (SET_ERROR(error)); + } else { + vd->vd_bdev = bdev; + v->vdev_tsd = vd; + rw_exit(&vd->vd_lock); } - v->vdev_tsd = vd; - vd->vd_bdev = bdev; - -skip_open: /* Determine the physical block size */ block_size = vdev_bdev_block_size(vd->vd_bdev); @@ -352,9 +315,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); - /* Physical volume size in bytes */ - *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk); - *max_psize = *psize; + /* Physical volume size in bytes for the partition */ + *psize = bdev_capacity(vd->vd_bdev); + + /* Physical volume size in bytes including possible expansion space */ + *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; @@ -373,10 +338,12 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdev != NULL) + if (vd->vd_bdev != NULL) { vdev_bdev_close(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa))); + } + rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } @@ -562,9 +529,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, #if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) struct blk_plug plug; #endif - - ASSERT(zio != NULL); - ASSERT3U(io_offset + io_size, <=, bdev->bd_inode->i_size); + /* + * Accessing outside the block device is never allowed. 
+ */ + if (io_offset + io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + io_offset, io_size, i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } retry: dr = vdev_disk_dio_alloc(bio_count); @@ -705,10 +678,34 @@ vdev_disk_io_start(zio_t *zio) vdev_disk_t *vd = v->vdev_tsd; int rw, flags, error; + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vd == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + rw_enter(&vd->vd_lock, RW_READER); + + /* + * If the vdev is closed, it's likely due to a failed reopen and is + * in the UNAVAIL state. Nothing to be done here but return failure. + */ + if (vd->vd_bdev == NULL) { + rw_exit(&vd->vd_lock); + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + switch (zio->io_type) { case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { + rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; @@ -726,8 +723,10 @@ vdev_disk_io_start(zio_t *zio) } error = vdev_disk_io_flush(vd->vd_bdev, zio); - if (error == 0) + if (error == 0) { + rw_exit(&vd->vd_lock); return; + } zio->io_error = error; @@ -737,6 +736,7 @@ vdev_disk_io_start(zio_t *zio) zio->io_error = SET_ERROR(ENOTSUP); } + rw_exit(&vd->vd_lock); zio_execute(zio); return; case ZIO_TYPE_WRITE: @@ -762,6 +762,7 @@ vdev_disk_io_start(zio_t *zio) break; default: + rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; @@ -770,6 +771,8 @@ vdev_disk_io_start(zio_t *zio) zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, flags); + rw_exit(&vd->vd_lock); + if (error) { zio->io_error = error; zio_interrupt(zio); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index c80352bdbfe5..18d9732a0108 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -398,7 +398,7 @@ tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_reopen] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', - 'zpool_reopen_006_neg'] + 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_replace] diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py index fef9b53ec1c2..38ba7f1d5661 100755 --- a/tests/test-runner/bin/zts-report.py +++ b/tests/test-runner/bin/zts-report.py @@ -81,6 +81,13 @@ # tmpfile_reason = 'Kernel O_TMPFILE support required' +# +# Some tests may depend on udev change events being generated when block +# devices change capacity. This functionality wasn't available until the +# 2.6.38 kernel. +# +udev_reason = 'Kernel block device udev change events required' + # # Some tests require that the NFS client and server utilities be installed. 
# @@ -159,8 +166,6 @@ 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_create/zpool_create_016_pos': ['SKIP', na_reason], - 'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', '5771'], - 'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', '5771'], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], 'inuse/inuse_001_pos': ['SKIP', na_reason], @@ -220,6 +225,7 @@ 'cli_root/zpool_create/setup': ['SKIP', disk_reason], 'cli_root/zpool_create/zpool_create_008_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'], + 'cli_root/zpool_expand/setup': ['SKIP', udev_reason], 'cli_root/zpool_export/setup': ['SKIP', disk_reason], 'cli_root/zpool_import/setup': ['SKIP', disk_reason], 'cli_root/zpool_import/import_rewind_device_replaced': diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 5163ea2ae294..9cac7184f9fc 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -312,6 +312,7 @@ function on_off_disk # disk state{online,offline} host log_fail "Onlining $disk failed" fi elif is_real_device $disk; then + block_device_wait typeset -i retries=0 while ! lsscsi | egrep -q $disk; do if (( $retries > 2 )); then @@ -410,9 +411,7 @@ function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz # function unload_scsi_debug { - if lsmod | grep scsi_debug >/dev/null; then - log_must modprobe -r scsi_debug - fi + log_must_retry "in use" 5 modprobe -r scsi_debug } # diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh index 7d6a43ef5280..9832a441c20b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh @@ -29,6 +29,15 @@ verify_runnable "global" +# +# The pool expansion tests depend on udev change events being generated +# when block devices change capacity. Since this functionality wasn't +# available until the 2.6.38 kernel skip this test group. +# +if [[ $(linux_version) -lt $(linux_version "2.6.38") ]]; then + log_unsupported "Requires block device udev change events" +fi + zed_setup zed_start diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg index e15471e22743..bec5fb1638aa 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg @@ -29,7 +29,9 @@ # -export org_size=$MINVDEVSIZE -export exp_size=$((2*$org_size)) +export org_size=$((1024*1024*1024)) +export exp_size=$((2*1024*1024*1024)) +export org_size_mb=$((org_size/(1024*1024))) -export VFS=$TESTPOOL/$TESTFS +export FILE_LO=$TEST_BASE_DIR/vdev_lo +export FILE_RAW=$TEST_BASE_DIR/vdev_raw diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index 06ab1b84fd1c..289e3e33fa4b 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -27,6 +27,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib @@ -35,68 +36,85 @@ # # DESCRIPTION: # Once zpool set autoexpand=on poolname, zpool can autoexpand by -# Dynamic LUN Expansion +# Dynamic VDEV Expansion # # # STRATEGY: -# 1) Create a pool -# 2) Create volume on top of the pool -# 3) Create pool by using the zvols and set autoexpand=on -# 4) Expand the vol size by 'zfs set volsize' -# 5) Check that the pool size was expanded +# 1) Create three vdevs (loopback, scsi_debug, and file) +# 2) Create pool by using the different devices and set autoexpand=on +# 3) Expand each device as appropriate +# 4) Check that the pool size was expanded +# +# NOTE: Three different device types are used in this test to verify +# expansion of non-partitioned block devices (loopback), partitioned +# block devices (scsi_debug), and non-disk file vdevs. ZFS volumes +# are not used in order to avoid a possible lock inversion when +# layering pools on zvols. # verify_runnable "global" -# See issue: https://github.com/zfsonlinux/zfs/issues/5771 -if is_linux; then - log_unsupported "Requires autoexpand property support" -fi - function cleanup { - if poolexists $TESTPOOL1; then - log_must zpool destroy $TESTPOOL1 + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + + if losetup -a | grep -q $DEV1; then + losetup -d $DEV1 fi - for i in 1 2 3; do - if datasetexists $VFS/vol$i; then - log_must zfs destroy $VFS/vol$i - fi - done + rm -f $FILE_LO $FILE_RAW + + block_device_wait + unload_scsi_debug } log_onexit cleanup -log_assert "zpool can be autoexpanded after set autoexpand=on on LUN expansion" - -for i in 1 2 3; do - log_must zfs create -V $org_size $VFS/vol$i -done -block_device_wait +log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" for type in " " mirror raidz raidz2; do + log_note "Setting up loopback, scsi_debug, and file vdevs" + log_must truncate -s $org_size $FILE_LO + DEV1=$(losetup -f) + log_must losetup $DEV1 $FILE_LO + + load_scsi_debug $org_size_mb 1 1 1 '512b' + block_device_wait + DEV2=$(get_debug_device) + + log_must truncate -s $org_size $FILE_RAW + DEV3=$FILE_RAW - log_must zpool create -o autoexpand=on $TESTPOOL1 $type \ - ${ZVOL_DEVDIR}/$VFS/vol1 ${ZVOL_DEVDIR}/$VFS/vol2 \ - ${ZVOL_DEVDIR}/$VFS/vol3 + # The -f is required since we're mixing disk and file vdevs. + log_must zpool create -f -o autoexpand=on $TESTPOOL1 $type \ + $DEV1 $DEV2 $DEV3 typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1) if [[ $autoexp != "on" ]]; then - log_fail "zpool $TESTPOOL1 autoexpand should on but is $autoexp" + log_fail "zpool $TESTPOOL1 autoexpand should be on but is " \ + "$autoexp" fi typeset prev_size=$(get_pool_prop size $TESTPOOL1) typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \ awk '{print $3}') - for i in 1 2 3; do - log_must zfs set volsize=$exp_size $VFS/vol$i - done + # Expand each device as appropriate being careful to add an artificial + # delay to ensure we get a single history entry for each. This makes + # is easier to verify each expansion for the striped pool case, since + # they will not be merged in to a single larger expansion. 
+ log_note "Expanding loopback, scsi_debug, and file vdevs" + log_must truncate -s $exp_size $FILE_LO + log_must losetup -c $DEV1 + sleep 3 - sync - sleep 10 - sync + echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb + echo "1" > /sys/class/block/$DEV2/device/rescan + block_device_wait + sleep 3 + + log_must truncate -s $exp_size $FILE_RAW + log_must zpool online -e $TESTPOOL1 $FILE_RAW typeset expand_size=$(get_pool_prop size $TESTPOOL1) typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \ @@ -105,8 +123,8 @@ for type in " " mirror raidz raidz2; do log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ "expanded size: $expand_size" # compare available pool size from zfs - if [[ $zfs_expand_size > $zfs_prev_size ]]; then - # check for zpool history for the pool size expansion + if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then + # check for zpool history for the pool size expansion if [[ $type == " " ]]; then typeset expansion_size=$(($exp_size-$org_size)) typeset size_addition=$(zpool history -il $TESTPOOL1 |\ @@ -114,9 +132,9 @@ for type in " " mirror raidz raidz2; do grep "vdev online" | \ grep "(+${expansion_size}" | wc -l) - if [[ $size_addition -ne $i ]]; then - log_fail "pool $TESTPOOL1 is not autoexpand " \ - "after LUN expansion" + if [[ $size_addition -ne 3 ]]; then + log_fail "pool $TESTPOOL1 has not expanded, " \ + "$size_addition/3 vdevs expanded" fi elif [[ $type == "mirror" ]]; then typeset expansion_size=$(($exp_size-$org_size)) @@ -126,8 +144,7 @@ for type in " " mirror raidz raidz2; do grep "(+${expansion_size})" >/dev/null 2>&1 if [[ $? -ne 0 ]] ; then - log_fail "pool $TESTPOOL1 is not autoexpand " \ - "after LUN expansion" + log_fail "pool $TESTPOOL1 has not expanded" fi else typeset expansion_size=$((3*($exp_size-$org_size))) @@ -137,19 +154,16 @@ for type in " " mirror raidz raidz2; do grep "(+${expansion_size})" >/dev/null 2>&1 if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL is not autoexpand " \ - "after LUN expansion" + log_fail "pool $TESTPOOL has not expanded" fi fi else - log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \ - "expansion" + log_fail "pool $TESTPOOL1 is not autoexpanded after vdev " \ + "expansion. 
Previous size: $zfs_prev_size and expanded " \ + "size: $zfs_expand_size" fi - log_must zpool destroy $TESTPOOL1 - for i in 1 2 3; do - log_must zfs set volsize=$org_size $VFS/vol$i - done - + cleanup done -log_pass "zpool can be autoexpanded after set autoexpand=on on LUN expansion" + +log_pass "zpool can autoexpand if autoexpand=on after vdev expansion" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh index 66b6969db3dc..a49d4fc17068 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh @@ -36,7 +36,7 @@ # # DESCRIPTION: # After zpool online -e poolname zvol vdevs, zpool can autoexpand by -# Dynamic LUN Expansion +# Dynamic VDEV Expansion # # # STRATEGY: @@ -52,9 +52,7 @@ verify_runnable "global" function cleanup { - if poolexists $TESTPOOL1; then - log_must zpool destroy $TESTPOOL1 - fi + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 for i in 1 2 3; do [ -e ${TEMPFILE}.$i ] && log_must rm ${TEMPFILE}.$i @@ -63,7 +61,7 @@ function cleanup log_onexit cleanup -log_assert "zpool can expand after zpool online -e zvol vdevs on LUN expansion" +log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion" for type in " " mirror raidz raidz2; do # Initialize the file devices and the pool @@ -77,7 +75,7 @@ for type in " " mirror raidz raidz2; do typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1) if [[ $autoexp != "off" ]]; then - log_fail "zpool $TESTPOOL1 autoexpand should off but is " \ + log_fail "zpool $TESTPOOL1 autoexpand should be off but is " \ "$autoexp" fi typeset prev_size=$(get_pool_prop size $TESTPOOL1) @@ -109,15 +107,15 @@ for type in " " mirror raidz raidz2; do "expected $expected_zpool_expandsize" fi - # Online the devices to add the new space to the pool + # Online the devices to add the new space to the pool. Add an + # artificial delay between online commands order to prevent them + # from being merged in to a single history entry. This makes + # is easier to verify each expansion for the striped pool case. for i in 1 2 3; do log_must zpool online -e $TESTPOOL1 ${TEMPFILE}.$i + sleep 3 done - sync - sleep 10 - sync - typeset expand_size=$(get_pool_prop size $TESTPOOL1) typeset zfs_expand_size=$(get_prop avail $TESTPOOL1) log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ @@ -134,8 +132,9 @@ for type in " " mirror raidz raidz2; do grep "(+${expansion_size}" | wc -l) if [[ $size_addition -ne $i ]]; then - log_fail "pool $TESTPOOL1 did not expand " \ - "after LUN expansion and zpool online -e" + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e, " \ + "$size_addition/3 vdevs expanded" fi elif [[ $type == "mirror" ]]; then typeset expansion_size=$(($exp_size-$org_size)) @@ -145,8 +144,8 @@ for type in " " mirror raidz raidz2; do grep "(+${expansion_size})" >/dev/null 2>&1 if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL1 did not expand " \ - "after LUN expansion and zpool online -e" + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e" fi else typeset expansion_size=$((3*($exp_size-$org_size))) @@ -156,14 +155,14 @@ for type in " " mirror raidz raidz2; do grep "(+${expansion_size})" >/dev/null 2>&1 if [[ $? 
-ne 0 ]] ; then - log_fail "pool $TESTPOOL1 did not expand " \ - "after LUN expansion and zpool online -e" + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e" fi fi else - log_fail "pool $TESTPOOL1 did not expand after LUN expansion " \ + log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \ "and zpool online -e" fi log_must zpool destroy $TESTPOOL1 done -log_pass "zpool can expand after zpool online -e zvol vdevs on LUN expansion" +log_pass "zpool can expand after zpool online -e" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh index 585dd050fd63..323d0b907bd0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh @@ -27,95 +27,112 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # + . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/cli_root/zpool_expand/zpool_expand.cfg # # Description: # Once set zpool autoexpand=off, zpool can *NOT* autoexpand by -# Dynamic LUN Expansion +# Dynamic VDEV Expansion # # # STRATEGY: -# 1) Create a pool -# 2) Create volumes on top of the pool -# 3) Create pool by using the zvols and set autoexpand=off -# 4) Expand the vol size by zfs set volsize -# 5) Check that the pool size is not changed +# 1) Create three vdevs (loopback, scsi_debug, and file) +# 2) Create pool by using the different devices and set autoexpand=off +# 3) Expand each device as appropriate +# 4) Check that the pool size is not expanded +# +# NOTE: Three different device types are used in this test to verify +# expansion of non-partitioned block devices (loopback), partitioned +# block devices (scsi_debug), and non-disk file vdevs. ZFS volumes +# are not used in order to avoid a possible lock inversion when +# layering pools on zvols. # verify_runnable "global" -# See issue: https://github.com/zfsonlinux/zfs/issues/5771 -if is_linux; then - log_unsupported "Requires autoexpand property support" -fi - function cleanup { - if poolexists $TESTPOOL1; then - log_must zpool destroy $TESTPOOL1 - fi - - for i in 1 2 3; do - if datasetexists $VFS/vol$i; then - log_must zfs destroy $VFS/vol$i - fi - done + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 + + if losetup -a | grep -q $DEV1; then + losetup -d $DEV1 + fi + + rm -f $FILE_LO $FILE_RAW + + block_device_wait + unload_scsi_debug } log_onexit cleanup -log_assert "zpool can not expand if set autoexpand=off after LUN expansion" - -for i in 1 2 3; do - log_must zfs create -V $org_size $VFS/vol$i -done -block_device_wait +log_assert "zpool can not expand if set autoexpand=off after vdev expansion" for type in " " mirror raidz raidz2; do - log_must zpool create $TESTPOOL1 $type ${ZVOL_DEVDIR}/$VFS/vol1 \ - ${ZVOL_DEVDIR}/$VFS/vol2 ${ZVOL_DEVDIR}/$VFS/vol3 + log_note "Setting up loopback, scsi_debug, and file vdevs" + log_must truncate -s $org_size $FILE_LO + DEV1=$(losetup -f) + log_must losetup $DEV1 $FILE_LO + + load_scsi_debug $org_size_mb 1 1 1 '512b' + block_device_wait + DEV2=$(get_debug_device) + + log_must truncate -s $org_size $FILE_RAW + DEV3=$FILE_RAW + + # The -f is required since we're mixing disk and file vdevs. 
+ log_must zpool create -f $TESTPOOL1 $type $DEV1 $DEV2 $DEV3 typeset autoexp=$(get_pool_prop autoexpand $TESTPOOL1) if [[ $autoexp != "off" ]]; then - log_fail "zpool $TESTPOOL1 autoexpand should off but is " \ + log_fail "zpool $TESTPOOL1 autoexpand should be off but is " \ "$autoexp" fi typeset prev_size=$(get_pool_prop size $TESTPOOL1) - for i in 1 2 3; do - log_must zfs set volsize=$exp_size $VFS/vol$i - done - sync - sleep 10 - sync + # Expand each device as appropriate being careful to add an artificial + # delay to ensure we get a single history entry for each. This makes + # is easier to verify each expansion for the striped pool case, since + # they will not be merged in to a single larger expansion. + log_note "Expanding loopback, scsi_debug, and file vdevs" + log_must truncate -s $exp_size $FILE_LO + log_must losetup -c $DEV1 + sleep 3 + + echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb + echo "1" > /sys/class/block/$DEV2/device/rescan + block_device_wait + sleep 3 + + log_must truncate -s $exp_size $FILE_RAW + + # This is far longer than we should need to wait, but let's be sure. + sleep 5 # check for zpool history for the pool size expansion zpool history -il $TESTPOOL1 | grep "pool '$TESTPOOL1' size:" | \ grep "vdev online" >/dev/null 2>&1 if [[ $? -eq 0 ]]; then - log_fail "pool $TESTPOOL1 is not autoexpand after LUN " \ + log_fail "pool $TESTPOOL1 is not autoexpand after vdev " \ "expansion" fi typeset expand_size=$(get_pool_prop size $TESTPOOL1) if [[ "$prev_size" != "$expand_size" ]]; then - log_fail "pool $TESTPOOL1 size changed after LUN expansion" + log_fail "pool $TESTPOOL1 size changed after vdev expansion" fi - log_must zpool destroy $TESTPOOL1 - - for i in 1 2 3; do - log_must zfs set volsize=$org_size $VFS/vol$i - done - + cleanup done -log_pass "zpool can not expand if set autoexpand=off after LUN expansion" +log_pass "zpool can not autoexpand if autoexpand=off after vdev expansion" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh index 69481ba1ac8f..8a4db824bc9c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh @@ -50,9 +50,7 @@ verify_runnable "global" function cleanup { - if poolexists $TESTPOOL1; then - log_must zpool destroy $TESTPOOL1 - fi + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 for i in 1 2 3; do [ -e ${TEMPFILE}.$i ] && log_must rm ${TEMPFILE}.$i diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh index 8430c95b5cf1..54ec73b67b26 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh @@ -47,9 +47,7 @@ verify_runnable "global" function cleanup { - if poolexists $TESTPOOL1; then - log_must zpool destroy $TESTPOOL1 - fi + poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 unload_scsi_debug } @@ -95,7 +93,7 @@ typeset new_size=$(get_pool_prop size $TESTPOOL1) log_note "new pool size: $new_size" if [[ $new_size -le $prev_size ]]; then log_fail "pool $TESTPOOL1 did not expand " \ - "after LUN expansion and zpool online -e" + "after vdev expansion and zpool online -e" fi log_pass "zpool based on scsi_debug can be expanded with reopen and online -e" 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am
index f4686c04e2e3..01ad68c817f2 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am
@@ -7,7 +7,8 @@ dist_pkgdata_SCRIPTS = \
 	zpool_reopen_003_pos.ksh \
 	zpool_reopen_004_pos.ksh \
 	zpool_reopen_005_pos.ksh \
-	zpool_reopen_006_neg.ksh
+	zpool_reopen_006_neg.ksh \
+	zpool_reopen_007_pos.ksh
 
 dist_pkgdata_DATA = \
 	zpool_reopen.cfg \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh
index 99c51351c5c8..a9fcef790586 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh
@@ -25,7 +25,7 @@ cleanup_devices $DISKS
 # Unplug the disk and remove scsi_debug module
 if is_linux; then
 	for SDDEVICE in $(get_debug_device); do
-		unplug $SDDEVICE
+		remove_disk $SDDEVICE
 	done
 	unload_scsi_debug
 fi
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh
new file mode 100755
index 000000000000..4ba56af85d32
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh
@@ -0,0 +1,67 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/tests/functional/cli_root/zpool_reopen/zpool_reopen.shlib
+
+#
+# DESCRIPTION:
+# Test zpool reopen while performing IO to the pool.
+# Verify that no IO errors of any kind are reported.
+#
+# STRATEGY:
+# 1. Create a non-redundant pool.
+# 2. Repeat:
+#    a. Write files to the pool.
+#    b. Execute 'zpool reopen'.
+# 3. Verify that no errors are reported by 'zpool status'.
+
+verify_runnable "global"
+
+function cleanup
+{
+	poolexists $TESTPOOL && destroy_pool $TESTPOOL
+}
+
+log_assert "Testing zpool reopen with concurrent user IO"
+log_onexit cleanup
+
+set_removed_disk
+scsi_host=$(get_scsi_host $REMOVED_DISK)
+
+# 1. Create a non-redundant pool.
+log_must zpool create $TESTPOOL $DISK1 $DISK2 $DISK3
+
+for i in $(seq 10); do
+	# 2a. Write files in the background to the pool.
+	mkfile 64m /$TESTPOOL/data.$i &
+
+	# 2b. Execute 'zpool reopen'.
+	log_must zpool reopen $TESTPOOL
+
+	for disk in $DISK1 $DISK2 $DISK3; do
+		zpool status -P -v $TESTPOOL | grep $disk | \
+		    read -r name state rd wr cksum
+		log_must [ $state = "ONLINE" ]
+		log_must [ $rd -eq 0 ]
+		log_must [ $wr -eq 0 ]
+		log_must [ $cksum -eq 0 ]
+	done
+done
+
+wait
+
+log_pass "Zpool reopen with concurrent user IO successful"
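As a closing illustration of the guid-based lookup added in libzfs_pool.c, the sketch below shows how a caller such as the ZED can now hand zpool_find_vdev_by_physpath() a stringified vdev guid instead of a physical path. This snippet is hypothetical and not part of the patch: the find_vdev_by_guid() wrapper name is invented, error handling is omitted, and it assumes the libzfs development headers are available.

    /*
     * Minimal caller sketch.  The guid is formatted exactly as
     * zfs_deliver_dle() does; because the string parses completely as
     * a number, the updated lookup matches on ZPOOL_CONFIG_GUID rather
     * than ZPOOL_CONFIG_PHYS_PATH.
     */
    #include <stdio.h>
    #include <libzfs.h>

    static nvlist_t *
    find_vdev_by_guid(zpool_handle_t *zhp, uint64_t guid)
    {
            boolean_t avail_spare, l2cache;
            char name[32];

            (void) snprintf(name, sizeof (name), "%llu",
                (u_longlong_t)guid);

            return (zpool_find_vdev_by_physpath(zhp, name, &avail_spare,
                &l2cache, NULL));
    }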