diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 61e66421f8ec..9c60e5dd4210 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -294,6 +294,27 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ ]) ]) +dnl # +dnl # 5.13 API change +dnl # blkdev_get_by_path() no longer handles ERESTARTSYS +dnl # +dnl # Unfortunately we're forced to rely solely on the kernel version +dnl # number in order to determine the expected behavior. This was an +dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ + AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS]) + AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1, + [blkdev_get_by_path() handles ERESTARTSYS]) + ],[ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT @@ -318,4 +339,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE + ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 42144322a38f..82af0a9756bc 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -265,6 +265,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. + * + * When ERESTARTSYS is returned it indicates the block device is + * a zvol which could not be opened due to the deadlock detection + * logic in zvol_open(). Extend the timeout and retry the open + * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); @@ -273,6 +278,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { schedule_timeout(MSEC_TO_TICK(10)); + } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { + timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); + continue; } else if (IS_ERR(bdev)) { break; } diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c17423426319..2a70c8ebcc77 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_threads = 32; +unsigned int zvol_open_timeout_ms = 1000; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ @@ -490,7 +491,13 @@ zvol_open(struct block_device *bdev, fmode_t flag) zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_TRUE; + boolean_t drop_namespace = B_FALSE; +#ifndef HAVE_BLKEDV_GET_ERESTARTSYS + hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); + hrtime_t start = gethrtime(); +retry: +#endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make @@ -504,6 +511,48 @@ zvol_open(struct block_device *bdev, fmode_t flag) return (SET_ERROR(-ENXIO)); } + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * In all other call paths the spa_namespace_lock is taken + * before the bdev->bd_mutex lock. However, on open(2) + * the __blkdev_get() function calls fops->open() with the + * bdev->bd_mutex lock held. This can result in a deadlock + * when zvols from one pool are used as vdevs in another. + * + * To prevent a lock inversion deadlock we preemptively + * take the spa_namespace_lock. Normally the lock will not + * be contended and this is safe because spa_open_common() + * handles the case where the caller already holds the + * spa_namespace_lock. + * + * When the lock cannot be aquired after multiple retries + * this must be the vdev on zvol deadlock case and we have + * no choice but to return an error. For 5.12 and older + * kernels returning -ERESTARTSYS will result in the + * bdev->bd_mutex being dropped, then reacquired, and + * fops->open() being called again. This process can be + * repeated safely until both locks are acquired. For 5.13 + * and newer the -ERESTARTSYS retry logic was removed from + * the kernel so the only option is to return the error for + * the caller to handle it. + */ + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + +#ifdef HAVE_BLKEDV_GET_ERESTARTSYS + return (SET_ERROR(-ERESTARTSYS)); +#else + if ((gethrtime() - start) > timeout) + return (SET_ERROR(-ERESTARTSYS)); + + schedule_timeout(MSEC_TO_TICK(10)); + goto retry; +#endif + } else { + drop_namespace = B_TRUE; + } + } + mutex_enter(&zv->zv_state_lock); /* * make sure zvol is not suspended during first open @@ -543,6 +592,8 @@ zvol_open(struct block_device *bdev, fmode_t flag) zv->zv_open_count++; mutex_exit(&zv->zv_state_lock); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); @@ -556,12 +607,11 @@ zvol_open(struct block_device *bdev, fmode_t flag) out_mutex: mutex_exit(&zv->zv_state_lock); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - if (error == -EINTR) { - error = -ERESTARTSYS; - schedule(); - } + return (SET_ERROR(error)); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index d50cce7d7357..cb5ee483c1c1 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -867,54 +867,26 @@ int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; - int error, locked = 0; - boolean_t ro; + int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(mutex_owned(&spa_namespace_lock)); - /* - * In all other cases the spa_namespace_lock is taken before the - * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() - * function calls fops->open() with the bdev->bd_mutex lock held. - * This deadlock can be easily observed with zvols used as vdevs. - * - * To avoid a potential lock inversion deadlock we preemptively - * try to take the spa_namespace_lock(). Normally it will not - * be contended and this is safe because spa_open_common() handles - * the case where the caller already holds the spa_namespace_lock. - * - * When it is contended we risk a lock inversion if we were to - * block waiting for the lock. Luckily, the __blkdev_get() - * function allows us to return -ERESTARTSYS which will result in - * bdev->bd_mutex being dropped, reacquired, and fops->open() being - * called again. This process can be repeated safely until both - * locks are acquired. - */ - if (!mutex_owned(&spa_namespace_lock)) { - locked = mutex_tryenter(&spa_namespace_lock); - if (!locked) - return (SET_ERROR(EINTR)); - } - - ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); + boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) - goto out_mutex; + return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); - if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } -out_mutex: - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(error)); + return (error); } void diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index d04663705309..ce43d204fc9d 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -293,7 +293,6 @@ elif sys.platform.startswith('linux'): 'alloc_class/alloc_class_011_neg': ['FAIL', known_reason], 'alloc_class/alloc_class_012_pos': ['FAIL', known_reason], 'alloc_class/alloc_class_013_pos': ['FAIL', '11888'], - 'cli_root/zfs_copies/zfs_copies_003_pos': ['FAIL', '12301'], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason], 'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason],