Skip to content

Commit

Permalink
Fix zvol_open() lock inversion
Browse files Browse the repository at this point in the history
When restructuring the zvol_open() logic for the Linux 5.13 kernel
a lock inversion was accidentally introduced.  In the updated code
the spa_namespace_lock is now taken before the zv_suspend_lock
allowing the following scenario to occur:

    down_read <=== waiting for zv_suspend_lock
    zvol_open <=== holds spa_namespace_lock
    __blkdev_get
    blkdev_get_by_dev
    blkdev_open
    ...

     mutex_lock <== waiting for spa_namespace_lock
     spa_open_common
     spa_open
     dsl_pool_hold
     dmu_objset_hold_flags
     dmu_objset_hold
     dsl_prop_get
     dsl_prop_get_integer
     zvol_create_minor
     dmu_recv_end
     zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
     zfs_ioc_recv
     ...

This commit resolves the issue by moving the acquisition of the
spa_namespace_lock back to after the zv_suspend_lock which restores
the original ordering.

Additionally, as part of this change the error exit paths were
simplified where possible.

Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Rich Ercolani <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #12863
  • Loading branch information
behlendorf authored Dec 17, 2021
1 parent ca1b2bb commit 8a02d01
Showing 1 changed file with 58 additions and 63 deletions.
121 changes: 58 additions & 63 deletions module/os/linux/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
int error = 0;
boolean_t drop_suspend = B_TRUE;
boolean_t drop_namespace = B_FALSE;
boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
hrtime_t start = gethrtime();
Expand All @@ -517,7 +516,36 @@ zvol_open(struct block_device *bdev, fmode_t flag)
return (SET_ERROR(-ENXIO));
}

if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
mutex_enter(&zv->zv_state_lock);
/*
* Make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
} else {
drop_suspend = B_TRUE;
}
} else {
drop_suspend = B_TRUE;
}
}
rw_exit(&zvol_state_lock);

ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_open_count == 0) {
boolean_t drop_namespace = B_FALSE;

ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

/*
* In all other call paths the spa_namespace_lock is taken
* before the bdev->bd_mutex lock. However, on open(2)
Expand All @@ -542,84 +570,51 @@ zvol_open(struct block_device *bdev, fmode_t flag)
* the kernel so the only option is to return the error for
* the caller to handle it.
*/
if (!mutex_tryenter(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock);
if (!mutex_owned(&spa_namespace_lock)) {
if (!mutex_tryenter(&spa_namespace_lock)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
schedule();
return (SET_ERROR(-ERESTARTSYS));
#else
if ((gethrtime() - start) > timeout)
schedule();
return (SET_ERROR(-ERESTARTSYS));
#else
if ((gethrtime() - start) > timeout)
return (SET_ERROR(-ERESTARTSYS));

schedule_timeout(MSEC_TO_TICK(10));
goto retry;
schedule_timeout(MSEC_TO_TICK(10));
goto retry;
#endif
} else {
drop_namespace = B_TRUE;
}
}

mutex_enter(&zv->zv_state_lock);
/*
* make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
} else {
drop_namespace = B_TRUE;
}
}
} else {
drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);

ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_open_count == 0) {
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
if (error)
goto out_mutex;
}

if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
error = -EROFS;
goto out_open_count;
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
}

zv->zv_open_count++;

mutex_exit(&zv->zv_state_lock);
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);

zfs_check_media_change(bdev);

return (0);
if (error == 0) {
if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
if (zv->zv_open_count == 0)
zvol_last_close(zv);

out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
error = SET_ERROR(-EROFS);
} else {
zv->zv_open_count++;
}
}

out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);

return (SET_ERROR(error));
if (error == 0)
zfs_check_media_change(bdev);

return (error);
}

static void
Expand Down

0 comments on commit 8a02d01

Please sign in to comment.