FreeBSD: Fix zvol_*_open() lock inversion #12934

130 changes: 79 additions & 51 deletions module/os/freebsd/zfs/zvol_os.c
@@ -210,41 +210,34 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
zvol_state_t *zv;
int err = 0;
boolean_t drop_suspend = B_FALSE;
boolean_t drop_namespace = B_FALSE;

if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
/*
* if zfs_geom_probe_vdev_key is set, that means that zfs is
* If zfs_geom_probe_vdev_key is set, that means that zfs is
* attempting to probe geom providers while looking for a
* replacement for a missing VDEV. In this case, the
* spa_namespace_lock will not be held, but it is still illegal
* to use a zvol as a vdev. Deadlocks can result if another
* thread has spa_namespace_lock
* thread has spa_namespace_lock.
*/
return (SET_ERROR(EOPNOTSUPP));
}

retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
/*
* Obtain a copy of private under zvol_state_lock to make sure either
* the result of zvol free code setting private to NULL is observed,
* or the zv is protected from being freed because of the positive
* zv_open_count.
*/
zv = pp->private;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_locked;
}

if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
/*
* We need to guarantee that the namespace lock is held
* to avoid spurious failures in zvol_first_open.
*/
drop_namespace = B_TRUE;
if (!mutex_tryenter(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock);
mutex_enter(&spa_namespace_lock);
goto retry;
}
}
mutex_enter(&zv->zv_state_lock);
if (zv->zv_zso->zso_dying) {
rw_exit(&zvol_state_lock);
@@ -254,17 +247,17 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

/*
* make sure zvol is not suspended during first open
* Make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
* ordering - zv_suspend_lock before zv_state_lock.
*/
if (zv->zv_open_count == 0) {
drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
@@ -276,15 +269,36 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_open_count == 0) {
boolean_t drop_namespace = B_FALSE;

ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

/*
* Take spa_namespace_lock to prevent lock inversion when
* zvols from one pool are opened as vdevs in another.
*/
if (!mutex_owned(&spa_namespace_lock)) {
if (!mutex_tryenter(&spa_namespace_lock)) {
mutex_exit(&zv->zv_state_lock);
rw_exit(&zv->zv_suspend_lock);
kern_yield(PRI_USER);
goto retry;
} else {
drop_namespace = B_TRUE;
}
}
err = zvol_first_open(zv, !(flag & FWRITE));
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
goto out_zv_locked;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
}

ASSERT(MUTEX_HELD(&zv->zv_state_lock));

/*
* Check for a bad on-disk format version now since we
* lied about owning the dataset readonly before.
@@ -317,8 +331,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
out_zv_locked:
mutex_exit(&zv->zv_state_lock);
out_locked:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -354,17 +366,17 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
ASSERT3U(zv->zv_open_count, >, 0);

/*
* make sure zvol is not suspended during last close
* Make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
* ordering - zv_suspend_lock before zv_state_lock.
*/
new_open_count = zv->zv_open_count - count;
if (new_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
/* Check to see if zv_suspend_lock is needed. */
new_open_count = zv->zv_open_count - count;
if (new_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
@@ -695,7 +707,7 @@ zvol_geom_bio_strategy(struct bio *bp)
}
}
if (error) {
/* convert checksum errors into IO errors */
/* Convert checksum errors into IO errors. */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
@@ -773,13 +785,13 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

/* don't read past the end */
/* Don't read past the end. */
if (bytes > volsize - zfs_uio_offset(&uio))
bytes = volsize - zfs_uio_offset(&uio);

error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
/* Convert checksum errors into IO errors. */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
@@ -826,7 +838,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
uint64_t off = zfs_uio_offset(&uio);
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

if (bytes > volsize - off) /* don't write past the end */
if (bytes > volsize - off) /* Don't write past the end. */
bytes = volsize - off;

dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
@@ -859,45 +871,42 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
struct zvol_state_dev *zsd;
int err = 0;
boolean_t drop_suspend = B_FALSE;
boolean_t drop_namespace = B_FALSE;

retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
/*
* Obtain a copy of si_drv2 under zvol_state_lock to make sure either
* the result of zvol free code setting si_drv2 to NULL is observed,
* or the zv is protected from being freed because of the positive
* zv_open_count.
*/
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_locked;
}

if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
/*
* We need to guarantee that the namespace lock is held
* to avoid spurious failures in zvol_first_open.
*/
drop_namespace = B_TRUE;
if (!mutex_tryenter(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock);
mutex_enter(&spa_namespace_lock);
goto retry;
}
}
mutex_enter(&zv->zv_state_lock);

if (zv->zv_zso->zso_dying) {
rw_exit(&zvol_state_lock);
err = SET_ERROR(ENXIO);
goto out_zv_locked;
}
ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

/*
* make sure zvol is not suspended during first open
* Make sure zvol is not suspended during first open
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
* ordering - zv_suspend_lock before zv_state_lock.
*/
if (zv->zv_open_count == 0) {
drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
@@ -909,12 +918,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if (zv->zv_open_count == 0) {
boolean_t drop_namespace = B_FALSE;

ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

/*
* Take spa_namespace_lock to prevent lock inversion when
* zvols from one pool are opened as vdevs in another.
*/
if (!mutex_owned(&spa_namespace_lock)) {
if (!mutex_tryenter(&spa_namespace_lock)) {
rw_exit(&zvol_state_lock);
mutex_enter(&spa_namespace_lock);
kern_yield(PRI_USER);
goto retry;
} else {
drop_namespace = B_TRUE;
}
}
err = zvol_first_open(zv, !(flags & FWRITE));
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (err)
goto out_zv_locked;
}

ASSERT(MUTEX_HELD(&zv->zv_state_lock));

if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
err = SET_ERROR(EROFS);
goto out_opened;
@@ -949,8 +979,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
out_zv_locked:
mutex_exit(&zv->zv_state_lock);
out_locked:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
return (err);
@@ -984,16 +1012,16 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
*/
ASSERT3U(zv->zv_open_count, >, 0);
/*
* make sure zvol is not suspended during last close
* Make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
* ordering - zv_suspend_lock before zv_state_lock.
*/
if (zv->zv_open_count == 1) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
/* Check to see if zv_suspend_lock is needed. */
if (zv->zv_open_count != 1) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
@@ -1189,7 +1217,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));

/* move to new hashtable entry */
/* Move to a new hashtable entry. */
zv->zv_hash = zvol_name_hash(zv->zv_name);
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
@@ -1319,7 +1347,7 @@ zvol_create_minor_impl(const char *name)

doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

/* lie and say we're read-only */
/* Lie and say we're read-only. */
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
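
The core of the change above is a try-lock/back-off idiom: take spa_namespace_lock with mutex_tryenter() while other locks are held, and on failure drop everything, yield, and restart from the retry label, so no thread ever blocks on spa_namespace_lock while holding zv_state_lock or zv_suspend_lock. Below is a minimal user-space sketch of the same idiom, not the kernel code itself: pthreads stand in for mutex_tryenter()/kern_yield(), and the lock and function names here are hypothetical.

/*
 * Sketch of the try-lock/back-off idiom used by the patch.
 * Assumptions: pthreads instead of kernel primitives; lock names
 * are illustrative only.
 */
#include <pthread.h>
#include <sched.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;     /* taken first */
static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER; /* wanted second */

static int
first_open_with_backoff(void)
{
	for (;;) {
		pthread_mutex_lock(&state_lock);

		/*
		 * Other paths legitimately take namespace_lock before
		 * state_lock, so blocking here while holding state_lock
		 * could deadlock.  Only try; on failure drop what we
		 * hold, yield, and restart from the top.
		 */
		if (pthread_mutex_trylock(&namespace_lock) != 0) {
			pthread_mutex_unlock(&state_lock);
			sched_yield();
			continue;
		}

		/* ... first-open work under both locks ... */

		pthread_mutex_unlock(&namespace_lock);
		pthread_mutex_unlock(&state_lock);
		return (0);
	}
}

In the kernel code above, the drop step also releases zv_suspend_lock (and, in the cdev variant, zvol_state_lock) before retrying, which is what removes the inversion against threads that acquire spa_namespace_lock first.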