From 7332e27919a0e6b68082f96dacdc311a14c0e05f Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 23 Dec 2021 19:03:23 +0000 Subject: [PATCH 1/2] FreeBSD: Fix zvol_*_open() locking These are the changes for FreeBSD corresponding to the changes made for Linux in #12863, see that PR for details. Changes from #12863 are applied for zvol_geom_open and zvol_cdev_open on FreeBSD. This also adds a check for the zvol dying which we had in zvol_geom_open but was missing in zvol_cdev_open. The check causes the open to fail early with ENXIO when we are in the middle of changing volmode. Signed-off-by: Ryan Moeller --- module/os/freebsd/zfs/zvol_os.c | 90 +++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 450369192569..ffd96d159e36 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -210,7 +210,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_FALSE; - boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* @@ -226,6 +225,12 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); + /* + * Obtain a copy of private under zvol_state_lock to make sure either + * the result of zvol free code setting private to NULL is observed, + * or the zv is protected from being freed because of the positive + * zv_open_count. + */ zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); @@ -233,18 +238,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) goto out_locked; } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { - /* - * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open. - */ - drop_namespace = B_TRUE; - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); - mutex_enter(&spa_namespace_lock); - goto retry; - } - } mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying) { rw_exit(&zvol_state_lock); @@ -276,8 +269,27 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Take spa_namespace_lock to prevent lock inversion when + * zvols from one pool are opened as vdevs in another. + */ + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + kern_yield(PRI_USER); + goto retry; + } else { + drop_namespace = B_TRUE; + } + } err = zvol_first_open(zv, !(flag & FWRITE)); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; pp->mediasize = zv->zv_volsize; @@ -285,6 +297,8 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) pp->stripesize = zv->zv_volblocksize; } + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. @@ -317,8 +331,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -859,10 +871,15 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) struct zvol_state_dev *zsd; int err = 0; boolean_t drop_suspend = B_FALSE; - boolean_t drop_namespace = B_FALSE; retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); + /* + * Obtain a copy of si_drv2 under zvol_state_lock to make sure either + * the result of zvol free code setting si_drv2 to NULL is observed, + * or the zv is protected from being freed because of the positive + * zv_open_count. + */ zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); @@ -870,20 +887,12 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) goto out_locked; } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { - /* - * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open. - */ - drop_namespace = B_TRUE; - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); - mutex_enter(&spa_namespace_lock); - goto retry; - } - } mutex_enter(&zv->zv_state_lock); - + if (zv->zv_zso->zso_dying) { + rw_exit(&zvol_state_lock); + err = SET_ERROR(ENXIO); + goto out_zv_locked; + } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* @@ -909,12 +918,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Take spa_namespace_lock to prevent lock inversion when + * zvols from one pool are opened as vdevs in another. + */ + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + mutex_enter(&spa_namespace_lock); + kern_yield(PRI_USER); + goto retry; + } else { + drop_namespace = B_TRUE; + } + } err = zvol_first_open(zv, !(flags & FWRITE)); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; } + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out_opened; @@ -949,8 +979,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); From 6e8fcfe222b0769670d889dda3a067a493d5809e Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Thu, 23 Dec 2021 19:04:07 +0000 Subject: [PATCH 2/2] FreeBSD: Touch up comments in zvol_os Signed-off-by: Ryan Moeller --- module/os/freebsd/zfs/zvol_os.c | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index ffd96d159e36..c5d563d812d3 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -213,12 +213,12 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* - * if zfs_geom_probe_vdev_key is set, that means that zfs is + * If zfs_geom_probe_vdev_key is set, that means that zfs is * attempting to probe geom providers while looking for a * replacement for a missing VDEV. In this case, the * spa_namespace_lock will not be held, but it is still illegal * to use a zvol as a vdev. Deadlocks can result if another - * thread has spa_namespace_lock + * thread has spa_namespace_lock. */ return (SET_ERROR(EOPNOTSUPP)); } @@ -247,9 +247,9 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* - * make sure zvol is not suspended during first open + * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock + * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; @@ -257,7 +257,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -366,9 +366,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) ASSERT3U(zv->zv_open_count, >, 0); /* - * make sure zvol is not suspended during last close + * Make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock + * ordering - zv_suspend_lock before zv_state_lock. */ new_open_count = zv->zv_open_count - count; if (new_open_count == 0) { @@ -376,7 +376,7 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ + /* Check to see if zv_suspend_lock is needed. */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -707,7 +707,7 @@ zvol_geom_bio_strategy(struct bio *bp) } } if (error) { - /* convert checksum errors into IO errors */ + /* Convert checksum errors into IO errors. */ if (error == ECKSUM) error = SET_ERROR(EIO); break; @@ -785,13 +785,13 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); - /* don't read past the end */ + /* Don't read past the end. */ if (bytes > volsize - zfs_uio_offset(&uio)) bytes = volsize - zfs_uio_offset(&uio); error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { - /* convert checksum errors into IO errors */ + /* Convert checksum errors into IO errors. */ if (error == ECKSUM) error = SET_ERROR(EIO); break; @@ -838,7 +838,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) uint64_t off = zfs_uio_offset(&uio); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); - if (bytes > volsize - off) /* don't write past the end */ + if (bytes > volsize - off) /* Don't write past the end. */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); @@ -896,9 +896,9 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* - * make sure zvol is not suspended during first open + * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock + * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; @@ -906,7 +906,7 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -1012,16 +1012,16 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) */ ASSERT3U(zv->zv_open_count, >, 0); /* - * make sure zvol is not suspended during last close + * Make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock + * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; @@ -1217,7 +1217,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - /* move to new hashtable entry */ + /* Move to a new hashtable entry. */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); @@ -1347,7 +1347,7 @@ zvol_create_minor_impl(const char *name) doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); - /* lie and say we're read-only */ + /* Lie and say we're read-only. */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi;