From b9e5ce5f9fdf3fff480ef9583f10c21f7ad3eed3 Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Sat, 22 Mar 2014 05:07:14 -0400 Subject: [PATCH 1/3] zfsonlinux issue #2217 - zvol_create_minors(): check snapdev before traversing snapshots --- include/sys/zvol.h | 1 - module/zfs/zfs_ioctl.c | 5 -- module/zfs/zvol.c | 173 ++++++++++++++++++++++++++++------------- 3 files changed, 118 insertions(+), 61 deletions(-) diff --git a/include/sys/zvol.h b/include/sys/zvol.h index 898e2352156b..cec9dc897700 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -38,7 +38,6 @@ extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zvol_is_zvol(const char *); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *name); extern int zvol_create_minors(const char *name); extern int zvol_remove_minor(const char *name); extern void zvol_remove_minors(const char *name); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index d026e92d6934..a0626972583e 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3305,11 +3305,6 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = dsl_dataset_snapshot(snaps, props, outnvl); -#ifdef _KERNEL - if (error == 0) - zvol_create_minors(poolname); -#endif - return (error); } diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 8208d29f382c..966cee8e4635 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -48,6 +48,11 @@ #include #include +static int +zvol_create_minors_cb(const char *dsname, void *arg); +static int +zvol_create_snap_minor_cb(const char *dsname, void *arg); + unsigned int zvol_inhibit_dev = 0; unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_prefetch_bytes = (128 * 1024); @@ -988,7 +993,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) /* * If the caller is already holding the mutex do not take it - * again, this will happen as part of zvol_create_minor(). + * again, this will happen as part of __zvol_create_minor(). * Once add_disk() is called the device is live and the kernel * will attempt to open it to read the partition information. */ @@ -1285,31 +1290,13 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ static int -__zvol_snapdev_hidden(const char *name) -{ - uint64_t snapdev; - char *parent; - char *atp; - int error = 0; - - parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strlcpy(parent, name, MAXPATHLEN); - - if ((atp = strrchr(parent, '@')) != NULL) { - *atp = '\0'; - error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); - if ((error == 0) && (snapdev == ZFS_SNAPDEV_HIDDEN)) - error = SET_ERROR(ENODEV); - } - - kmem_free(parent, MAXPATHLEN); - - return (SET_ERROR(error)); -} - -static int -__zvol_create_minor(const char *name, boolean_t ignore_snapdev) +__zvol_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; @@ -1327,13 +1314,7 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) goto out; } - if (ignore_snapdev == B_FALSE) { - error = __zvol_snapdev_hidden(name); - if (error) - goto out; - } - - doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_PUSHPAGE); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) @@ -1416,23 +1397,6 @@ __zvol_create_minor(const char *name, boolean_t ignore_snapdev) return (SET_ERROR(error)); } -/* - * Create a block device minor node and setup the linkage between it - * and the specified volume. Once this function returns the block - * device is live and ready for use. - */ -int -zvol_create_minor(const char *name) -{ - int error; - - mutex_enter(&zvol_state_lock); - error = __zvol_create_minor(name, B_FALSE); - mutex_exit(&zvol_state_lock); - - return (SET_ERROR(error)); -} - static int __zvol_remove_minor(const char *name) { @@ -1492,25 +1456,124 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_disk, readonly); } +/* + * Mask errors to continue dmu_objset_find() traversal + */ static int zvol_create_minors_cb(const char *dsname, void *arg) { - (void) zvol_create_minor(dsname); + uint64_t snapdev; + int error; + + error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); + if (error) + return (0); + + /* + * Given the name and the 'snapdev' property, create device minor nodes + * with the linkages to zvols/snapshots as needed. + * The name represents a zvol, so create a minor node for the zvol, then + * check if its snapshots are 'visible', and if so, iterate over the + * snapshots and create device minor nodes for those. Note, that we + * know that this is a zvol, having just obtained the 'snapdev' + * zvol property without errors. + */ + if (strchr(dsname, '@') == 0) { + /* create minor for the 'dsname' explicitly */ + mutex_enter(&zvol_state_lock); + error = __zvol_create_minor(dsname); + mutex_exit(&zvol_state_lock); + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { + /* + * traverse snapshots only, do not traverse children, + * and skip the 'dsname' + */ + error = dmu_objset_find((char *)dsname, + zvol_create_snap_minor_cb, (void *)dsname, + DS_FIND_SNAPSHOTS); + } + } else { + printk(KERN_INFO + "zvol_create_minors_cb(): %s is not a zvol name\n", + dsname); + } + + return (0); +} + +/* + * Mask errors to continue dmu_objset_find() traversal + */ +static int +zvol_create_snap_minor_cb(const char *dsname, void *arg) +{ + const char *name = (const char *)arg; + + /* skip the designated dataset */ + if (name && strcmp(dsname, name) == 0) + return (0); + + /* at this point, the dsname should name a snapshot */ + if (strchr(dsname, '@') == 0) { + printk(KERN_INFO "zvol_create_snap_minor_cb(): " + "%s is not a shapshot name\n", dsname); + } else { + /* create minor for the snapshot */ + mutex_enter(&zvol_state_lock); + (void) __zvol_create_minor(name); + mutex_exit(&zvol_state_lock); + } return (0); } /* - * Create minors for specified dataset including children and snapshots. + * Create minors for the specified dataset, including children and snapshots. + * Pay attention to the 'snapdev' property and iterate over the snapshots + * only if they are 'visible'. This approach allows one to assure that the + * snapshot metadata is read from disk only if it is needed. + * + * The name can represent a dataset to be recursively scanned for zvols and + * their snapshots, or a single zvol snapshot. If the name represents a + * dataset, the scan is performed in two nested stages: + * - scan the dataset for zvols, and + * - for each zvol, create a minor node, then check if the zvol's snapshots + * are 'visible', and only then iterate over the snapshots if needed + * + * If the name represents a snapshot, a check is perfromed if the snapshot is + * 'visible' (which also verifies that the parent is a zvol), and if so, + * a minor node for that snapshot is created. */ int zvol_create_minors(const char *name) { int error = 0; - if (!zvol_inhibit_dev) - error = dmu_objset_find((char *)name, zvol_create_minors_cb, - NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (!zvol_inhibit_dev) { + char *atp, *parent; + + parent = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); + (void) strlcpy(parent, name, MAXPATHLEN); + + if ((atp = strrchr(parent, '@')) != NULL) { + uint64_t snapdev; + + *atp = '\0'; + error = dsl_prop_get_integer(parent, "snapdev", + &snapdev, NULL); + + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { + mutex_enter(&zvol_state_lock); + error = __zvol_create_minor(name); + mutex_exit(&zvol_state_lock); + } + } else { + error = dmu_objset_find(parent, zvol_create_minors_cb, + NULL, DS_FIND_CHILDREN); + } + + kmem_free(parent, MAXPATHLEN); + } return (SET_ERROR(error)); } @@ -1592,7 +1655,7 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { switch (snapdev) { case ZFS_SNAPDEV_VISIBLE: mutex_enter(&zvol_state_lock); - (void) __zvol_create_minor(dsname, B_TRUE); + (void) __zvol_create_minor(dsname); mutex_exit(&zvol_state_lock); break; case ZFS_SNAPDEV_HIDDEN: From 23b58b3835c5ac4648e788a8e00852298b7d3684 Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Thu, 13 Aug 2015 00:27:49 -0400 Subject: [PATCH 2/3] zfsonlinux issue #3681 - lock order inversion between zvol_open() and dsl_pool_sync()...zvol_rename_minors() Execute zvol_*_minor(s) functions asynchronously in taskqs (one taskq per pool) in order to avoid having to deal with the locking state of the caller. --- include/sys/zvol.h | 31 +++- module/zfs/dsl_dataset.c | 4 +- module/zfs/dsl_dir.c | 2 +- module/zfs/spa.c | 10 +- module/zfs/zfs_ioctl.c | 31 ++-- module/zfs/zvol.c | 301 +++++++++++++++++++++++++++++++++++++-- 6 files changed, 349 insertions(+), 30 deletions(-) diff --git a/include/sys/zvol.h b/include/sys/zvol.h index cec9dc897700..7b217245f8dc 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -31,6 +31,25 @@ #define ZVOL_OBJ 1ULL #define ZVOL_ZAP_OBJ 2ULL +typedef enum { + ZVOL_ASYNC_CREATE_TASKQ, + ZVOL_ASYNC_REMOVE_TASKQ, + ZVOL_ASYNC_CREATE_MINORS, + ZVOL_ASYNC_REMOVE_MINORS, + ZVOL_ASYNC_REMOVE_MINOR, + ZVOL_ASYNC_RENAME_MINORS, + ZVOL_ASYNC_SET_SNAPDEV, + ZVOL_ASYNC_MAX +} zvol_async_op_t; + +typedef struct { + zvol_async_op_t op; + char pool[MAXNAMELEN]; + char name1[MAXNAMELEN]; + char name2[MAXNAMELEN]; + uint64_t value; +} zvol_async_arg_t; + #ifdef _KERNEL extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); @@ -38,13 +57,15 @@ extern int zvol_check_volblocksize(const char *name, uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zvol_is_zvol(const char *); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minors(const char *name); -extern int zvol_remove_minor(const char *name); -extern void zvol_remove_minors(const char *name); -extern void zvol_rename_minors(const char *oldname, const char *newname); +extern void zvol_async_create_taskq(const char *pool); +extern void zvol_async_remove_taskq(const char *pool); +extern void zvol_async_create_minors(const char *name); +extern void zvol_async_remove_minor(const char *name); +extern void zvol_async_remove_minors(const char *name); +extern void zvol_async_rename_minors(const char *oldname, const char *newname); +extern int zvol_async_set_snapdev(const char *, uint64_t); extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); -extern int zvol_set_snapdev(const char *, uint64_t); extern int zvol_init(void); extern void zvol_fini(void); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 829452b1d22a..accd8e4b0da2 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1497,7 +1497,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *snapname = nvpair_name(pair); - zvol_create_minors(snapname); + zvol_async_create_minors(snapname); } } #endif @@ -1968,7 +1968,7 @@ dsl_dataset_rename_snapshot(const char *fsname, #ifdef _KERNEL oldname = kmem_asprintf("%s@%s", fsname, oldsnapname); newname = kmem_asprintf("%s@%s", fsname, newsnapname); - zvol_rename_minors(oldname, newname); + zvol_async_rename_minors(oldname, newname); strfree(newname); strfree(oldname); #endif diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index ba6c24486463..a75b82ca5cee 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1914,7 +1914,7 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dd->dd_myname, 8, 1, &dd->dd_object, tx)); #ifdef _KERNEL - zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); + zvol_async_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); #endif dsl_prop_notify_all(dd); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b4831a724c36..d2aa4e46d619 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3084,8 +3084,11 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } #ifdef _KERNEL - if (firstopen) - zvol_create_minors(spa->spa_name); + if (firstopen) { + char *name = spa_name(spa); + zvol_async_create_taskq(name); + zvol_async_create_minors(name); + } #endif *spapp = spa; @@ -4208,7 +4211,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) spa_history_log_version(spa, "import"); #ifdef _KERNEL - zvol_create_minors(pool); + zvol_async_create_taskq(pool); + zvol_async_create_minors(pool); #endif return (0); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index a0626972583e..b768b6dede0a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1485,6 +1485,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); + /* + * Create a taskq for asynchronous zvol-related work in this pool + */ + if (!error) + zvol_async_create_taskq(zc->zc_name); + pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); @@ -1500,8 +1506,10 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); - if (error == 0) - zvol_remove_minors(zc->zc_name); + if (error == 0) { + zvol_async_remove_minors(zc->zc_name); + zvol_async_remove_taskq(zc->zc_name); + } return (error); } @@ -1553,8 +1561,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); - if (error == 0) - zvol_remove_minors(zc->zc_name); + if (error == 0) { + zvol_async_remove_minors(zc->zc_name); + zvol_async_remove_taskq(zc->zc_name); + } return (error); } @@ -2395,7 +2405,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: - err = zvol_set_snapdev(dsname, intval); + err = zvol_async_set_snapdev(dsname, intval); break; case ZFS_PROP_VERSION: { @@ -3192,7 +3202,7 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) #ifdef _KERNEL if (error == 0 && type == DMU_OST_ZVOL) - zvol_create_minors(fsname); + zvol_async_create_minors(fsname); #endif return (error); @@ -3240,7 +3250,7 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) #ifdef _KERNEL if (error == 0) - zvol_create_minors(fsname); + zvol_async_create_minors(fsname); #endif return (error); @@ -3430,7 +3440,7 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { (void) zfs_unmount_snap(nvpair_name(pair)); - (void) zvol_remove_minor(nvpair_name(pair)); + (void) zvol_async_remove_minor(nvpair_name(pair)); } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); @@ -3557,7 +3567,8 @@ zfs_ioc_destroy(zfs_cmd_t *zc) else err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) - (void) zvol_remove_minor(zc->zc_name); + (void) zvol_async_remove_minor(zc->zc_name); + return (err); } @@ -4125,7 +4136,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) #ifdef _KERNEL if (error == 0) - zvol_create_minors(tofs); + (void) zvol_async_create_minors(tofs); #endif /* diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 966cee8e4635..4a08b856d7f8 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -53,6 +53,26 @@ zvol_create_minors_cb(const char *dsname, void *arg); static int zvol_create_snap_minor_cb(const char *dsname, void *arg); +/* + * Perform device minor related actions in taskqs, one taskq + * (one worker thread) per pool + */ +#include +static list_t zvol_async_taskq_list; +static krwlock_t zvol_async_taskq_lock; + +typedef struct zvol_taskq { + const char zt_pool[MAXNAMELEN]; + taskq_t *zt_tq; + list_node_t zt_next; +} zvol_taskq_t; + +/* zvol_create_minors_cb argument */ +typedef struct { + const char *name; + uint64_t snapdev; +} zvcb_arg_t; + unsigned int zvol_inhibit_dev = 0; unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_prefetch_bytes = (128 * 1024); @@ -934,6 +954,7 @@ zvol_first_open(zvol_state_t *zv) error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; goto out_mutex; } @@ -941,6 +962,7 @@ zvol_first_open(zvol_state_t *zv) error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); if (error) { dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; goto out_mutex; } @@ -1314,7 +1336,7 @@ __zvol_create_minor(const char *name) goto out; } - doi = kmem_alloc(sizeof (dmu_object_info_t), KM_PUSHPAGE); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) @@ -1420,7 +1442,7 @@ __zvol_remove_minor(const char *name) /* * Remove a block device minor node for the specified volume. */ -int +static int zvol_remove_minor(const char *name) { int error; @@ -1456,6 +1478,7 @@ __zvol_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_disk, readonly); } + /* * Mask errors to continue dmu_objset_find() traversal */ @@ -1544,7 +1567,7 @@ zvol_create_snap_minor_cb(const char *dsname, void *arg) * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ -int +static int zvol_create_minors(const char *name) { int error = 0; @@ -1552,7 +1575,7 @@ zvol_create_minors(const char *name) if (!zvol_inhibit_dev) { char *atp, *parent; - parent = kmem_alloc(MAXPATHLEN, KM_PUSHPAGE); + parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) strlcpy(parent, name, MAXPATHLEN); if ((atp = strrchr(parent, '@')) != NULL) { @@ -1581,7 +1604,7 @@ zvol_create_minors(const char *name) /* * Remove minors for specified dataset including children and snapshots. */ -void +static void zvol_remove_minors(const char *name) { zvol_state_t *zv, *zv_next; @@ -1609,7 +1632,7 @@ zvol_remove_minors(const char *name) /* * Rename minors for specified dataset including children and snapshots. */ -void +static void zvol_rename_minors(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; @@ -1666,12 +1689,10 @@ snapdev_snapshot_changed_cb(const char *dsname, void *arg) { return (0); } -int +static void zvol_set_snapdev(const char *dsname, uint64_t snapdev) { (void) dmu_objset_find((char *) dsname, snapdev_snapshot_changed_cb, &snapdev, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - /* caller should continue to modify snapdev property */ - return (-1); } int @@ -1684,6 +1705,11 @@ zvol_init(void) mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zvol_async_taskq_list, sizeof (zvol_taskq_t), + offsetof(zvol_taskq_t, zt_next)); + + rw_init(&zvol_async_taskq_lock, NULL, RW_DEFAULT, NULL); + error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); @@ -1696,6 +1722,9 @@ zvol_init(void) return (0); out: + rw_destroy(&zvol_async_taskq_lock); + list_destroy(&zvol_async_taskq_list); + mutex_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); @@ -1705,6 +1734,24 @@ zvol_init(void) void zvol_fini(void) { + zvol_taskq_t *zt = NULL; + + /* + * cleanup async taskqs + * this step performs all the tasks on the queues first + */ + rw_enter(&zvol_async_taskq_lock, RW_WRITER); + while ((zt = list_remove_head(&zvol_async_taskq_list))) { + taskq_destroy(zt->zt_tq); + kmem_free(zt, sizeof (zvol_taskq_t)); + } + rw_exit(&zvol_async_taskq_lock); + + /* cleanup the list and the lock */ + list_destroy(&zvol_async_taskq_list); + rw_destroy(&zvol_async_taskq_lock); + + /* remove minors synchronously */ zvol_remove_minors(NULL); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); @@ -1712,6 +1759,242 @@ zvol_fini(void) list_destroy(&zvol_state_list); } + +/* + * Add/remove taskq per pool, create/remove/rename minor(s) + */ + +/* Create task for the pool, performed at pool create and import */ +static int +zvol_create_taskq(const char *pool) +{ + zvol_taskq_t *zt, *ezt; + char *tqname; + + zt = kmem_zalloc(sizeof (zvol_taskq_t), KM_SLEEP); + + tqname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + snprintf(tqname, MAXNAMELEN, "zvol_tq/%s", pool); + /* Use one thread to assure ordered completion of requests */ + zt->zt_tq = taskq_create(tqname, 1, maxclsyspri, 1, INT_MAX, + TASKQ_PREPOPULATE); + kmem_free(tqname, MAXNAMELEN); + if (zt->zt_tq == NULL) { + kmem_free(zt, sizeof (zvol_taskq_t)); + printk(KERN_INFO + "ZFS: failed to create async work taskq for pool %s\n", + pool); + return (-ENOMEM); + } + + strlcpy((char *)zt->zt_pool, (char *)pool, MAXNAMELEN); + + rw_enter(&zvol_async_taskq_lock, RW_WRITER); + /* check if the queue already exists */ + for (ezt = list_head(&zvol_async_taskq_list); ezt != NULL; + ezt = list_next(&zvol_async_taskq_list, ezt)) { + if (strncmp(ezt->zt_pool, pool, MAXNAMELEN) == 0) { + break; + } + } + if (ezt == NULL) + list_insert_head(&zvol_async_taskq_list, zt); + rw_exit(&zvol_async_taskq_lock); + + if (ezt) { + printk(KERN_INFO + "ZFS: async work taskq for pool %s already exists\n", pool); + taskq_destroy(zt->zt_tq); + kmem_free(zt, sizeof (zvol_taskq_t)); + } + + return (0); +} + +/* Remove task for the pool, performed at pool export and destroy */ +static int +zvol_remove_taskq(const char *pool) +{ + zvol_taskq_t *zt; + + rw_enter(&zvol_async_taskq_lock, RW_WRITER); + for (zt = list_head(&zvol_async_taskq_list); zt != NULL; + zt = list_next(&zvol_async_taskq_list, zt)) { + if (strncmp(zt->zt_pool, pool, MAXNAMELEN) == 0) { + list_remove(&zvol_async_taskq_list, zt); + break; + } + } + rw_exit(&zvol_async_taskq_lock); + + /* This may have to wait for completion of the jobs on the queue */ + if (zt) { + taskq_destroy(zt->zt_tq); + kmem_free(zt, sizeof (zvol_taskq_t)); + return (0); + } + + printk(KERN_INFO "ZFS: async work taskq for pool %s not found\n", pool); + return (-ENOENT); +} + +/* The worker thread function performed asynchronously */ +static void +_zvol_async_task(void *param) +{ + zvol_async_arg_t *arg = (zvol_async_arg_t *)param; + + switch (arg->op) { + case ZVOL_ASYNC_CREATE_MINORS: + (void) zvol_create_minors(arg->name1); + break; + case ZVOL_ASYNC_REMOVE_MINORS: + zvol_remove_minors(arg->name1); + break; + case ZVOL_ASYNC_REMOVE_MINOR: + (void) zvol_remove_minor(arg->name1); + break; + case ZVOL_ASYNC_RENAME_MINORS: + zvol_rename_minors(arg->name1, arg->name2); + break; + case ZVOL_ASYNC_SET_SNAPDEV: + zvol_set_snapdev(arg->name1, arg->value); + break; + default: + /* should not get here */ + printk(KERN_INFO "ZFS: invalid async work operation %d\n", + arg->op); + break; + } + + /* the task is performed, free the argument */ + kmem_free(arg, sizeof (zvol_async_arg_t)); +} + +/* Lookup the taskq for the pool and dispatch the task */ +static int +zvol_dispatch_taskq(zvol_async_arg_t *arg) +{ + int error = 0; + zvol_taskq_t *zt; + + rw_enter(&zvol_async_taskq_lock, RW_READER); + for (zt = list_head(&zvol_async_taskq_list); zt != NULL; + zt = list_next(&zvol_async_taskq_list, zt)) { + if (strncmp(zt->zt_pool, arg->pool, MAXNAMELEN) == 0) + break; + } + if (zt) { + error = taskq_dispatch(zt->zt_tq, _zvol_async_task, + arg, TQ_SLEEP); + } else { + printk(KERN_INFO "ZFS: async work taskq for pool %s not found;" + " failed to dispatch work op %d\n", arg->pool, arg->op); + error = -ENOENT; + } + rw_exit(&zvol_async_taskq_lock); + + return (error); +} + +static int +zvol_async_dispatch(zvol_async_op_t op, const char *name1, const char *name2, + uint64_t val) +{ + int error; + char *delim; + size_t len; + zvol_async_arg_t *arg; + + switch (op) { + case ZVOL_ASYNC_CREATE_TASKQ: + error = zvol_create_taskq(name1); + break; + case ZVOL_ASYNC_REMOVE_TASKQ: + error = zvol_remove_taskq(name1); + break; + case ZVOL_ASYNC_CREATE_MINORS: + case ZVOL_ASYNC_REMOVE_MINORS: + case ZVOL_ASYNC_REMOVE_MINOR: + case ZVOL_ASYNC_RENAME_MINORS: + case ZVOL_ASYNC_SET_SNAPDEV: + /* + * these requests are dispatched to a taskq and + * the argument is freed in the worker function + */ + arg = kmem_zalloc(sizeof (zvol_async_arg_t), KM_SLEEP); + + arg->op = op; + /* extract the pool name from the dataset name */ + delim = strchr(name1, '/'); + /* + 1 for strlcpy() to accommodate the \0 */ + len = (delim) ? (delim - name1 + 1) : MAXNAMELEN; + strlcpy(arg->pool, name1, len); + strlcpy(arg->name1, name1, MAXNAMELEN); + if (name2) + strlcpy(arg->name2, name2, MAXNAMELEN); + arg->value = val; + + error = zvol_dispatch_taskq(arg); + break; + default: + printk(KERN_INFO "ZFS: invalid async op %d for %s\n", + op, name1); + error = -EINVAL; + break; + } + + return (error); +} + +/* + * The external interfaces wrapping zvol_async_dispatch() + */ +void +zvol_async_create_taskq(const char *pool) +{ + zvol_async_dispatch(ZVOL_ASYNC_CREATE_TASKQ, pool, NULL, 0); +} + +void +zvol_async_remove_taskq(const char *pool) +{ + zvol_async_dispatch(ZVOL_ASYNC_REMOVE_TASKQ, pool, NULL, 0); +} + +void +zvol_async_create_minors(const char *name) +{ + zvol_async_dispatch(ZVOL_ASYNC_CREATE_MINORS, name, NULL, 0); +} + +void +zvol_async_remove_minors(const char *name) +{ + zvol_async_dispatch(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, 0); +} + +void +zvol_async_remove_minor(const char *name) +{ + zvol_async_dispatch(ZVOL_ASYNC_REMOVE_MINOR, name, NULL, 0); +} + +void +zvol_async_rename_minors(const char *name1, const char *name2) +{ + zvol_async_dispatch(ZVOL_ASYNC_RENAME_MINORS, name1, name2, 0); +} + +int +zvol_async_set_snapdev(const char *dsname, uint64_t snapdev) { + zvol_async_dispatch(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); + /* caller should continue to modify snapdev property */ + return (-1); +} + +/* Parameter definitions */ + module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); From bb30425a7ebd2965568acbb3ff0cc1f6ed229b0a Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Wed, 23 Sep 2015 12:34:51 -0400 Subject: [PATCH 3/3] zfsonlinux issue #3681 - lock order inversion between zvol_open() and dsl_pool_sync()...zvol_rename_minors() Remove trylock of spa_namespace_lock as it is no longer needed; the code paths that take zvol_state_lock no longer take spa_namespace_lock prior to taking zvol_state_lock --- module/zfs/zvol.c | 60 ++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 43 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 4a08b856d7f8..b6b4fb211485 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -915,56 +915,28 @@ zvol_first_open(zvol_state_t *zv) { objset_t *os; uint64_t volsize; - int locked = 0; int error; + int owned = 0; uint64_t ro; - /* - * In all other cases the spa_namespace_lock is taken before the - * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() - * function calls fops->open() with the bdev->bd_mutex lock held. - * - * To avoid a potential lock inversion deadlock we preemptively - * try to take the spa_namespace_lock(). Normally it will not - * be contended and this is safe because spa_open_common() handles - * the case where the caller already holds the spa_namespace_lock. - * - * When it is contended we risk a lock inversion if we were to - * block waiting for the lock. Luckily, the __blkdev_get() - * function allows us to return -ERESTARTSYS which will result in - * bdev->bd_mutex being dropped, reacquired, and fops->open() being - * called again. This process can be repeated safely until both - * locks are acquired. - */ - if (!mutex_owned(&spa_namespace_lock)) { - locked = mutex_tryenter(&spa_namespace_lock); - if (!locked) - return (-SET_ERROR(ERESTARTSYS)); - } - - error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); - if (error) - goto out_mutex; - /* lie and say we're read-only */ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); if (error) - goto out_mutex; + goto out_owned; + zv->zv_objset = os; + owned = 1; + + error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); + if (error) + goto out_owned; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - if (error) { - dmu_objset_disown(os, zvol_tag); - zv->zv_objset = NULL; - goto out_mutex; - } + if (error) + goto out_owned; - zv->zv_objset = os; error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); - if (error) { - dmu_objset_disown(os, zvol_tag); - zv->zv_objset = NULL; - goto out_mutex; - } + if (error) + goto out_owned; set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; @@ -979,9 +951,11 @@ zvol_first_open(zvol_state_t *zv) zv->zv_flags &= ~ZVOL_RDONLY; } -out_mutex: - if (locked) - mutex_exit(&spa_namespace_lock); +out_owned: + if (error && owned) { + dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; + } return (SET_ERROR(-error)); }