From dd2a7947a723c5f37b8f52e7cd5765ebb596ec15 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 5 Aug 2014 13:46:49 -0700
Subject: [PATCH 1/3] Avoid 128K kmem allocations in mzap_upgrade()

As originally implemented the mzap_upgrade() function will
perform up to SPA_MAXBLOCKSIZE allocations using kmem_alloc().
These large allocations can potentially block indefinitely
if contiguous memory is not available.  Since this allocation
is done under the zap->zap_rwlock it can appear as if there is
a deadlock in zap_lockdir().  This is shown below.

The optimal fix for this would be to rework mzap_upgrade()
such that no large allocations are required.  This could be
done but it would result in us diverging further from the other
implementations.  Therefore I've opted against doing this
unless it becomes absolutely necessary.

Instead mzap_upgrade() has been updated to use zio_buf_alloc()
which can reliably provide buffers of up to SPA_MAXBLOCKSIZE.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #2580
---
 module/zfs/zap_micro.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 68fb747697d2..2249b7338027 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -533,7 +533,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	sz = zap->zap_dbuf->db_size;
-	mzp = kmem_alloc(sz, KM_PUSHPAGE | KM_NODEBUG);
+	mzp = zio_buf_alloc(sz);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
@@ -541,7 +541,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err) {
-			kmem_free(mzp, sz);
+			zio_buf_free(mzp, sz);
 			return (err);
 		}
 	}
@@ -567,7 +567,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 		if (err)
 			break;
 	}
-	kmem_free(mzp, sz);
+	zio_buf_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }

From ca043ca48aecc85de05f207f801c3e1f9358da79 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 4 Aug 2014 13:30:20 -0700
Subject: [PATCH 2/3] Add zfs_iput_async() interface

Handle all iputs in zfs_purgedir() and zfs_inode_destroy()
asynchronously to prevent deadlocks.  When the iputs are allowed
to run synchronously in the destroy call path deadlocks between
xattr directory inodes and their parent file inodes are possible.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #457
---
 include/sys/zfs_vnops.h |  1 +
 module/zfs/zfs_dir.c    |  7 ++++---
 module/zfs/zfs_vnops.c  | 18 +++++++++++-------
 module/zfs/zfs_znode.c  |  2 +-
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h
index c9fecf8ba98e..c331035c544a 100644
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@@ -79,6 +79,7 @@ extern int zfs_putpage(struct inode *ip, struct page *pp,
 extern int zfs_dirty_inode(struct inode *ip, int flags);
 extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
     size_t len, unsigned long vm_flags);
+extern void zfs_iput_async(struct inode *ip);
 
 #ifdef	__cplusplus
 }
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 448a8727ebbb..d2b5bc4088fa 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -46,6 +46,7 @@
 #include <sys/policy.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
+#include <sys/zfs_vnops.h>
 #include <sys/fs/zfs.h>
 #include "fs/fs_subr.h"
 #include <sys/zap.h>
@@ -528,7 +529,7 @@ zfs_purgedir(znode_t *dzp)
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
-			iput(ZTOI(xzp));
+			zfs_iput_async(ZTOI(xzp));
 			skipped += 1;
 			continue;
 		}
@@ -541,7 +542,7 @@ zfs_purgedir(znode_t *dzp)
 			skipped += 1;
 		dmu_tx_commit(tx);
 
-		iput(ZTOI(xzp));
+		zfs_iput_async(ZTOI(xzp));
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT)
@@ -729,7 +730,7 @@ zfs_rmnode(znode_t *zp)
 	dmu_tx_commit(tx);
 out:
 	if (xzp)
-		iput(ZTOI(xzp));
+		zfs_iput_async(ZTOI(xzp));
 }
 
 static uint64_t
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4f531733e42f..18b2564a270d 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -101,7 +101,7 @@
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
- *	If you must call iput() within a tx then use iput_ASYNC().
+ *	If you must call iput() within a tx then use zfs_iput_async().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
@@ -927,12 +927,17 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 }
 EXPORT_SYMBOL(zfs_write);
 
-static void
-iput_async(struct inode *ip, taskq_t *taskq)
+void
+zfs_iput_async(struct inode *ip)
 {
+	objset_t *os = ITOZSB(ip)->z_os;
+
 	ASSERT(atomic_read(&ip->i_count) > 0);
+	ASSERT(os != NULL);
+
 	if (atomic_read(&ip->i_count) == 1)
-		taskq_dispatch(taskq, (task_func_t *)iput, ip, TQ_PUSHPAGE);
+		taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
+		    (task_func_t *)iput, ip, TQ_PUSHPAGE);
 	else
 		iput(ip);
 }
@@ -941,7 +946,6 @@ void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *zp = zgd->zgd_private;
-	objset_t *os = ZTOZSB(zp)->z_os;
 
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
@@ -952,7 +956,7 @@ zfs_get_done(zgd_t *zgd, int error)
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
 	 */
-	iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
+	zfs_iput_async(ZTOI(zp));
 
 	if (error == 0 && zgd->zgd_bp)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
@@ -994,7 +998,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
-		iput_async(ZTOI(zp), dsl_pool_iput_taskq(dmu_objset_pool(os)));
+		zfs_iput_async(ZTOI(zp));
 		return (SET_ERROR(ENOENT));
 	}
 
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 4403e3289c9c..5fcb9e930f74 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -292,7 +292,7 @@ zfs_inode_destroy(struct inode *ip)
 	}
 
 	if (zp->z_xattr_parent) {
-		iput(ZTOI(zp->z_xattr_parent));
+		zfs_iput_async(ZTOI(zp->z_xattr_parent));
 		zp->z_xattr_parent = NULL;
 	}
 

From 9cff2296d4a054db266fc65b5e806ebb84d79556 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 31 Jul 2014 11:19:47 -0700
Subject: [PATCH 3/3] Revert "Revert "Revert "Fix unlink/xattr deadlock"""

This reverts commit 7973e46 which brings the basic flow of the
code back in line with the other ZFS implementations.  This
was possible due to the following related changes.

e89260a Directory xattr znodes hold a reference on their parent
6f9548c Fix deadlock in zfs_zget()
ca043ca Add zfs_iput_async() interface
dd2a794 Avoid 128K kmem allocations in mzap_upgrade()

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #457
Issue #2058
Issue #2128
Issue #2240
---
 module/zfs/zfs_dir.c | 142 +++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 85 deletions(-)

diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index d2b5bc4088fa..712cb4656430 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -484,6 +484,57 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 	    zap_add_int(zsb->z_os, zsb->z_unlinkedobj, zp->z_id, tx));
 }
 
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfs_sb_t *zsb)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	int		error;
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zsb->z_os, zsb->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zsb->z_os, zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zsb, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		zp->z_unlinked = B_TRUE;
+		iput(ZTOI(zp));
+	}
+	zap_cursor_fini(&zc);
+}
+
 /*
  * Delete the entire contents of a directory.  Return a count
  * of the number of entries that could not be deleted. If we encounter
@@ -517,7 +568,8 @@ zfs_purgedir(znode_t *dzp)
 			continue;
 		}
 
-		ASSERT(S_ISREG(ZTOI(xzp)->i_mode)||S_ISLNK(ZTOI(xzp)->i_mode));
+		ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
+		    S_ISLNK(ZTOI(xzp)->i_mode));
 
 		tx = dmu_tx_create(zsb->z_os);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
@@ -550,71 +602,6 @@ zfs_purgedir(znode_t *dzp)
 	return (skipped);
 }
 
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfs_sb_t *zsb)
-{
-	zap_cursor_t	zc;
-	zap_attribute_t zap;
-	dmu_object_info_t doi;
-	znode_t		*zp;
-	int		error;
-
-	/*
-	 * Iterate over the contents of the unlinked set.
-	 */
-	for (zap_cursor_init(&zc, zsb->z_os, zsb->z_unlinkedobj);
-	    zap_cursor_retrieve(&zc, &zap) == 0;
-	    zap_cursor_advance(&zc)) {
-
-		/*
-		 * See what kind of object we have in list
-		 */
-
-		error = dmu_object_info(zsb->z_os, zap.za_first_integer, &doi);
-		if (error != 0)
-			continue;
-
-		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
-		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
-		/*
-		 * We need to re-mark these list entries for deletion,
-		 * so we pull them back into core and set zp->z_unlinked.
-		 */
-		error = zfs_zget(zsb, zap.za_first_integer, &zp);
-
-		/*
-		 * We may pick up znodes that are already marked for deletion.
-		 * This could happen during the purge of an extended attribute
-		 * directory.  All we need to do is skip over them, since they
-		 * are already in the system marked z_unlinked.
-		 */
-		if (error != 0)
-			continue;
-
-		zp->z_unlinked = B_TRUE;
-
-		/*
-		 * If this is an attribute directory, purge its contents.
-		 */
-		if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
-			/*
-			 * We don't need to check the return value of
-			 * zfs_purgedir here, because zfs_rmnode will just
-			 * return this xattr directory to the unlinked set
-			 * until all of its xattrs are gone.
-			 */
-			(void) zfs_purgedir(zp);
-		}
-
-		iput(ZTOI(zp));
-	}
-	zap_cursor_fini(&zc);
-}
-
 void
 zfs_rmnode(znode_t *zp)
 {
@@ -624,7 +611,6 @@ zfs_rmnode(znode_t *zp)
 	dmu_tx_t	*tx;
 	uint64_t	acl_obj;
 	uint64_t	xattr_obj;
-	uint64_t	count;
 	int		error;
 
 	ASSERT(zp->z_links == 0);
@@ -634,27 +620,13 @@ zfs_rmnode(znode_t *zp)
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
-		error = zap_count(os, zp->z_id, &count);
-		if (error) {
-			zfs_znode_dmu_fini(zp);
-			return;
-		}
-
-		if (count > 0) {
-			taskq_t *taskq;
-
+		if (zfs_purgedir(zp) != 0) {
 			/*
-			 * There are still directory entries in this xattr
-			 * directory.  Let zfs_unlinked_drain() deal with
-			 * them to avoid deadlocking this process in the
-			 * zfs_purgedir()->zfs_zget()->ilookup() callpath
-			 * on the xattr inode's I_FREEING bit.
+			 * Not enough space to delete some xattrs.
+			 * Leave it in the unlinked set.
 			 */
-			taskq = dsl_pool_iput_taskq(dmu_objset_pool(os));
-			taskq_dispatch(taskq, (task_func_t *)
-			    zfs_unlinked_drain, zsb, TQ_SLEEP);
-
 			zfs_znode_dmu_fini(zp);
+
 			return;
 		}
 	}