From b00131d43ca344d4b205a03ab3eb771a060e5087 Mon Sep 17 00:00:00 2001
From: Gunnar Beutner <gunnar@beutner.name>
Date: Mon, 20 Jun 2011 10:36:58 -0700
Subject: [PATCH] Fix unlink/xattr deadlock

The problem here is that prune_icache() tries to evict/delete
both the xattr directory inode as well as at least one xattr
inode contained in that directory. Here's what happens:

1. File is created.
2. xattr is created for that file (behind the scenes a xattr
   directory and a file in that xattr directory are created)
3. File is deleted.
4. Both the xattr directory inode and at least one xattr
   inode from that directory are evicted by prune_icache();
   prune_icache() acquires a lock on both inodes before it
   calls ->evict() on the inodes

When the xattr directory inode is evicted zfs_zinactive attempts
to delete the xattr files contained in that directory. While
enumerating these files zfs_zget() is called to obtain a reference
to the xattr file znode - which tries to lock the xattr inode.
However that very same xattr inode was already locked by
prune_icache() further up the call stack, thus leading to a
deadlock.

This can be reliably reproduced like this:
$ touch test
$ attr -s a -V b test
$ rm test
$ echo 3 > /proc/sys/vm/drop_caches

This patch fixes the deadlock by moving the zfs_purgedir() call to
zfs_unlinked_drain().  Instead zfs_rmnode() now checks whether the
xattr dir is empty and leaves the xattr dir in the unlinked set if
it finds any xattrs.

To ensure zfs_unlinked_drain() never accesses a stale super block
zfsvfs_teardown() has been update to block until the iput taskq
has been drained.  This avoids a potential race where a file with
an xattr directory is removed and the file system is immediately
unmounted.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #266
---
 module/zfs/zfs_dir.c    | 139 ++++++++++++++++++++++++----------------
 module/zfs/zfs_vfsops.c |   6 ++
 2 files changed, 90 insertions(+), 55 deletions(-)

diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 58a18bcfc985..0df7cc1f83d8 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -473,57 +473,6 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 	    zap_add_int(zsb->z_os, zsb->z_unlinkedobj, zp->z_id, tx));
 }
 
-/*
- * Clean up any znodes that had no links when we either crashed or
- * (force) umounted the file system.
- */
-void
-zfs_unlinked_drain(zfs_sb_t *zsb)
-{
-	zap_cursor_t	zc;
-	zap_attribute_t zap;
-	dmu_object_info_t doi;
-	znode_t		*zp;
-	int		error;
-
-	/*
-	 * Interate over the contents of the unlinked set.
-	 */
-	for (zap_cursor_init(&zc, zsb->z_os, zsb->z_unlinkedobj);
-	    zap_cursor_retrieve(&zc, &zap) == 0;
-	    zap_cursor_advance(&zc)) {
-
-		/*
-		 * See what kind of object we have in list
-		 */
-
-		error = dmu_object_info(zsb->z_os, zap.za_first_integer, &doi);
-		if (error != 0)
-			continue;
-
-		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
-		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
-		/*
-		 * We need to re-mark these list entries for deletion,
-		 * so we pull them back into core and set zp->z_unlinked.
-		 */
-		error = zfs_zget(zsb, zap.za_first_integer, &zp);
-
-		/*
-		 * We may pick up znodes that are already marked for deletion.
-		 * This could happen during the purge of an extended attribute
-		 * directory.  All we need to do is skip over them, since they
-		 * are already in the system marked z_unlinked.
-		 */
-		if (error != 0)
-			continue;
-
-		zp->z_unlinked = B_TRUE;
-		iput(ZTOI(zp));
-	}
-	zap_cursor_fini(&zc);
-}
-
 /*
  * Delete the entire contents of a directory.  Return a count
  * of the number of entries that could not be deleted. If we encounter
@@ -590,6 +539,71 @@ zfs_purgedir(znode_t *dzp)
 	return (skipped);
 }
 
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfs_sb_t *zsb)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	int		error;
+
+	/*
+	 * Interate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zsb->z_os, zsb->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zsb->z_os, zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zsb, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		zp->z_unlinked = B_TRUE;
+
+		/*
+		 * If this is an attribute directory, purge its contents.
+		 */
+		if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
+			/*
+			 * We don't need to check the return value of
+			 * zfs_purgedir here, because zfs_rmnode will just
+			 * return this xattr directory to the unlinked set
+			 * until all of its xattrs are gone.
+			 */
+			(void) zfs_purgedir(zp);
+		}
+
+		iput(ZTOI(zp));
+	}
+	zap_cursor_fini(&zc);
+}
+
 void
 zfs_rmnode(znode_t *zp)
 {
@@ -599,6 +613,7 @@ zfs_rmnode(znode_t *zp)
 	dmu_tx_t	*tx;
 	uint64_t	acl_obj;
 	uint64_t	xattr_obj;
+	uint64_t	count;
 	int		error;
 
 	ASSERT(zp->z_links == 0);
@@ -608,13 +623,27 @@ zfs_rmnode(znode_t *zp)
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
-		if (zfs_purgedir(zp) != 0) {
+		error = zap_count(os, zp->z_id, &count);
+		if (error) {
+			zfs_znode_dmu_fini(zp);
+			return;
+		}
+
+		if (count > 0) {
+			taskq_t *taskq;
+
 			/*
-			 * Not enough space to delete some xattrs.
-			 * Leave it in the unlinked set.
+			 * There are still directory entries in this xattr
+			 * directory.  Let zfs_unlinked_drain() deal with
+			 * them to avoid deadlocking this process in the
+			 * zfs_purgedir()->zfs_zget()->ilookup() callpath
+			 * on the xattr inode's I_FREEING bit.
 			 */
-			zfs_znode_dmu_fini(zp);
+			taskq = dsl_pool_iput_taskq(dmu_objset_pool(os));
+			taskq_dispatch(taskq, (task_func_t *)
+			    zfs_unlinked_drain, zsb, TQ_SLEEP);
 
+			zfs_znode_dmu_fini(zp);
 			return;
 		}
 	}
diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c
index e71aa91e1d68..257563887052 100644
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@@ -1091,6 +1091,12 @@ zfsvfs_teardown(zfs_sb_t *zsb, boolean_t unmounting)
 		(void) spl_invalidate_inodes(zsb->z_parent->z_sb, 0);
 	}
 
+	/*
+	 * Drain the iput_taskq to ensure all active references to the
+	 * zfs_sb_t have been handled only then can it be safely destroyed.
+	 */
+	taskq_wait(dsl_pool_iput_taskq(dmu_objset_pool(zsb->z_os)));
+
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.