Commit

Add zfs_sb_prune_compat() function
For kernels which do not implement a per-superblock shrinker, the
shrink_dcache_parent() function was used to attempt to reclaim
dentries.  This was found not to be entirely reliable, which could
lead to performance issues on older kernels running metadata-heavy
workloads.

To address this issue, a zfs_sb_prune_compat() function has been
added to implement this functionality.  It works by traversing
the list of znodes for a filesystem and adding them to a private
list with a reference held.  The private list can then be safely
walked outside the z_znodes_lock to prune dentries and drop the
last reference so the inode can be freed.

This provides the same synchronous behavior as the per-filesystem
shrinker and has the advantage of depending only on long-standing
interfaces.

The number of threads in the iput taskq has also been increased
to speed up the handling of asynchronous iputs.  This improves
the rate of metadata reclaim regardless of the kernel version.

Signed-off-by: Brian Behlendorf <[email protected]>
behlendorf committed Jun 17, 2015
1 parent 036391c commit 78aa2b1
Showing 4 changed files with 75 additions and 13 deletions.
1 change: 1 addition & 0 deletions include/sys/zfs_znode.h
@@ -211,6 +211,7 @@ typedef struct znode {
nvlist_t *z_xattr_cached; /* cached xattrs */
struct znode *z_xattr_parent; /* xattr parent znode */
list_node_t z_link_node; /* all znodes in fs link */
list_node_t z_prune_node; /* znodes being pruned link */

@dweeezil Jun 18, 2015

Could we conditionally not compile this? It would be nice to save the space on systems which don't need it. Ditto for its other uses outside the prune path.

@behlendorf (Author) Jun 18, 2015

Absolutely, good thought. I originally wanted to avoid adding this field entirely by using zfs_iput_async(), but I ran into lock contention problems when pushing everything through the taskq. Wrapping it with a conditional is almost as good.
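
A minimal sketch of that conditional, reusing the HAVE_SHRINK / HAVE_SPLIT_SHRINKER_CALLBACK guards from zfs_vfsops.c below (how the field and its users would be wrapped is an assumption, not part of this commit):

/*
 * Hypothetical: only compile z_prune_node on kernels which lack a
 * per-filesystem shrinker and therefore need the compat prune path.
 * The list_link_init()/ASSERT uses in zfs_znode.c would need the
 * same guard.
 */
#if !defined(HAVE_SHRINK) && !defined(HAVE_SPLIT_SHRINKER_CALLBACK)
	list_node_t	z_prune_node;	/* znodes being pruned link */
#endif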

sa_handle_t *z_sa_hdl; /* handle to sa data */
boolean_t z_is_sa; /* are we native sa? */
boolean_t z_is_zvol; /* are we used by the zvol */
4 changes: 2 additions & 2 deletions module/zfs/dsl_pool.c
@@ -170,8 +170,8 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
1, 4, 0);
dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, minclsyspri,
max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE);

return (dp);
}
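For context, the "asynchronous iputs" from the commit message are tasks dispatched to this taskq. A rough sketch of such a dispatch helper, using the zfs_iput_async() name mentioned in the discussion above (the exact body here is an assumption for illustration):

/*
 * Sketch: drop an inode reference asynchronously via the pool's iput
 * taskq so the caller never blocks on inode eviction.  Widening the
 * taskq from 1 to max_ncpus threads lets these tasks drain in parallel.
 */
void
zfs_iput_async(struct inode *ip)
{
	objset_t *os = ITOZSB(ip)->z_os;

	/* Defer only when this is likely the final reference. */
	if (atomic_read(&ip->i_count) == 1)
		taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
		    (task_func_t *)iput, ip, TQ_SLEEP);
	else
		iput(ip);
}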
81 changes: 70 additions & 11 deletions module/zfs/zfs_vfsops.c
@@ -1072,6 +1072,75 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp)
}
EXPORT_SYMBOL(zfs_root);

#if !defined(HAVE_SHRINK) && !defined(HAVE_SPLIT_SHRINKER_CALLBACK)
/*
* Linux kernels older than 3.1 do not support a per-filesystem shrinker.
* To accomodate this we must improvise and manually walk the list of znodes

@dweeezil Jun 18, 2015

s/accomodate/accommodate/

* attempting to prune dentries in order to be able to drop the inodes.
*
* To avoid scanning the same znodes multiple times they are always rotated
* to the end of the z_all_znodes list. New znodes are inserted at the
* end of the list so we're always scanning the oldest znodes first.
*/
static int
zfs_sb_prune_compat(zfs_sb_t *zsb, unsigned long nr_to_scan)
{
list_t prune_list;
znode_t *zp;
int objects = 0;
int i = 0;

list_create(&prune_list, sizeof (znode_t),
offsetof(znode_t, z_prune_node));

mutex_enter(&zsb->z_znodes_lock);

@dweeezil Jun 18, 2015

I'm wondering how hotly contended this lock might become. Among other things, it's needed along the .destroy_inode callback path.

@behlendorf (Author) Jun 18, 2015

My guess is not that hot, but I've never measured it. It's briefly taken during znode creation, destruction, and fs teardown. It's always been something of a sore point as well, since it's technically entirely wasted overhead under Linux; the super block already maintains a similar list.

This is one of those things it would be nice to remove someday when refactoring the VFS code. It would let us shrink the znode by a list_node_t (16 bytes). Similarly, the znode_t itself is highly redundant with a Linux inode; merging the two would save us lots of space per file. This has been on the cleanup list forever, see openzfs#227.

while ((zp = list_head(&zsb->z_all_znodes)) != NULL) {

if (i++ > nr_to_scan)
break;

ASSERT(list_link_active(&zp->z_link_node));
list_remove(&zsb->z_all_znodes, zp);
list_insert_tail(&zsb->z_all_znodes, zp);

@dweeezil Jun 18, 2015

I like this. I've wondered, but never looked into, whether other shrinkers in the kernel do the same thing, which would lessen the need to call them with ever-higher nr_to_scan values.

@behlendorf (Author) Jun 18, 2015

It does; an active inode may be rotated to the end of the per-superblock lists. But the specifics vary significantly by kernel version.

if (!mutex_tryenter(&zp->z_lock))
continue;

if (list_link_active(&zp->z_prune_node)) {
mutex_exit(&zp->z_lock);
continue;
}

if (igrab(ZTOI(zp)) == NULL) {
mutex_exit(&zp->z_lock);
continue;
}

list_insert_tail(&prune_list, zp);
mutex_exit(&zp->z_lock);
}
mutex_exit(&zsb->z_znodes_lock);

while ((zp = list_head(&prune_list)) != NULL) {

d_prune_aliases(ZTOI(zp));

if (atomic_read(&ZTOI(zp)->i_count) == 1)
objects++;

mutex_enter(&zp->z_lock);
list_remove(&prune_list, zp);
mutex_exit(&zp->z_lock);

iput(ZTOI(zp));
}

list_destroy(&prune_list);

return (objects);
}
#endif

/*
* The ARC has requested that the filesystem drop entries from the dentry
* and inode caches. This can occur when the ARC needs to free meta data
@@ -1107,17 +1176,7 @@ zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
#elif defined(HAVE_SHRINK)
*objects = (*shrinker->shrink)(shrinker, &sc);
#else
/*
* Linux kernels older than 3.1 do not support a per-filesystem
* shrinker. Therefore, we must fall back to the only available
* interface which is to discard all unused dentries and inodes.
* This behavior clearly isn't ideal but it's required so the ARC
* may free memory. The performance impact is mitigated by the
* fact that the frequently accessed dentry and inode buffers will
* still be in the ARC making them relatively cheap to recreate.
*/
*objects = 0;
shrink_dcache_parent(sb->s_root);
*objects = zfs_sb_prune_compat(zsb, nr_to_scan);
#endif
ZFS_EXIT(zsb);

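For reference, zfs_sb_prune() is invoked from the ARC's prune callback path; a caller might look roughly like this (the zpl_prune_sb() name and signature are assumptions for illustration):

/*
 * Sketch: per-superblock prune callback.  The ARC passes the number of
 * objects it wants dropped; zfs_sb_prune() reports back how many
 * dentries/inodes were actually freed.
 */
static void
zpl_prune_sb(struct super_block *sb, void *arg)
{
	unsigned long nr_to_scan = *(unsigned long *)arg;
	int objects = 0;

	(void) zfs_sb_prune(sb, nr_to_scan, &objects);
}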
2 changes: 2 additions & 0 deletions module/zfs/zfs_znode.c
@@ -104,6 +104,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)

inode_init_once(ZTOI(zp));
list_link_init(&zp->z_link_node);
list_link_init(&zp->z_prune_node);

mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
@@ -130,6 +131,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
znode_t *zp = buf;

ASSERT(!list_link_active(&zp->z_link_node));
ASSERT(!list_link_active(&zp->z_prune_node));
mutex_destroy(&zp->z_lock);
rw_destroy(&zp->z_parent_lock);
rw_destroy(&zp->z_name_lock);

1 comment on commit 78aa2b1

@behlendorf (Owner, Author)

Thanks for the speedy review. I'll push a reworked version of this patch shortly after a little additional stress testing.
