From 2a940ed6bb40ccfda4dfe53494d5a469f8b8bfd7 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <david.chen@osnexus.com>
Date: Mon, 11 Apr 2016 14:53:48 -0700
Subject: [PATCH] Remove dummy znode from zvol_state

struct zvol_state contains a dummy znode, which is around 1KB on x64, only for
zfs_range_lock. But in reality, other than z_range_lock and z_range_avl,
zfs_range_lock only needs the znode for regular files, which means we add 1KB
to the structure and gain nothing.

In this patch, we remove the dummy znode from zvol_state. In order to do that,
we also need to refactor zfs_range_lock a bit. We move the z_range_lock and
z_range_avl pair out of znode_t to form zfs_rlock_t. This new struct replaces
znode_t as the main handle inside the range lock functions. Since regular
files still need the znode for RL_WRITER and RL_APPEND, we make the znode an
optional argument to zfs_range_lock_impl.

To reduce possible merge conflicts, we retain the calling convention of
zfs_range_lock, which is now a macro wrapping zfs_range_lock_impl. Zvol code
should now call zvol_range_lock instead.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
---
 include/sys/zfs_rlock.h | 35 +++++++++++++++++++--
 include/sys/zfs_znode.h |  5 ++-
 module/zfs/zfs_ctldir.c |  1 -
 module/zfs/zfs_rlock.c  | 68 ++++++++++++++++++++++-------------------
 module/zfs/zfs_znode.c  |  8 ++---
 module/zfs/zvol.c       | 31 +++++++++----------
 6 files changed, 86 insertions(+), 62 deletions(-)

diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h
index ea5e40369291..cbc573a40cde 100644
--- a/include/sys/zfs_rlock.h
+++ b/include/sys/zfs_rlock.h
@@ -32,7 +32,9 @@ extern "C" {
 
 #ifdef _KERNEL
 
-#include <sys/zfs_znode.h>
+#include <sys/list.h>
+#include <sys/avl.h>
+#include <sys/condvar.h>
 
 typedef enum {
 	RL_READER,
@@ -40,8 +42,13 @@ typedef enum {
 	RL_APPEND
 } rl_type_t;
 
+typedef struct zfs_rlock {
+	kmutex_t zr_mutex;	/* protects changes to zr_avl */
+	avl_tree_t zr_avl;	/* avl tree of range locks */
+} zfs_rlock_t;
+
 typedef struct rl {
-	znode_t *r_zp;		/* znode this lock applies to */
+	zfs_rlock_t *r_zrl;	/* range lock this entry belongs to */
 	avl_node_t r_node;	/* avl node link */
 	uint64_t r_off;		/* file range offset */
 	uint64_t r_len;		/* file range length */
@@ -55,16 +62,26 @@ typedef struct rl {
 	list_node_t rl_node;	/* used for deferred release */
 } rl_t;
 
+struct znode;
 /*
  * Lock a range (offset, length) as either shared (RL_READER)
  * or exclusive (RL_WRITER or RL_APPEND).  RL_APPEND is a special type that
  * is converted to RL_WRITER that specified to lock from the start of the
  * end of file.  Returns the range lock structure.
+ *
+ * Filesystem should call zfs_range_lock (a macro; it evaluates zp twice).
+ * Zvol should call zvol_range_lock.
  */
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+rl_t *zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
+    rl_type_t type, struct znode *zp);
+#define	zfs_range_lock(zp, off, len, type) \
+	zfs_range_lock_impl(&(zp)->z_range_lock, off, len, type, zp)
+#define	zvol_range_lock(zrl, off, len, type) \
+	zfs_range_lock_impl(zrl, off, len, type, NULL)
 
 /* Unlock range and destroy range lock structure. */
 void zfs_range_unlock(rl_t *rl);
+#define	zvol_range_unlock(rl) zfs_range_unlock(rl)
 
 /*
  * Reduce range locked as RW_WRITER from whole file to specified range.
@@ -78,6 +95,18 @@ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
  */
 int zfs_range_compare(const void *arg1, const void *arg2);
 
+static inline void zfs_rlock_init(zfs_rlock_t *zrl)
+{
+	mutex_init(&zrl->zr_mutex, NULL, MUTEX_DEFAULT, NULL);
+	avl_create(&zrl->zr_avl, zfs_range_compare,
+	    sizeof (rl_t), offsetof(rl_t, r_node));
+}
+
+static inline void zfs_rlock_destroy(zfs_rlock_t *zrl)
+{
+	avl_destroy(&zrl->zr_avl);
+	mutex_destroy(&zrl->zr_mutex);
+}
 #endif /* _KERNEL */
 
 #ifdef	__cplusplus
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index 65fa1039926f..30a6ec9f807a 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -37,6 +37,7 @@
 #include <sys/rrwlock.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_stat.h>
+#include <sys/zfs_rlock.h>
 #endif
 #include <sys/zfs_acl.h>
 #include <sys/zil.h>
@@ -187,8 +188,7 @@ typedef struct znode {
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
-	kmutex_t	z_range_lock;	/* protects changes to z_range_avl */
-	avl_tree_t	z_range_avl;	/* avl tree of file range locks */
+	zfs_rlock_t	z_range_lock;	/* file range lock */
 	uint8_t		z_unlinked;	/* file has been unlinked */
 	uint8_t		z_atime_dirty;	/* atime needs to be synced */
 	uint8_t		z_zn_prefetch;	/* Prefetch znodes? */
@@ -212,7 +212,6 @@ typedef struct znode {
 	list_node_t	z_link_node;	/* all znodes in fs link */
 	sa_handle_t	*z_sa_hdl;	/* handle to sa data */
 	boolean_t	z_is_sa;	/* are we native sa? */
-	boolean_t	z_is_zvol;	/* are we used by the zvol */
 	boolean_t	z_is_mapped;	/* are we mmap'ed */
 	boolean_t	z_is_ctldir;	/* are we .zfs entry */
 	boolean_t	z_is_stale;	/* are we stale due to rollback? */
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 937feac23148..4aad284dbe03 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -484,7 +484,6 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
 	zp->z_gid = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
-	zp->z_is_zvol = B_FALSE;
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_TRUE;
 	zp->z_is_sa = B_FALSE;
diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c
index 5064eb796b79..a3e9998146c3 100644
--- a/module/zfs/zfs_rlock.c
+++ b/module/zfs/zfs_rlock.c
@@ -96,14 +96,15 @@
  */
 
 #include <sys/zfs_rlock.h>
+#include <sys/zfs_znode.h>
 
 /*
  * Check if a write lock can be grabbed, or wait and recheck until available.
  */
 static void
-zfs_range_lock_writer(znode_t *zp, rl_t *new)
+zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new, znode_t *zp)
 {
-	avl_tree_t *tree = &zp->z_range_avl;
+	avl_tree_t *tree = &zrl->zr_avl;
 	rl_t *rl;
 	avl_index_t where;
 	uint64_t end_size;
@@ -112,17 +113,16 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
 
 	for (;;) {
 		/*
-		 * Range locking is also used by zvol and uses a
-		 * dummied up znode. However, for zvol, we don't need to
-		 * append or grow blocksize, and besides we don't have
-		 * a "sa" data or zfs_sb_t - so skip that processing.
+		 * Range locking is also used by zvol, which passes zp == NULL.
+		 * A zvol never needs to append or grow blocksize, so skip that
+		 * processing.
 		 *
 		 * Yes, this is ugly, and would be solved by not handling
 		 * grow or append in range lock code. If that was done then
 		 * we could make the range locking code generically available
 		 * to other non-zfs consumers.
 		 */
-		if (!zp->z_is_zvol) { /* caller is ZPL */
+		if (zp) { /* caller is ZPL */
 			/*
 			 * If in append mode pick up the current end of file.
 			 * This is done under z_range_lock to avoid races.
@@ -175,7 +175,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
 			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
 			rl->r_write_wanted = B_TRUE;
 		}
-		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+		cv_wait(&rl->r_wr_cv, &zrl->zr_mutex);
 
 		/* reset to original */
 		new->r_off = off;
@@ -353,9 +353,9 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
  * Check if a reader lock can be grabbed, or wait and recheck until available.
  */
 static void
-zfs_range_lock_reader(znode_t *zp, rl_t *new)
+zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new)
 {
-	avl_tree_t *tree = &zp->z_range_avl;
+	avl_tree_t *tree = &zrl->zr_avl;
 	rl_t *prev, *next;
 	avl_index_t where;
 	uint64_t off = new->r_off;
@@ -378,7 +378,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
 				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				prev->r_read_wanted = B_TRUE;
 			}
-			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+			cv_wait(&prev->r_rd_cv, &zrl->zr_mutex);
 			goto retry;
 		}
 		if (off + len < prev->r_off + prev->r_len)
@@ -401,7 +401,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
 				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				next->r_read_wanted = B_TRUE;
 			}
-			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+			cv_wait(&next->r_rd_cv, &zrl->zr_mutex);
 			goto retry;
 		}
 		if (off + len <= next->r_off + next->r_len)
@@ -421,16 +421,20 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
  * or exclusive (RL_WRITER). Returns the range lock structure
  * for later unlocking or reduce range (if entire file
  * previously locked as RL_WRITER).
+ *
+ * Filesystem should call zfs_range_lock.
+ * Zvol should call zvol_range_lock.
  */
 rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len,
+    rl_type_t type, znode_t *zp)
 {
 	rl_t *new;
 
 	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
 	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	new->r_zp = zp;
+	new->r_zrl = zrl;
 	new->r_off = off;
 	if (len + off < off)	/* overflow */
 		len = UINT64_MAX - off;
@@ -441,18 +445,18 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
 	new->r_write_wanted = B_FALSE;
 	new->r_read_wanted = B_FALSE;
 
-	mutex_enter(&zp->z_range_lock);
+	mutex_enter(&zrl->zr_mutex);
 	if (type == RL_READER) {
 		/*
 		 * First check for the usual case of no locks
 		 */
-		if (avl_numnodes(&zp->z_range_avl) == 0)
-			avl_add(&zp->z_range_avl, new);
+		if (avl_numnodes(&zrl->zr_avl) == 0)
+			avl_add(&zrl->zr_avl, new);
 		else
-			zfs_range_lock_reader(zp, new);
-	} else
-		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
-	mutex_exit(&zp->z_range_lock);
+			zfs_range_lock_reader(zrl, new);
+	} else /* RL_WRITER or RL_APPEND */
+		zfs_range_lock_writer(zrl, new, zp);
+	mutex_exit(&zrl->zr_mutex);
 	return (new);
 }
 
@@ -474,9 +478,9 @@ zfs_range_free(void *arg)
  * Unlock a reader lock
  */
 static void
-zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
+zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list)
 {
-	avl_tree_t *tree = &zp->z_range_avl;
+	avl_tree_t *tree = &zrl->zr_avl;
 	rl_t *rl, *next = NULL;
 	uint64_t len;
 
@@ -543,7 +547,7 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list)
 void
 zfs_range_unlock(rl_t *rl)
 {
-	znode_t *zp = rl->r_zp;
+	zfs_rlock_t *zrl = rl->r_zrl;
 	list_t free_list;
 	rl_t *free_rl;
 
@@ -552,10 +556,10 @@ zfs_range_unlock(rl_t *rl)
 	ASSERT(!rl->r_proxy);
 	list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node));
 
-	mutex_enter(&zp->z_range_lock);
+	mutex_enter(&zrl->zr_mutex);
 	if (rl->r_type == RL_WRITER) {
 		/* writer locks can't be shared or split */
-		avl_remove(&zp->z_range_avl, rl);
+		avl_remove(&zrl->zr_avl, rl);
 		if (rl->r_write_wanted)
 			cv_broadcast(&rl->r_wr_cv);
 
@@ -568,9 +572,9 @@ zfs_range_unlock(rl_t *rl)
 		 * lock may be shared, let zfs_range_unlock_reader()
 		 * release the zp->z_range_lock lock and free the rl_t
 		 */
-		zfs_range_unlock_reader(zp, rl, &free_list);
+		zfs_range_unlock_reader(zrl, rl, &free_list);
 	}
-	mutex_exit(&zp->z_range_lock);
+	mutex_exit(&zrl->zr_mutex);
 
 	while ((free_rl = list_head(&free_list)) != NULL) {
 		list_remove(&free_list, free_rl);
@@ -588,17 +592,17 @@ zfs_range_unlock(rl_t *rl)
 void
 zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
 {
-	znode_t *zp = rl->r_zp;
+	zfs_rlock_t *zrl = rl->r_zrl;
 
 	/* Ensure there are no other locks */
-	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
+	ASSERT(avl_numnodes(&zrl->zr_avl) == 1);
 	ASSERT(rl->r_off == 0);
 	ASSERT(rl->r_type == RL_WRITER);
 	ASSERT(!rl->r_proxy);
 	ASSERT3U(rl->r_len, ==, UINT64_MAX);
 	ASSERT3U(rl->r_cnt, ==, 1);
 
-	mutex_enter(&zp->z_range_lock);
+	mutex_enter(&zrl->zr_mutex);
 	rl->r_off = off;
 	rl->r_len = len;
 
@@ -607,7 +611,7 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
 	if (rl->r_read_wanted)
 		cv_broadcast(&rl->r_rd_cv);
 
-	mutex_exit(&zp->z_range_lock);
+	mutex_exit(&zrl->zr_mutex);
 }
 
 /*
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index 39b2ba07afd8..d9c132b1f7f7 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -113,9 +113,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 
-	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
-	avl_create(&zp->z_range_avl, zfs_range_compare,
-	    sizeof (rl_t), offsetof(rl_t, r_node));
+	zfs_rlock_init(&zp->z_range_lock);
 
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
@@ -137,8 +135,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
-	avl_destroy(&zp->z_range_avl);
-	mutex_destroy(&zp->z_range_lock);
+	zfs_rlock_destroy(&zp->z_range_lock);
 
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
@@ -615,7 +612,6 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz,
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_is_zvol = B_FALSE;
 	zp->z_is_mapped = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
 	zp->z_is_stale = B_FALSE;
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ba482a4740eb..7b4b331abe97 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -75,7 +75,7 @@ typedef struct zvol_state {
 	uint32_t		zv_open_count;	/* open counts */
 	uint32_t		zv_changed;	/* disk changed */
 	zilog_t			*zv_zilog;	/* ZIL handle */
-	znode_t			zv_znode;	/* for range locking */
+	zfs_rlock_t		zv_range_lock;	/* range lock */
 	dmu_buf_t		*zv_dbuf;	/* bonus handle */
 	dev_t			zv_dev;		/* device id */
 	struct gendisk		*zv_disk;	/* generic disk */
@@ -633,8 +633,8 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
-	    RL_WRITER);
+	rl = zvol_range_lock(&zv->zv_range_lock, uio->uio_loffset,
+	    uio->uio_resid, RL_WRITER);
 
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -660,7 +660,7 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
 		if (error)
 			break;
 	}
-	zfs_range_unlock(rl);
+	zvol_range_unlock(rl);
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	return (error);
@@ -725,7 +725,7 @@ zvol_discard(struct bio *bio)
 	if (start >= end)
 		return (0);
 
-	rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER);
+	rl = zvol_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);
 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
@@ -738,7 +738,7 @@ zvol_discard(struct bio *bio)
 		    ZVOL_OBJ, start, size);
 	}
 
-	zfs_range_unlock(rl);
+	zvol_range_unlock(rl);
 
 	return (error);
 }
@@ -752,8 +752,8 @@ zvol_read(zvol_state_t *zv, uio_t *uio)
 
 	ASSERT(zv && zv->zv_open_count > 0);
 
-	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
-	    RL_READER);
+	rl = zvol_range_lock(&zv->zv_range_lock, uio->uio_loffset,
+	    uio->uio_resid, RL_READER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
 
@@ -769,7 +769,7 @@ zvol_read(zvol_state_t *zv, uio_t *uio)
 			break;
 		}
 	}
-	zfs_range_unlock(rl);
+	zvol_range_unlock(rl);
 	return (error);
 }
 
@@ -850,7 +850,7 @@ zvol_get_done(zgd_t *zgd, int error)
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
-	zfs_range_unlock(zgd->zgd_rl);
+	zvol_range_unlock(zgd->zgd_rl);
 
 	if (error == 0 && zgd->zgd_bp)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
@@ -879,7 +879,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_zilog = zv->zv_zilog;
-	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+	zgd->zgd_rl = zvol_range_lock(&zv->zv_range_lock, offset, size,
+	    RL_READER);
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
@@ -1305,10 +1306,7 @@ zvol_alloc(dev_t dev, const char *name)
 	zv->zv_open_count = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
-	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
-	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
-	    sizeof (rl_t), offsetof(rl_t, r_node));
-	zv->zv_znode.z_is_zvol = TRUE;
+	zfs_rlock_init(&zv->zv_range_lock);
 
 	zv->zv_disk->major = zvol_major;
 	zv->zv_disk->first_minor = (dev & MINORMASK);
@@ -1337,8 +1335,7 @@ zvol_free(zvol_state_t *zv)
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 	ASSERT(zv->zv_open_count == 0);
 
-	avl_destroy(&zv->zv_znode.z_range_avl);
-	mutex_destroy(&zv->zv_znode.z_range_lock);
+	zfs_rlock_destroy(&zv->zv_range_lock);
 
 	zv->zv_disk->private_data = NULL;