From 2a940ed6bb40ccfda4dfe53494d5a469f8b8bfd7 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Mon, 11 Apr 2016 14:53:48 -0700 Subject: [PATCH] Remove dummy znode from zvol_state struct zvol_state contains a dummy znode, which is around 1KB on x64, only for zfs_range_lock. But in reality, other than z_range_lock and z_range_avl, zfs_range_lock only needs a znode for regular files, which means we add 1KB to the structure and gain nothing. In this patch, we remove the dummy znode from zvol_state. In order to do that, we also need to refactor zfs_range_lock a bit. We move the z_range_lock and z_range_avl pair out of znode_t to form zfs_rlock_t. This new struct replaces znode_t as the main handle inside the range lock functions. Since regular files still need a znode for RL_WRITER and RL_APPEND, we make znode an optional argument in zfs_range_lock_impl. To reduce possible merge conflicts, we retain the prototype of zfs_range_lock. zvol should now call zvol_range_lock instead. Signed-off-by: Chunwei Chen --- include/sys/zfs_rlock.h | 35 +++++++++++++++++++-- include/sys/zfs_znode.h | 5 ++- module/zfs/zfs_ctldir.c | 1 - module/zfs/zfs_rlock.c | 68 ++++++++++++++++++++++------------------- module/zfs/zfs_znode.c | 8 ++--- module/zfs/zvol.c | 31 +++++++++---------- 6 files changed, 86 insertions(+), 62 deletions(-) diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h index ea5e40369291..cbc573a40cde 100644 --- a/include/sys/zfs_rlock.h +++ b/include/sys/zfs_rlock.h @@ -32,7 +32,9 @@ extern "C" { #ifdef _KERNEL -#include +#include +#include +#include typedef enum { RL_READER, @@ -40,8 +42,13 @@ typedef enum { RL_APPEND } rl_type_t; +typedef struct zfs_rlock { + kmutex_t zr_mutex; /* protects changes to zr_avl */ + avl_tree_t zr_avl; /* avl tree of range locks */ +} zfs_rlock_t; + typedef struct rl { - znode_t *r_zp; /* znode this lock applies to */ + zfs_rlock_t *r_zrl; avl_node_t r_node; /* avl node link */ uint64_t r_off; /* file range offset */ uint64_t r_len; /* file 
range length */ @@ -55,16 +62,26 @@ typedef struct rl { list_node_t rl_node; /* used for deferred release */ } rl_t; +struct znode; /* * Lock a range (offset, length) as either shared (RL_READER) * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that * is converted to RL_WRITER that specified to lock from the start of the * end of file. Returns the range lock structure. + * + * Filesystem should call zfs_range_lock. + * Zvol should call zvol_range_lock. */ -rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); +rl_t *zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len, + rl_type_t type, struct znode *zp); +#define zfs_range_lock(zp, off, len, type) \ + zfs_range_lock_impl(&(zp)->z_range_lock, off, len, type, zp) +#define zvol_range_lock(zrl, off, len, type) \ + zfs_range_lock_impl(zrl, off, len, type, NULL) /* Unlock range and destroy range lock structure. */ void zfs_range_unlock(rl_t *rl); +#define zvol_range_unlock(rl) zfs_range_unlock(rl) /* * Reduce range locked as RW_WRITER from whole file to specified range. 
@@ -78,6 +95,18 @@ void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); */ int zfs_range_compare(const void *arg1, const void *arg2); +static inline void zfs_rlock_init(zfs_rlock_t *zrl) +{ + mutex_init(&zrl->zr_mutex, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zrl->zr_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); +} + +static inline void zfs_rlock_destroy(zfs_rlock_t *zrl) +{ + avl_destroy(&zrl->zr_avl); + mutex_destroy(&zrl->zr_mutex); +} #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 65fa1039926f..30a6ec9f807a 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -37,6 +37,7 @@ #include #include #include +#include #endif #include #include @@ -187,8 +188,7 @@ typedef struct znode { krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ - kmutex_t z_range_lock; /* protects changes to z_range_avl */ - avl_tree_t z_range_avl; /* avl tree of file range locks */ + zfs_rlock_t z_range_lock; /* file range lock */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ @@ -212,7 +212,6 @@ typedef struct znode { list_node_t z_link_node; /* all znodes in fs link */ sa_handle_t *z_sa_hdl; /* handle to sa data */ boolean_t z_is_sa; /* are we native sa? */ - boolean_t z_is_zvol; /* are we used by the zvol */ boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ boolean_t z_is_stale; /* are we stale due to rollback? 
*/ diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 937feac23148..4aad284dbe03 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -484,7 +484,6 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id, zp->z_gid = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; - zp->z_is_zvol = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; zp->z_is_sa = B_FALSE; diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index 5064eb796b79..a3e9998146c3 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -96,14 +96,15 @@ */ #include +#include /* * Check if a write lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_writer(znode_t *zp, rl_t *new) +zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new, znode_t *zp) { - avl_tree_t *tree = &zp->z_range_avl; + avl_tree_t *tree = &zrl->zr_avl; rl_t *rl; avl_index_t where; uint64_t end_size; @@ -112,17 +113,16 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) for (;;) { /* - * Range locking is also used by zvol and uses a - * dummied up znode. However, for zvol, we don't need to - * append or grow blocksize, and besides we don't have - * a "sa" data or zfs_sb_t - so skip that processing. + * Range locking is also used by zvol. However, for zvol, we + * don't need to append or grow blocksize, so skip that + * processing. * * Yes, this is ugly, and would be solved by not handling * grow or append in range lock code. If that was done then * we could make the range locking code generically available * to other non-zfs consumers. */ - if (!zp->z_is_zvol) { /* caller is ZPL */ + if (zp) { /* caller is ZPL */ /* * If in append mode pick up the current end of file. * This is done under z_range_lock to avoid races. 
@@ -175,7 +175,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); rl->r_write_wanted = B_TRUE; } - cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + cv_wait(&rl->r_wr_cv, &zrl->zr_mutex); /* reset to original */ new->r_off = off; @@ -353,9 +353,9 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) * Check if a reader lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_reader(znode_t *zp, rl_t *new) +zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new) { - avl_tree_t *tree = &zp->z_range_avl; + avl_tree_t *tree = &zrl->zr_avl; rl_t *prev, *next; avl_index_t where; uint64_t off = new->r_off; @@ -378,7 +378,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); prev->r_read_wanted = B_TRUE; } - cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + cv_wait(&prev->r_rd_cv, &zrl->zr_mutex); goto retry; } if (off + len < prev->r_off + prev->r_len) @@ -401,7 +401,7 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); next->r_read_wanted = B_TRUE; } - cv_wait(&next->r_rd_cv, &zp->z_range_lock); + cv_wait(&next->r_rd_cv, &zrl->zr_mutex); goto retry; } if (off + len <= next->r_off + next->r_len) @@ -421,16 +421,20 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) * or exclusive (RL_WRITER). Returns the range lock structure * for later unlocking or reduce range (if entire file * previously locked as RL_WRITER). + * + * Filesystem should call zfs_range_lock. + * Zvol should call zvol_range_lock. 
*/ rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +zfs_range_lock_impl(zfs_rlock_t *zrl, uint64_t off, uint64_t len, + rl_type_t type, znode_t *zp) { rl_t *new; ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); new = kmem_alloc(sizeof (rl_t), KM_SLEEP); - new->r_zp = zp; + new->r_zrl = zrl; new->r_off = off; if (len + off < off) /* overflow */ len = UINT64_MAX - off; @@ -441,18 +445,18 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) new->r_write_wanted = B_FALSE; new->r_read_wanted = B_FALSE; - mutex_enter(&zp->z_range_lock); + mutex_enter(&zrl->zr_mutex); if (type == RL_READER) { /* * First check for the usual case of no locks */ - if (avl_numnodes(&zp->z_range_avl) == 0) - avl_add(&zp->z_range_avl, new); + if (avl_numnodes(&zrl->zr_avl) == 0) + avl_add(&zrl->zr_avl, new); else - zfs_range_lock_reader(zp, new); - } else - zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - mutex_exit(&zp->z_range_lock); + zfs_range_lock_reader(zrl, new); + } else /* RL_WRITER or RL_APPEND */ + zfs_range_lock_writer(zrl, new, zp); + mutex_exit(&zrl->zr_mutex); return (new); } @@ -474,9 +478,9 @@ zfs_range_free(void *arg) * Unlock a reader lock */ static void -zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list) +zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list) { - avl_tree_t *tree = &zp->z_range_avl; + avl_tree_t *tree = &zrl->zr_avl; rl_t *rl, *next = NULL; uint64_t len; @@ -543,7 +547,7 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list) void zfs_range_unlock(rl_t *rl) { - znode_t *zp = rl->r_zp; + zfs_rlock_t *zrl = rl->r_zrl; list_t free_list; rl_t *free_rl; @@ -552,10 +556,10 @@ zfs_range_unlock(rl_t *rl) ASSERT(!rl->r_proxy); list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node)); - mutex_enter(&zp->z_range_lock); + mutex_enter(&zrl->zr_mutex); if (rl->r_type == RL_WRITER) { /* writer locks can't be shared or split 
*/ - avl_remove(&zp->z_range_avl, rl); + avl_remove(&zrl->zr_avl, rl); if (rl->r_write_wanted) cv_broadcast(&rl->r_wr_cv); @@ -568,9 +572,9 @@ zfs_range_unlock(rl_t *rl) * lock may be shared, let zfs_range_unlock_reader() * release the zp->z_range_lock lock and free the rl_t */ - zfs_range_unlock_reader(zp, rl, &free_list); + zfs_range_unlock_reader(zrl, rl, &free_list); } - mutex_exit(&zp->z_range_lock); + mutex_exit(&zrl->zr_mutex); while ((free_rl = list_head(&free_list)) != NULL) { list_remove(&free_list, free_rl); @@ -588,17 +592,17 @@ zfs_range_unlock(rl_t *rl) void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) { - znode_t *zp = rl->r_zp; + zfs_rlock_t *zrl = rl->r_zrl; /* Ensure there are no other locks */ - ASSERT(avl_numnodes(&zp->z_range_avl) == 1); + ASSERT(avl_numnodes(&zrl->zr_avl) == 1); ASSERT(rl->r_off == 0); ASSERT(rl->r_type == RL_WRITER); ASSERT(!rl->r_proxy); ASSERT3U(rl->r_len, ==, UINT64_MAX); ASSERT3U(rl->r_cnt, ==, 1); - mutex_enter(&zp->z_range_lock); + mutex_enter(&zrl->zr_mutex); rl->r_off = off; rl->r_len = len; @@ -607,7 +611,7 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) if (rl->r_read_wanted) cv_broadcast(&rl->r_rd_cv); - mutex_exit(&zp->z_range_lock); + mutex_exit(&zrl->zr_mutex); } /* diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 39b2ba07afd8..d9c132b1f7f7 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -113,9 +113,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zp->z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); + zfs_rlock_init(&zp->z_range_lock); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; @@ -137,8 +135,7 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); 
rw_destroy(&zp->z_xattr_lock); - avl_destroy(&zp->z_range_avl); - mutex_destroy(&zp->z_range_lock); + zfs_rlock_destroy(&zp->z_range_lock); ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); @@ -615,7 +612,6 @@ zfs_znode_alloc(zfs_sb_t *zsb, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_is_zvol = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_is_stale = B_FALSE; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index ba482a4740eb..7b4b331abe97 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -75,7 +75,7 @@ typedef struct zvol_state { uint32_t zv_open_count; /* open counts */ uint32_t zv_changed; /* disk changed */ zilog_t *zv_zilog; /* ZIL handle */ - znode_t zv_znode; /* for range locking */ + zfs_rlock_t zv_range_lock; /* range lock */ dmu_buf_t *zv_dbuf; /* bonus handle */ dev_t zv_dev; /* device id */ struct gendisk *zv_disk; /* generic disk */ @@ -633,8 +633,8 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync) ASSERT(zv && zv->zv_open_count > 0); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); + rl = zvol_range_lock(&zv->zv_range_lock, uio->uio_loffset, + uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); @@ -660,7 +660,7 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync) if (error) break; } - zfs_range_unlock(rl); + zvol_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); return (error); @@ -725,7 +725,7 @@ zvol_discard(struct bio *bio) if (start >= end) return (0); - rl = zfs_range_lock(&zv->zv_znode, start, size, RL_WRITER); + rl = zvol_range_lock(&zv->zv_range_lock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); @@ -738,7 +738,7 @@ zvol_discard(struct bio *bio) ZVOL_OBJ, start, size); } - zfs_range_unlock(rl); + 
zvol_range_unlock(rl); return (error); } @@ -752,8 +752,8 @@ zvol_read(zvol_state_t *zv, uio_t *uio) ASSERT(zv && zv->zv_open_count > 0); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); + rl = zvol_range_lock(&zv->zv_range_lock, uio->uio_loffset, + uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); @@ -769,7 +769,7 @@ zvol_read(zvol_state_t *zv, uio_t *uio) break; } } - zfs_range_unlock(rl); + zvol_range_unlock(rl); return (error); } @@ -850,7 +850,7 @@ zvol_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - zfs_range_unlock(zgd->zgd_rl); + zvol_range_unlock(zgd->zgd_rl); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); @@ -879,7 +879,8 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); + zgd->zgd_rl = zvol_range_lock(&zv->zv_range_lock, offset, size, + RL_READER); /* * Write records come in two flavors: immediate and indirect. @@ -1305,10 +1306,7 @@ zvol_alloc(dev_t dev, const char *name) zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); - mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); - zv->zv_znode.z_is_zvol = TRUE; + zfs_rlock_init(&zv->zv_range_lock); zv->zv_disk->major = zvol_major; zv->zv_disk->first_minor = (dev & MINORMASK); @@ -1337,8 +1335,7 @@ zvol_free(zvol_state_t *zv) ASSERT(MUTEX_HELD(&zvol_state_lock)); ASSERT(zv->zv_open_count == 0); - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); + zfs_rlock_destroy(&zv->zv_range_lock); zv->zv_disk->private_data = NULL;