From 69c252d54ffce9cd594ddc53a98957e91b307e50 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Thu, 3 Oct 2019 16:33:38 -0600 Subject: [PATCH] dbuf_hold_impl() cleanup to improve cached read performance Currently every dbuf_hold_impl() call incurs a kmem_alloc() and a kmem_free(), which can be costly for cached read performance. This change reverts the dbuf_hold_impl() stack-fix commit, i.e. fc5bb51f08a6c91ff9ad3559d0266eeeab0b1f61, to eliminate the extra kmem_alloc() and kmem_free() operations and improve cached read performance. With the change, each dbuf_hold_impl() frame uses 40 bytes more, for a total of 800 bytes over 20 recursive levels. Linux kernel stack sizes are 8K and 16K for 32bit and 64bit, respectively, so the stack overrun risk is limited. Sample stack output comparisons with a 50 PB file and recordsize=512 Current code 11) 2240 64 arc_alloc_buf+0x4a/0xd0 [zfs] 12) 2176 264 dbuf_read_impl.constprop.16+0x2e3/0x7f0 [zfs] 13) 1912 120 dbuf_read+0xe5/0x520 [zfs] 14) 1792 56 dbuf_hold_impl_arg+0x572/0x630 [zfs] 15) 1736 64 dbuf_hold_impl_arg+0x508/0x630 [zfs] 16) 1672 64 dbuf_hold_impl_arg+0x508/0x630 [zfs] 17) 1608 40 dbuf_hold_impl+0x23/0x40 [zfs] 18) 1568 40 dbuf_hold_level+0x32/0x60 [zfs] 19) 1528 16 dbuf_hold+0x16/0x20 [zfs] dbuf_hold_impl() cleanup 11) 2320 64 arc_alloc_buf+0x4a/0xd0 [zfs] 12) 2256 264 dbuf_read_impl.constprop.17+0x2e3/0x7f0 [zfs] 13) 1992 120 dbuf_read+0xe5/0x520 [zfs] 14) 1872 96 dbuf_hold_impl+0x50f/0x5e0 [zfs] 15) 1776 104 dbuf_hold_impl+0x4df/0x5e0 [zfs] 16) 1672 104 dbuf_hold_impl+0x4df/0x5e0 [zfs] 17) 1568 40 dbuf_hold_level+0x32/0x60 [zfs] 18) 1528 16 dbuf_hold+0x16/0x20 [zfs] Performance observations on an 8K recordsize filesystem: - 8/128/1024K at 1-128 sequential cached read, ~3% improvement Testing done on Ubuntu 18.04 with 4.15 kernel, 8 vCPUs and SSD storage on VMware ESX. Reviewed-by: Matt Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Tony Nguyen Closes #9351 Signed-off-by: Bryant G. 
Ly Conflicts: module/zfs/dbuf.c --- module/zfs/dbuf.c | 224 +++++++++++++++------------------------------- 1 file changed, 73 insertions(+), 151 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d514dcd37c74..bb014e02f03e 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -148,29 +148,6 @@ dbuf_stats_t dbuf_stats = { continue; \ } -typedef struct dbuf_hold_arg { - /* Function arguments */ - dnode_t *dh_dn; - uint8_t dh_level; - uint64_t dh_blkid; - boolean_t dh_fail_sparse; - boolean_t dh_fail_uncached; - void *dh_tag; - dmu_buf_impl_t **dh_dbp; - /* Local variables */ - dmu_buf_impl_t *dh_db; - dmu_buf_impl_t *dh_parent; - blkptr_t *dh_bp; - int dh_err; - dbuf_dirty_record_t *dh_dr; -} dbuf_hold_arg_t; - -static dbuf_hold_arg_t *dbuf_hold_arg_create(dnode_t *dn, uint8_t level, - uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp); -static int dbuf_hold_impl_arg(dbuf_hold_arg_t *dh); -static void dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh); - static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -2757,10 +2734,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; - dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level + 1, + + err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); - err = dbuf_hold_impl_arg(dh); - dbuf_hold_arg_destroy(dh); + if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -3138,24 +3115,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, zio_nowait(pio); } -#define DBUF_HOLD_IMPL_MAX_DEPTH 20 - /* - * Helper function for dbuf_hold_impl_arg() to copy a buffer. Handles + * Helper function for dbuf_hold_impl() to copy a buffer. 
Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * - * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl_arg(). + * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). */ noinline static void -dbuf_hold_copy(struct dbuf_hold_arg *dh) +dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { - dnode_t *dn = dh->dh_dn; - dmu_buf_impl_t *db = dh->dh_db; - dbuf_dirty_record_t *dr = dh->dh_dr; + dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; - enum zio_compress compress_type = arc_get_compression(data); if (arc_is_encrypted(data)) { @@ -3187,163 +3159,113 @@ dbuf_hold_copy(struct dbuf_hold_arg *dh) * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ -static int -dbuf_hold_impl_arg(struct dbuf_hold_arg *dh) +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp) { - dh->dh_parent = NULL; + dmu_buf_impl_t *db, *parent = NULL; - ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); - ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); + /* If the pool has been created, verify the tx_sync_lock is not held */ + spa_t *spa = dn->dn_objset->os_spa; + dsl_pool_t *dp = spa->spa_dsl_pool; + if (dp != NULL) { + ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); + } - *(dh->dh_dbp) = NULL; + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; /* dbuf_find() returns with db_mtx held */ - dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, - dh->dh_level, dh->dh_blkid); + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); - if (dh->dh_db == NULL) { - dh->dh_bp = NULL; + if (db == NULL) { + blkptr_t *bp = NULL; 
+ int err; - if (dh->dh_fail_uncached) + if (fail_uncached) return (SET_ERROR(ENOENT)); - ASSERT3P(dh->dh_parent, ==, NULL); - dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp); - if (dh->dh_fail_sparse) { - if (dh->dh_err == 0 && - dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) - dh->dh_err = SET_ERROR(ENOENT); - if (dh->dh_err) { - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); - return (dh->dh_err); + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = SET_ERROR(ENOENT); + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); } } - if (dh->dh_err && dh->dh_err != ENOENT) - return (dh->dh_err); - dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, - dh->dh_parent, dh->dh_bp); + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); } - if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { - mutex_exit(&dh->dh_db->db_mtx); + if (fail_uncached && db->db_state != DB_CACHED) { + mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } - if (dh->dh_db->db_buf != NULL) { - arc_buf_access(dh->dh_db->db_buf); - ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); + if (db->db_buf != NULL) { + arc_buf_access(db->db_buf); + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } - ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. 
*/ - if (dh->dh_db->db_level == 0 && - dh->dh_db->db_blkid != DMU_BONUS_BLKID && - dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && - dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { - dh->dh_dr = dh->dh_db->db_data_pending; - if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) - dbuf_hold_copy(dh); + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + if (dr->dt.dl.dr_data == db->db_buf) + dbuf_hold_copy(dn, db); } - if (multilist_link_active(&dh->dh_db->db_cache_link)) { - ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds)); - ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || - dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); - multilist_remove( - dbuf_caches[dh->dh_db->db_caching_status].cache, - dh->dh_db); + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( - &dbuf_caches[dh->dh_db->db_caching_status].size, - dh->dh_db->db.db_size, dh->dh_db); + &dbuf_caches[db->db_caching_status].size, + db->db.db_size, db); - if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) { + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { - DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]); + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level], - dh->dh_db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); } - dh->dh_db->db_caching_status = DB_NO_CACHE; + db->db_caching_status = DB_NO_CACHE; } - (void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag); - 
DBUF_VERIFY(dh->dh_db); - mutex_exit(&dh->dh_db->db_mtx); + (void) zfs_refcount_add(&db->db_holds, tag); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (dh->dh_parent) - dbuf_rele(dh->dh_parent, NULL); + if (parent) + dbuf_rele(parent, NULL); - ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); - ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); - ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); - *(dh->dh_dbp) = dh->dh_db; + ASSERT3P(DB_DNODE(db), ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; return (0); } -/* - * dbuf_hold_impl_arg() is called recursively, via dbuf_findbp(). There can - * be as many recursive calls as there are levels of on-disk indirect blocks, - * but typically only 0-2 recursive calls. To minimize the stack frame size, - * the recursive function's arguments and "local variables" are allocated on - * the heap as the dbuf_hold_arg_t. - */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level, blkid, - fail_sparse, fail_uncached, tag, dbp); - - int error = dbuf_hold_impl_arg(dh); - - dbuf_hold_arg_destroy(dh); - - return (error); -} - -static dbuf_hold_arg_t * -dbuf_hold_arg_create(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) -{ - dbuf_hold_arg_t *dh = kmem_alloc(sizeof (*dh), KM_SLEEP); - dh->dh_dn = dn; - dh->dh_level = level; - dh->dh_blkid = blkid; - - dh->dh_fail_sparse = fail_sparse; - dh->dh_fail_uncached = fail_uncached; - - dh->dh_tag = tag; - dh->dh_dbp = dbp; - - dh->dh_db = NULL; - dh->dh_parent = NULL; - dh->dh_bp = NULL; - dh->dh_err = 0; - dh->dh_dr = NULL; - - return (dh); -} - -static void -dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh) -{ - kmem_free(dh, sizeof (*dh)); -} - 
dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) {