Skip to content

Commit

Permalink
dbuf_hold_impl() cleanup to improve cached read performance
Browse files Browse the repository at this point in the history
Currently every dbuf_hold_impl() call incurs a kmem_alloc() and a
kmem_free(), which can be costly for cached read performance.

This change reverts the dbuf_hold_impl() stack-fix commit (fc5bb51) to
eliminate the extra kmem_alloc() and kmem_free() operations and improve
cached read performance. With this change, each dbuf_hold_impl() frame
uses 40 more bytes, for a total of 800 bytes across 20 recursive levels.
Linux kernel stack sizes are 8K and 16K for 32-bit and 64-bit,
respectively, so the risk of stack overrun is limited.

Sample stack output comparison with a 50 PB file and recordsize=512:
Current code
 11)     2240      64   arc_alloc_buf+0x4a/0xd0 [zfs]
 12)     2176     264   dbuf_read_impl.constprop.16+0x2e3/0x7f0 [zfs]
 13)     1912     120   dbuf_read+0xe5/0x520 [zfs]
 14)     1792      56   dbuf_hold_impl_arg+0x572/0x630 [zfs]
 15)     1736      64   dbuf_hold_impl_arg+0x508/0x630 [zfs]
 16)     1672      64   dbuf_hold_impl_arg+0x508/0x630 [zfs]
 17)     1608      40   dbuf_hold_impl+0x23/0x40 [zfs]
 18)     1568      40   dbuf_hold_level+0x32/0x60 [zfs]
 19)     1528      16   dbuf_hold+0x16/0x20 [zfs]

dbuf_hold_impl() cleanup
 11)     2320      64   arc_alloc_buf+0x4a/0xd0 [zfs]
 12)     2256     264   dbuf_read_impl.constprop.17+0x2e3/0x7f0 [zfs]
 13)     1992     120   dbuf_read+0xe5/0x520 [zfs]
 14)     1872      96   dbuf_hold_impl+0x50f/0x5e0 [zfs]
 15)     1776     104   dbuf_hold_impl+0x4df/0x5e0 [zfs]
 16)     1672     104   dbuf_hold_impl+0x4df/0x5e0 [zfs]
 17)     1568      40   dbuf_hold_level+0x32/0x60 [zfs]
 18)     1528      16   dbuf_hold+0x16/0x20 [zfs]

Performance observations on 8K recordsize filesystem:
- 8/128/1024K at 1-128 sequential cached read, ~3% improvement

Testing done on Ubuntu 18.04 with 4.15 kernel, 8vCPUs and SSD storage on
VMware ESX.

Reviewed-by: Matt Ahrens <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Tony Nguyen <[email protected]>
Closes #9351
  • Loading branch information
tonynguien authored and behlendorf committed Oct 3, 2019
1 parent 73cdcc6 commit 64b6c47
Showing 1 changed file with 69 additions and 154 deletions.
223 changes: 69 additions & 154 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,29 +148,6 @@ dbuf_stats_t dbuf_stats = {
continue; \
}

/*
 * State bundle for a single dbuf_hold_impl_arg() invocation.  It is
 * allocated and filled in by dbuf_hold_arg_create() and released by
 * dbuf_hold_arg_destroy(), so that neither the arguments nor the working
 * variables of the recursive hold path have to live in the stack frame.
 */
typedef struct dbuf_hold_arg {
/*
 * Function arguments: stored unchanged from the parameters given to
 * dbuf_hold_arg_create().
 */
dnode_t *dh_dn;
uint8_t dh_level;
uint64_t dh_blkid;
boolean_t dh_fail_sparse;
boolean_t dh_fail_uncached;
/* Tag handed to zfs_refcount_add() when the hold is taken. */
void *dh_tag;
/* Out parameter: set to NULL on entry and to the held dbuf on success. */
dmu_buf_impl_t **dh_dbp;
/*
 * Local variables: working state of dbuf_hold_impl_arg(), reset to
 * NULL/0 by dbuf_hold_arg_create().
 */
/* The dbuf returned by dbuf_find(), or created by dbuf_create(). */
dmu_buf_impl_t *dh_db;
/* Parent dbuf filled in by dbuf_findbp(); released with dbuf_rele(). */
dmu_buf_impl_t *dh_parent;
/* Block pointer filled in by dbuf_findbp(). */
blkptr_t *dh_bp;
/* Result of dbuf_findbp(). */
int dh_err;
/* Snapshot of dh_db->db_data_pending consumed by dbuf_hold_copy(). */
dbuf_dirty_record_t *dh_dr;
} dbuf_hold_arg_t;

static dbuf_hold_arg_t *dbuf_hold_arg_create(dnode_t *dn, uint8_t level,
uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
static int dbuf_hold_impl_arg(dbuf_hold_arg_t *dh);
static void dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh);

static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

Expand Down Expand Up @@ -2805,10 +2782,10 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
int err;
dbuf_hold_arg_t *dh = dbuf_hold_arg_create(dn, level + 1,

err = dbuf_hold_impl(dn, level + 1,
blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
err = dbuf_hold_impl_arg(dh);
dbuf_hold_arg_destroy(dh);

if (err)
return (err);
err = dbuf_read(*parentp, NULL,
Expand Down Expand Up @@ -3228,24 +3205,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
zio_nowait(pio);
}

#define DBUF_HOLD_IMPL_MAX_DEPTH 20

/*
* Helper function for dbuf_hold_impl_arg() to copy a buffer. Handles
* Helper function for dbuf_hold_impl() to copy a buffer. Handles
* the case of encrypted, compressed and uncompressed buffers by
* allocating the new buffer, respectively, with arc_alloc_raw_buf(),
* arc_alloc_compressed_buf() or arc_alloc_buf().
*
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl_arg().
* NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
*/
noinline static void
dbuf_hold_copy(struct dbuf_hold_arg *dh)
dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
{
dnode_t *dn = dh->dh_dn;
dmu_buf_impl_t *db = dh->dh_db;
dbuf_dirty_record_t *dr = dh->dh_dr;
dbuf_dirty_record_t *dr = db->db_data_pending;
arc_buf_t *data = dr->dt.dl.dr_data;

enum zio_compress compress_type = arc_get_compression(data);

if (arc_is_encrypted(data)) {
Expand Down Expand Up @@ -3277,170 +3249,113 @@ dbuf_hold_copy(struct dbuf_hold_arg *dh)
* Returns with db_holds incremented, and db_mtx not held.
* Note: dn_struct_rwlock must be held.
*/
static int
dbuf_hold_impl_arg(struct dbuf_hold_arg *dh)
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
dh->dh_parent = NULL;

ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);

*(dh->dh_dbp) = NULL;
dmu_buf_impl_t *db, *parent = NULL;

/* If the pool has been created, verify the tx_sync_lock is not held */
spa_t *spa = dh->dh_dn->dn_objset->os_spa;
spa_t *spa = dn->dn_objset->os_spa;
dsl_pool_t *dp = spa->spa_dsl_pool;
if (dp != NULL) {
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
}

ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);

*dbp = NULL;

/* dbuf_find() returns with db_mtx held */
dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
dh->dh_level, dh->dh_blkid);
db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);

if (dh->dh_db == NULL) {
dh->dh_bp = NULL;
if (db == NULL) {
blkptr_t *bp = NULL;
int err;

if (dh->dh_fail_uncached)
if (fail_uncached)
return (SET_ERROR(ENOENT));

ASSERT3P(dh->dh_parent, ==, NULL);
dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp);
if (dh->dh_fail_sparse) {
if (dh->dh_err == 0 &&
dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
dh->dh_err = SET_ERROR(ENOENT);
if (dh->dh_err) {
if (dh->dh_parent)
dbuf_rele(dh->dh_parent, NULL);
return (dh->dh_err);
ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
if (err == 0 && bp && BP_IS_HOLE(bp))
err = SET_ERROR(ENOENT);
if (err) {
if (parent)
dbuf_rele(parent, NULL);
return (err);
}
}
if (dh->dh_err && dh->dh_err != ENOENT)
return (dh->dh_err);
dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
dh->dh_parent, dh->dh_bp);
if (err && err != ENOENT)
return (err);
db = dbuf_create(dn, level, blkid, parent, bp);
}

if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) {
mutex_exit(&dh->dh_db->db_mtx);
if (fail_uncached && db->db_state != DB_CACHED) {
mutex_exit(&db->db_mtx);
return (SET_ERROR(ENOENT));
}

if (dh->dh_db->db_buf != NULL) {
arc_buf_access(dh->dh_db->db_buf);
ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
if (db->db_buf != NULL) {
arc_buf_access(db->db_buf);
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
}

ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

/*
* If this buffer is currently syncing out, and we are
* still referencing it from db_data, we need to make a copy
* of it in case we decide we want to dirty it again in this txg.
*/
if (dh->dh_db->db_level == 0 &&
dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
dh->dh_dr = dh->dh_db->db_data_pending;
if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf)
dbuf_hold_copy(dh);
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
if (dr->dt.dl.dr_data == db->db_buf)
dbuf_hold_copy(dn, db);
}

if (multilist_link_active(&dh->dh_db->db_cache_link)) {
ASSERT(zfs_refcount_is_zero(&dh->dh_db->db_holds));
ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE ||
dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE);
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
db->db_caching_status == DB_DBUF_METADATA_CACHE);

multilist_remove(
dbuf_caches[dh->dh_db->db_caching_status].cache,
dh->dh_db);
multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
(void) zfs_refcount_remove_many(
&dbuf_caches[dh->dh_db->db_caching_status].size,
dh->dh_db->db.db_size, dh->dh_db);
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);

if (dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE) {
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[dh->dh_db->db_level]);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[dh->dh_db->db_level],
dh->dh_db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
}
dh->dh_db->db_caching_status = DB_NO_CACHE;
db->db_caching_status = DB_NO_CACHE;
}
(void) zfs_refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
DBUF_VERIFY(dh->dh_db);
mutex_exit(&dh->dh_db->db_mtx);
(void) zfs_refcount_add(&db->db_holds, tag);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);

/* NOTE: we can't rele the parent until after we drop the db_mtx */
if (dh->dh_parent)
dbuf_rele(dh->dh_parent, NULL);
if (parent)
dbuf_rele(parent, NULL);

ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
*(dh->dh_dbp) = dh->dh_db;
ASSERT3P(DB_DNODE(db), ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
ASSERT3U(db->db_level, ==, level);
*dbp = db;

return (0);
}

/*
 * Entry point for taking a hold on a dbuf.
 *
 * The actual work is done by dbuf_hold_impl_arg(), which re-enters itself
 * through dbuf_findbp().  The recursion can go as deep as the number of
 * on-disk indirect block levels, although 0-2 nested calls is typical.
 * To keep each of those frames small, the arguments and the "local
 * variables" of the recursive function are carried in a heap-allocated
 * dbuf_hold_arg_t instead of on the stack.  This wrapper packs the
 * arguments into that structure, runs the worker, and frees the structure
 * again before handing the worker's result back to the caller.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
    boolean_t fail_sparse, boolean_t fail_uncached,
    void *tag, dmu_buf_impl_t **dbp)
{
	dbuf_hold_arg_t *args;
	int rc;

	args = dbuf_hold_arg_create(dn, level, blkid, fail_sparse,
	    fail_uncached, tag, dbp);
	rc = dbuf_hold_impl_arg(args);
	dbuf_hold_arg_destroy(args);

	return (rc);
}

/*
 * Allocate (KM_SLEEP) a dbuf_hold_arg_t and prepare it for one call to
 * dbuf_hold_impl_arg(): the caller's arguments are stored as given and the
 * working fields are cleared.  The returned structure must be released
 * with dbuf_hold_arg_destroy().
 */
static dbuf_hold_arg_t *
dbuf_hold_arg_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    boolean_t fail_sparse, boolean_t fail_uncached,
    void *tag, dmu_buf_impl_t **dbp)
{
	dbuf_hold_arg_t *args;

	args = kmem_alloc(sizeof (dbuf_hold_arg_t), KM_SLEEP);

	/* Working state starts out empty. */
	args->dh_err = 0;
	args->dh_db = NULL;
	args->dh_parent = NULL;
	args->dh_bp = NULL;
	args->dh_dr = NULL;

	/* Which block to hold. */
	args->dh_dn = dn;
	args->dh_level = level;
	args->dh_blkid = blkid;

	/* How to hold it, and where to report the result. */
	args->dh_fail_sparse = fail_sparse;
	args->dh_fail_uncached = fail_uncached;
	args->dh_tag = tag;
	args->dh_dbp = dbp;

	return (args);
}

/*
 * Release a dbuf_hold_arg_t obtained from dbuf_hold_arg_create().  Only the
 * structure itself is freed; nothing it points to is touched.
 */
static void
dbuf_hold_arg_destroy(dbuf_hold_arg_t *dh)
{
	kmem_free(dh, sizeof (dbuf_hold_arg_t));
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
Expand Down

0 comments on commit 64b6c47

Please sign in to comment.