Skip to content

Commit

Permalink
Consider dnode_t allocations in dbuf cache size accounting
Browse files Browse the repository at this point in the history
Entries in the dbuf cache contribute only the size of the dbuf data to
the cache size. Attached "user" data is not counted. This can lead to
the data currently "owned" by the cache consuming more memory than the
accounting appears to show. In some cases (e.g. a metadnode data block
with all child dnode_t slots allocated), the actual size can be as much
as 3x what the cache believes it to be.

This is arguably correct behaviour, as the cache is only tracking the
size of the dbuf data, not even the overhead of the dbuf_t. On the other
hand, in the above case of dnodes, evicting cached metadnode dbufs is
the only current way to reclaim the dnode objects, and can lead to the
situation where the dbuf cache appears to be comfortably within its
target memory window and yet is holding enormous amounts of slab memory
that cannot be reclaimed.

This commit adds a facility for a dbuf user to artificially inflate the
apparent size of the dbuf for caching purposes. This at least allows for
cache tuning to be adjusted to match something closer to the real memory
overhead.

metadnode dbufs carry a >1KiB allocation per dnode in their user data.
This informs the dbuf cache machinery of that fact, allowing it to make
better decisions when evicting dbufs.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Closes openzfs#15511
robn authored Nov 17, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent 6c6fae6 commit 92dc4ad
Showing 5 changed files with 102 additions and 26 deletions.
20 changes: 11 additions & 9 deletions cmd/dbufstat.in
Original file line number Diff line number Diff line change
@@ -37,7 +37,7 @@ import re

bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
"meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
"l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
"data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
@@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"]
dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
"indirect", "bonus", "spill"]
dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds"]
dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state",
"dbholds", "dbc", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds"]

thdr = ["pool", "objset", "dtype", "cached"]
txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
"bonus", "spill"]
tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
"dbc", "dbholds", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta",
"state", "dbc", "dbholds", "list", "atype", "flags", "count",
"asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize"]

@@ -70,6 +70,7 @@ cols = {
"blkid": [8, -1, "block number of buffer"],
"offset": [12, 1024, "offset in object of buffer"],
"dbsize": [7, 1024, "size of buffer"],
"usize": [7, 1024, "size of attached user data"],
"meta": [4, -1, "is this buffer metadata?"],
"state": [5, -1, "state of buffer (read, cached, etc)"],
"dbholds": [7, 1000, "number of holds on buffer"],
@@ -399,6 +400,7 @@ def update_dict(d, k, line, labels):
key = line[labels[k]]

dbsize = int(line[labels['dbsize']])
usize = int(line[labels['usize']])
blkid = int(line[labels['blkid']])
level = int(line[labels['level']])

@@ -416,7 +418,7 @@ def update_dict(d, k, line, labels):
d[pool][objset][key]['indirect'] = 0
d[pool][objset][key]['spill'] = 0

d[pool][objset][key]['cached'] += dbsize
d[pool][objset][key]['cached'] += dbsize + usize

if blkid == -1:
d[pool][objset][key]['bonus'] += dbsize
13 changes: 13 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
@@ -652,6 +652,9 @@ typedef struct dmu_buf_user {
*/
taskq_ent_t dbu_tqent;

/* Size of user data, for inclusion in dbuf_cache accounting. */
uint64_t dbu_size;

/*
* This instance's eviction function pointers.
*
@@ -733,6 +736,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db,
*/
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
* User data size accounting. This can be used to artificially inflate the size
* of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough
* to satisfy memory reclaim requests. It's not used for anything else, and
* defaults to 0.
*/
uint64_t dmu_buf_user_size(dmu_buf_t *db);
void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd);
void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);

/*
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
63 changes: 55 additions & 8 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
@@ -569,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

if (db->db_caching_status != DB_NO_CACHE) {
/*
* This is a cached dbuf, so the size of the user data is
* included in its cached amount. We adjust it here because the
* user data has already been detached from the dbuf, and the
* sync functions are not supposed to touch it (the dbuf might
* not exist anymore by the time the sync functions run).
*/
uint64_t size = dbu->dbu_size;
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_CACHE)
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}

/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
@@ -770,12 +785,12 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
@@ -3002,6 +3017,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_caching_status == DB_DBUF_METADATA_CACHE);

multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);

ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
@@ -3749,17 +3766,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_caching_status == DB_DBUF_METADATA_CACHE);

multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);

uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
&dbuf_caches[db->db_caching_status].size, size, db);

if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
@@ -3978,7 +3995,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
db->db_caching_status = dcs;

multilist_insert(&dbuf_caches[dcs].cache, db);
uint64_t db_size = db->db.db_size;
uint64_t db_size = db->db.db_size +
dmu_buf_user_size(&db->db);
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
@@ -4074,6 +4092,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user);
}

/*
 * Return the user data size recorded for this dbuf, or 0 when no user is
 * attached. The value is read atomically from the attached user's dbu_size.
 */
uint64_t
dmu_buf_user_size(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;

	return (dbi->db_user != NULL ?
	    atomic_load_64(&dbi->db_user->dbu_size) : 0);
}

/*
 * Grow the user data size recorded for this dbuf by nadd bytes.
 *
 * The asserts require that the dbuf is not currently in a dbuf cache
 * (db_caching_status is DB_NO_CACHE), that a user is attached, and that the
 * addition cannot overflow the 64-bit counter.
 */
void
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;

	ASSERT3U(dbi->db_caching_status, ==, DB_NO_CACHE);
	ASSERT3P(dbi->db_user, !=, NULL);

	/* Only take the counter's address once the user is known to exist. */
	uint64_t *sizep = &dbi->db_user->dbu_size;

	ASSERT3U(atomic_load_64(sizep), <, UINT64_MAX - nadd);
	atomic_add_64(sizep, nadd);
}

/*
 * Shrink the user data size recorded for this dbuf by nsub bytes.
 *
 * The asserts require that the dbuf is not currently in a dbuf cache
 * (db_caching_status is DB_NO_CACHE), that a user is attached, and that the
 * recorded size is at least nsub so the counter cannot underflow.
 */
void
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;

	ASSERT3U(dbi->db_caching_status, ==, DB_NO_CACHE);
	ASSERT3P(dbi->db_user, !=, NULL);

	/* Only take the counter's address once the user is known to exist. */
	uint64_t *sizep = &dbi->db_user->dbu_size;

	ASSERT3U(atomic_load_64(sizep), >=, nsub);
	atomic_sub_64(sizep, nsub);
}

void
dmu_buf_user_evict_wait(void)
{
13 changes: 7 additions & 6 deletions module/zfs/dbuf_stats.c
Original file line number Diff line number Diff line change
@@ -46,14 +46,14 @@ static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
(void) snprintf(buf, size,
"%-96s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
"%-105s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
"%-5s %-5s %-9s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
"blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
"list", "atype", "flags", "count", "asize", "access",
"blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
@@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
__dmu_object_info_from_dnode(dn, &doi);

nwritten = snprintf(buf, size,
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
"%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
"%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
@@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
(longlong_t)db->db_blkid,
(u_longlong_t)db->db.db_offset,
(u_longlong_t)db->db.db_size,
(u_longlong_t)dmu_buf_user_size(&db->db),
!!dbuf_is_metadata(db),
db->db_state,
(ulong_t)zfs_refcount_count(&db->db_holds),
19 changes: 16 additions & 3 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
@@ -1237,9 +1237,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
return (B_TRUE);
}

static void
static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
uint_t reclaimed = 0;

ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);

for (int i = idx; i < idx + slots; i++) {
@@ -1251,8 +1253,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
reclaimed++;
}
}

return (reclaimed);
}

void
@@ -1565,6 +1570,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db,
sizeof (dnode_t));
}
}

@@ -1622,15 +1629,21 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
if (slots > 1)
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (slots > 1) {
uint_t reclaimed =
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (reclaimed > 0)
dmu_buf_sub_user_size(&db->db,
reclaimed * sizeof (dnode_t));
}

dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
dn = dnh->dnh_dnode;
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}

mutex_enter(&dn->dn_mtx);

0 comments on commit 92dc4ad

Please sign in to comment.