Dynamically size dbuf hash mutex array
Incorrectly sizing the array of hash locks used to protect the
dbuf hash table can lead to contention and reduce performance.
We could unconditionally allocate a larger array for the locks
but it's wasteful, particularly for a low-memory system.
Instead, dynamically allocate the array of locks and scale
it based on total system memory.

Additionally, add a new `dbuf_mutex_cache_shift` module option
which can be used to override the hash lock array size.  This is
disabled by default (dbuf_mutex_cache_shift=0) and can only be
set at module load time.  The minimum target array size is set
to 8192, which matches the current constant value.
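
For reference, a condensed sketch of the sizing policy introduced below
in dbuf_init(); the helper function is illustrative only and does not
exist in the tree:

    #include <stdint.h>

    /*
     * Illustrative sketch of the mutex-array sizing policy; this is not
     * a function in the tree, just the same arithmetic in one place.
     * hsize is the dbuf hash table size already derived from total
     * system memory.
     */
    static uint64_t
    dbuf_mutex_array_size(uint64_t hsize, uint32_t dbuf_mutex_cache_shift)
    {
            if (dbuf_mutex_cache_shift == 0) {
                    /* One mutex per 128 buckets, but never fewer than 8192. */
                    uint64_t hmsize = hsize >> 7;
                    return (hmsize > (1ULL << 13) ? hmsize : (1ULL << 13));
            }
            /* An explicit shift overrides the heuristic, capped at 1 << 24. */
            return (1ULL << (dbuf_mutex_cache_shift < 24 ?
                dbuf_mutex_cache_shift : 24));
    }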

Note that the entry counts of the dbuf hash table and of the
mutex array were added to the /proc/spl/kstat/zfs/dbufstats
kstat.

Finally, this change removes the _KERNEL conditional checks.
These were not required, since in the user space build there
is no difference between the kmem and vmem interfaces.
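
A minimal sketch of why that holds, assuming the usual shape of the
libzpool user-space compatibility header (the macro forms below are an
assumption for illustration, not quoted from this change):

    /*
     * Assumed user-space shim: with no kernel vmalloc/kmalloc
     * distinction, the vmem_* interface simply aliases kmem_*.
     */
    #define vmem_alloc(sz, fl)      kmem_alloc(sz, fl)
    #define vmem_zalloc(sz, fl)     kmem_zalloc(sz, fl)
    #define vmem_free(ptr, sz)      kmem_free(ptr, sz)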

Reviewed-by: Ryan Moeller <[email protected]>
Reviewed-by: Richard Yao <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes openzfs#13928
behlendorf authored and snajpa committed Oct 22, 2022
1 parent cfb6524 commit 259511a
Showing 3 changed files with 65 additions and 36 deletions.
9 changes: 5 additions & 4 deletions include/sys/dbuf.h
@@ -321,13 +321,14 @@ typedef struct dmu_buf_impl {
uint8_t db_dirtycnt;
} dmu_buf_impl_t;

/* Note: the dbuf hash table is exposed only for the mdb module */
#define DBUF_MUTEXES 2048
#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
#define DBUF_HASH_MUTEX(h, idx) \
(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])

typedef struct dbuf_hash_table {
uint64_t hash_table_mask;
uint64_t hash_mutex_mask;
dmu_buf_impl_t **hash_table;
kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
kmutex_t *hash_mutexes;
} dbuf_hash_table_t;

typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
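
For context, a sketch of how callers use the reworked macro (modeled on
the existing hash lookup path; the surrounding logic is elided):

    uint64_t hv = dbuf_hash(os, obj, level, blkid);
    uint64_t idx = hv & h->hash_table_mask;

    /*
     * The macro masks idx again with hash_mutex_mask, so one mutex
     * covers every bucket that shares the low mutex-mask bits.
     */
    mutex_enter(DBUF_HASH_MUTEX(h, idx));
    for (dmu_buf_impl_t *db = h->hash_table[idx]; db != NULL;
        db = db->db_hash_next) {
            /* compare (os, obj, level, blkid) and return a match here */
    }
    mutex_exit(DBUF_HASH_MUTEX(h, idx));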
6 changes: 6 additions & 0 deletions man/man4/zfs.4
@@ -66,6 +66,12 @@ Set the size of the dbuf metadata cache
.Pq Sy dbuf_metadata_cache_max_bytes
to a log2 fraction of the target ARC size.
.
.It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
Set the size of the mutex array for the dbuf cache.
When set to
.Sy 0
the array is dynamically sized based on total system memory.
.
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq int
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
86 changes: 54 additions & 32 deletions module/zfs/dbuf.c
@@ -98,6 +98,11 @@ typedef struct dbuf_stats {
* already created and in the dbuf hash table.
*/
kstat_named_t hash_insert_race;
/*
* Number of entries in the hash table dbuf and mutex arrays.
*/
kstat_named_t hash_table_count;
kstat_named_t hash_mutex_count;
/*
* Statistics about the size of the metadata dbuf cache.
*/
@@ -130,6 +135,8 @@ dbuf_stats_t dbuf_stats = {
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 },
{ "hash_table_count", KSTAT_DATA_UINT64 },
{ "hash_mutex_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
@@ -231,6 +238,9 @@ unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
int dbuf_cache_shift = 5;
int dbuf_metadata_cache_shift = 6;

/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
static uint32_t dbuf_mutex_cache_shift = 0;

static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);

@@ -780,6 +790,7 @@ static int
dbuf_kstat_update(kstat_t *ksp, int rw)
{
dbuf_stats_t *ds = ksp->ks_data;
dbuf_hash_table_t *h = &dbuf_hash_table;

if (rw == KSTAT_WRITE)
return (SET_ERROR(EACCES));
@@ -809,6 +820,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&dbuf_sums.hash_chains);
ds->hash_insert_race.value.ui64 =
wmsum_value(&dbuf_sums.hash_insert_race);
ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
ds->metadata_cache_count.value.ui64 =
wmsum_value(&dbuf_sums.metadata_cache_count);
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
@@ -821,9 +834,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
void
dbuf_init(void)
{
uint64_t hsize = 1ULL << 16;
uint64_t hmsize, hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;

/*
* The hash table is big enough to fill one eighth of physical memory
@@ -834,29 +846,42 @@ dbuf_init(void)
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
hsize <<= 1;

retry:
h->hash_table_mask = hsize - 1;
#if defined(_KERNEL)
h->hash_table = NULL;
while (h->hash_table == NULL) {
h->hash_table_mask = hsize - 1;

h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
if (h->hash_table == NULL)
hsize >>= 1;

ASSERT3U(hsize, >=, 1ULL << 10);
}

/*
* Large allocations which do not require contiguous pages
* should be using vmem_alloc() in the linux kernel
* The hash table buckets are protected by an array of mutexes where
* each mutex is responsible for protecting 128 buckets. A minimum
* array size of 8192 is targeted to avoid contention.
*/
h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
#else
h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
#endif
if (h->hash_table == NULL) {
/* XXX - we should really return an error instead of assert */
ASSERT(hsize > (1ULL << 10));
hsize >>= 1;
goto retry;
if (dbuf_mutex_cache_shift == 0)
hmsize = MAX(hsize >> 7, 1ULL << 13);
else
hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);

h->hash_mutexes = NULL;
while (h->hash_mutexes == NULL) {
h->hash_mutex_mask = hmsize - 1;

h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
KM_SLEEP);
if (h->hash_mutexes == NULL)
hmsize >>= 1;
}

dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

for (i = 0; i < DBUF_MUTEXES; i++)
for (int i = 0; i < hmsize; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

dbuf_stats_init(h);
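
To make the heuristic above concrete, a worked example assuming the
default zfs_arc_average_blocksize of 8192 bytes (both inputs are
tunable, so treat the figures as illustrative):

    /*
     * System with arc_all_memory() of roughly 64 GiB:
     *   target      = 64 GiB / 8 = 8 GiB
     *   hash table  : hsize doubles from 1 << 16 until
     *                 hsize * 8192 >= 8 GiB     ->  hsize  = 1 << 20 buckets
     *   mutex array : MAX(hsize >> 7, 1 << 13)  ->  hmsize = 8192
     *
     * At roughly 256 GiB the same math yields 1 << 22 buckets and a
     * 32768-entry mutex array (1 << 22 >> 7).
     */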
@@ -883,7 +908,7 @@ dbuf_init(void)

wmsum_init(&dbuf_sums.cache_count, 0);
wmsum_init(&dbuf_sums.cache_total_evicts, 0);
for (i = 0; i < DN_MAX_LEVELS; i++) {
for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_init(&dbuf_sums.cache_levels[i], 0);
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
}
Expand All @@ -899,7 +924,7 @@ dbuf_init(void)
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (dbuf_ksp != NULL) {
for (i = 0; i < DN_MAX_LEVELS; i++) {
for (int i = 0; i < DN_MAX_LEVELS; i++) {
snprintf(dbuf_stats.cache_levels[i].name,
KSTAT_STRLEN, "cache_level_%d", i);
dbuf_stats.cache_levels[i].data_type =
@@ -919,21 +944,16 @@ void
dbuf_fini(void)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;

dbuf_stats_destroy();

for (i = 0; i < DBUF_MUTEXES; i++)
for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
mutex_destroy(&h->hash_mutexes[i]);
#if defined(_KERNEL)
/*
* Large allocations which do not require contiguous pages
* should be using vmem_free() in the linux kernel
*/

vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
#else
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
#endif
vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
sizeof (kmutex_t));

kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);

@@ -960,7 +980,7 @@ dbuf_fini(void)

wmsum_fini(&dbuf_sums.cache_count);
wmsum_fini(&dbuf_sums.cache_total_evicts);
for (i = 0; i < DN_MAX_LEVELS; i++) {
for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_fini(&dbuf_sums.cache_levels[i]);
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
}
@@ -5057,6 +5077,8 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
"Set the size of the dbuf cache to a log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
"Set the size of the dbuf metadata cache to a log2 fraction of arc "
"size.");
"Set size of dbuf metadata cache to log2 fraction of arc size.");

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
"Set size of dbuf cache mutex array as log2 shift.");
/* END CSTYLED */
