re #13729 assign each ARC hash bucket its own mutex
In ARC the number of buckets in the buffer header hash table is proportional
to the size of physical RAM, but the number of locks protecting the headers
in those buckets is fixed at 256. Hence, on systems with large memory
(>= 128GB) too many unrelated buffer headers are protected by the same mutex.
When the memory in the system is fragmented this may cause a deadlock, as
the sketch after this list illustrates:
- An arc_read thread may be trying to allocate a 128k buffer while holding
a header lock.
- The allocation uses the KM_PUSHPAGE option, which blocks the thread if no
contiguous chunk of the requested size is available.
- The ARC eviction thread that is supposed to evict some buffers calls an
evict callback on one of the buffers.
- Before freeing the memory, the callback attempts to take the lock on the
buffer header.
- Incidentally, this buffer header is protected by the same lock as the one
held by the arc_read() thread.
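To make the lock aliasing concrete, here is a minimal standalone sketch (not
code from the patch; BUF_LOCKS and the idx & (BUF_LOCKS - 1) mapping come
from the old arc.c shown in the diff below, the rest is illustrative):

/*
 * Standalone illustration of the old lock mapping; not part of arc.c.
 * Under the old scheme, bucket idx maps to lock slot idx & (BUF_LOCKS - 1),
 * so with a 2M-bucket table (128GB RAM) 8192 unrelated buckets share every
 * mutex -- e.g. an arc_read() thread holding the lock for bucket 5 and an
 * evict callback needing the header in bucket 261 contend on the same lock.
 */
#include <stdio.h>
#include <stdint.h>

#define BUF_LOCKS 256	/* fixed lock count in the old code */

static uint64_t
old_lock_slot(uint64_t idx)
{
	return (idx & (BUF_LOCKS - 1));
}

int
main(void)
{
	uint64_t nbuckets = 2ULL << 20;	/* ~2M buckets at 128GB RAM */

	printf("buckets sharing each lock: %llu\n",
	    (unsigned long long)(nbuckets / BUF_LOCKS));	/* 8192 */
	printf("slot(5) = %llu, slot(261) = %llu\n",	/* both 5 */
	    (unsigned long long)old_lock_slot(5),
	    (unsigned long long)old_lock_slot(261));
	return (0);
}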

The solution in this patch is not perfect: it still protects all headers in
a hash bucket with the same lock. However, the probability of collision is
very low and does not depend on memory size; a rough estimate follows below.
By the same argument, padding the locks out to a cache line looks like a
waste of memory here, since the probability of contention on a cache line
is quite low given the number of buckets, the number of locks per cache
line (4), and the fact that the hash function (crc64 % hash table size) is
supposed to be a very good randomizer.
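As a back-of-the-envelope check of the collision claim (my own estimate,
not from the patch): a birthday-style bound puts the chance that two of k
concurrently held headers share a bucket at roughly k*(k-1)/(2n), and since
n scales with RAM while k is bounded by the number of threads, the figure
stays small and does not grow with memory:

/* Rough estimate only; assumes headers hash uniformly (crc64 % n). */
#include <stdio.h>

int
main(void)
{
	double n = 2.0 * 1024 * 1024;	/* buckets at 128GB RAM */
	int k;

	for (k = 64; k <= 256; k *= 2)	/* hypothetical concurrent holders */
		printf("k = %3d: P(shared bucket) ~= %.4f\n",
		    k, (double)k * (k - 1) / (2.0 * n));
	return (0);
}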

The effect on memory usage is as follows. For a hash table of size n:
- The original code uses 16K + 16 + n * 8 bytes of memory.
- This fix uses 2 * n * 8 + 8 bytes of memory.
- The net memory overhead is therefore n * 8 - 16K - 8 bytes.
The value of n grows proportionally to physical memory size. For 128GB of
physical memory it is 2M, so the memory overhead is 16M - 16K - 8 bytes.
For smaller memory configurations the overhead is proportionally smaller,
and for larger configurations it is proportionally bigger; the small
program below works the 128GB case through.
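A quick worked check of this arithmetic (assuming 8-byte pointers and an
8-byte kmutex_t, as the numbers above do):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t n = 2ULL * 1024 * 1024;	/* table size at 128GB RAM */
	/* old: 256 locks padded to 64 bytes, mask + table ptr, ptr array */
	uint64_t old_bytes = 256 * 64 + 16 + n * 8;
	/* new: per-bucket (pointer + mutex) array, plus the mask */
	uint64_t new_bytes = 2 * n * 8 + 8;

	printf("overhead = %llu bytes\n",	/* n*8 - 16K - 8 = 16760824 */
	    (unsigned long long)(new_bytes - old_bytes));
	return (0);
}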

The patch has been tested for 30+ hours using a vdbench script that
reproduces the hang with the original code 100% of the time within 20-30
minutes.
Ilya Usvyatsky committed May 28, 2013
1 parent ec5ce0b commit 530ed44
Showing 1 changed file with 18 additions and 25 deletions.
43 changes: 18 additions & 25 deletions usr/src/uts/common/fs/zfs/arc.c
@@ -539,28 +539,21 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
  * Hash table routines
  */
 
-#define HT_LOCK_PAD	64
-
-struct ht_lock {
-	kmutex_t	ht_lock;
-#ifdef _KERNEL
-	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
+struct ht_table {
+	arc_buf_hdr_t	*hdr;
+	kmutex_t	lock;
 };
 
-#define BUF_LOCKS 256
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
-	arc_buf_hdr_t **ht_table;
-	struct ht_lock ht_locks[BUF_LOCKS];
+	struct ht_table *ht_table;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 #define BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define BUF_HASH_LOCK(idx)	(&buf_hash_table.ht_table[idx].lock)
 #define HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
@@ -697,7 +690,7 @@ buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 	arc_buf_hdr_t *buf;
 
 	mutex_enter(hash_lock);
-	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+	for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
 	    buf = buf->b_hash_next) {
 		if (BUF_EQUAL(spa, dva, birth, buf)) {
 			*lockp = hash_lock;
@@ -726,14 +719,14 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 	ASSERT(!HDR_IN_HASH_TABLE(buf));
 	*lockp = hash_lock;
 	mutex_enter(hash_lock);
-	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+	for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
 	    fbuf = fbuf->b_hash_next, i++) {
 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 			return (fbuf);
 	}
 
-	buf->b_hash_next = buf_hash_table.ht_table[idx];
-	buf_hash_table.ht_table[idx] = buf;
+	buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
+	buf_hash_table.ht_table[idx].hdr = buf;
 	buf->b_flags |= ARC_IN_HASH_TABLE;
 
 	/* collect some hash table performance data */
@@ -760,7 +753,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(buf));
 
-	bufp = &buf_hash_table.ht_table[idx];
+	bufp = &buf_hash_table.ht_table[idx].hdr;
 	while ((fbuf = *bufp) != buf) {
 		ASSERT(fbuf != NULL);
 		bufp = &fbuf->b_hash_next;
@@ -772,8 +765,8 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 
-	if (buf_hash_table.ht_table[idx] &&
-	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+	if (buf_hash_table.ht_table[idx].hdr &&
+	    buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
@@ -788,10 +781,10 @@ buf_fini(void)
 {
 	int i;
 
+	for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
+		mutex_destroy(&buf_hash_table.ht_table[i].lock);
 	kmem_free(buf_hash_table.ht_table,
-	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
-	for (i = 0; i < BUF_LOCKS; i++)
-		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+	    (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
 	kmem_cache_destroy(hdr_cache);
 	kmem_cache_destroy(buf_cache);
 }
@@ -890,7 +883,7 @@ buf_init(void)
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
 	buf_hash_table.ht_table =
-	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+	    kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
@@ -906,8 +899,8 @@ buf_init(void)
 	for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 		*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
-	for (i = 0; i < BUF_LOCKS; i++) {
-		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+	for (i = 0; i < hsize; i++) {
+		mutex_init(&buf_hash_table.ht_table[i].lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 	}
 }
