re #13729 assign each ARC hash bucket its own mutex
In ARC the number of buckets in the buffer header hash table is proportional
to the size of physical RAM, but the number of locks protecting the headers
in those buckets is fixed at 256. Hence, on systems with large memory
(>= 128GB) too many unrelated buffer headers are protected by the same mutex.
When the memory in the system is fragmented this may cause a deadlock, as
the sketch after this list illustrates:
- An arc_read thread may be trying to allocate a 128k buffer while holding
a header lock.
- The allocation uses the KM_PUSHPAGE option, which blocks the thread if no
contiguous chunk of the requested size is available.
- The ARC eviction thread that is supposed to evict some buffers calls an
evict callback on one of the buffers.
- Before freeing the memory, the callback attempts to take the lock on the
buffer header.
- Incidentally, this buffer header is protected by the same lock as the one
held by the arc_read() thread.
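To make the lock aliasing concrete, here is a minimal standalone sketch (not
code from the patch; BUF_LOCKS and the idx & (BUF_LOCKS - 1) mapping come
from the old arc.c shown in the diff below, the rest is illustrative):

/*
 * Standalone illustration of the old lock mapping; not part of arc.c.
 * Under the old scheme, bucket idx maps to lock slot idx & (BUF_LOCKS - 1),
 * so with a 2M-bucket table (128GB RAM) 8192 unrelated buckets share every
 * mutex -- e.g. an arc_read() thread holding the lock for bucket 5 and an
 * evict callback needing the header in bucket 261 contend on the same lock.
 */
#include <stdio.h>
#include <stdint.h>

#define BUF_LOCKS 256	/* fixed lock count in the old code */

static uint64_t
old_lock_slot(uint64_t idx)
{
	return (idx & (BUF_LOCKS - 1));
}

int
main(void)
{
	uint64_t nbuckets = 2ULL << 20;	/* ~2M buckets at 128GB RAM */

	printf("buckets sharing each lock: %llu\n",
	    (unsigned long long)(nbuckets / BUF_LOCKS));	/* 8192 */
	printf("slot(5) = %llu, slot(261) = %llu\n",	/* both 5 */
	    (unsigned long long)old_lock_slot(5),
	    (unsigned long long)old_lock_slot(261));
	return (0);
}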

The solution in this patch is not perfect: it still protects all headers in
a hash bucket with the same lock. However, the probability of collision is
very low and does not depend on memory size; a rough estimate follows below.
By the same argument, padding the locks out to a cache line looks like a
waste of memory here, since the probability of contention on a cache line
is quite low given the number of buckets, the number of locks per cache
line (4), and the fact that the hash function (crc64 % hash table size) is
supposed to be a very good randomizer.
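As a back-of-the-envelope check of the collision claim (my own estimate,
not from the patch): a birthday-style bound puts the chance that two of k
concurrently held headers share a bucket at roughly k*(k-1)/(2n), and since
n scales with RAM while k is bounded by the number of threads, the figure
stays small and does not grow with memory:

/* Rough estimate only; assumes headers hash uniformly (crc64 % n). */
#include <stdio.h>

int
main(void)
{
	double n = 2.0 * 1024 * 1024;	/* buckets at 128GB RAM */
	int k;

	for (k = 64; k <= 256; k *= 2)	/* hypothetical concurrent holders */
		printf("k = %3d: P(shared bucket) ~= %.4f\n",
		    k, (double)k * (k - 1) / (2.0 * n));
	return (0);
}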

The effect on memory usage is as follows. For a hash table of size n:
- The original code uses 16K + 16 + n * 8 bytes of memory.
- This fix uses 2 * n * 8 + 8 bytes of memory.
- The net memory overhead is therefore n * 8 - 16K - 8 bytes.
The value of n grows proportionally to physical memory size. For 128GB of
physical memory it is 2M, so the memory overhead is 16M - 16K - 8 bytes.
For smaller memory configurations the overhead is proportionally smaller,
and for larger configurations it is proportionally bigger; the small
program below works the 128GB case through.
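A quick worked check of this arithmetic (assuming 8-byte pointers and an
8-byte kmutex_t, as the numbers above do):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t n = 2ULL * 1024 * 1024;	/* table size at 128GB RAM */
	/* old: 256 locks padded to 64 bytes, mask + table ptr, ptr array */
	uint64_t old_bytes = 256 * 64 + 16 + n * 8;
	/* new: per-bucket (pointer + mutex) array, plus the mask */
	uint64_t new_bytes = 2 * n * 8 + 8;

	printf("overhead = %llu bytes\n",	/* n*8 - 16K - 8 = 16760824 */
	    (unsigned long long)(new_bytes - old_bytes));
	return (0);
}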

The patch has been tested for 30+ hours using a vdbench script that
reproduces the hang with the original code 100% of the time within 20-30
minutes.
Ilya Usvyatsky committed May 28, 2013
1 parent ec5ce0b commit 530ed44
Showing 1 changed file with 18 additions and 25 deletions.
43 changes: 18 additions & 25 deletions usr/src/uts/common/fs/zfs/arc.c
@@ -539,28 +539,21 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
  * Hash table routines
  */
 
-#define HT_LOCK_PAD	64
-
-struct ht_lock {
-	kmutex_t	ht_lock;
-#ifdef _KERNEL
-	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
-#endif
+struct ht_table {
+	arc_buf_hdr_t	*hdr;
+	kmutex_t	lock;
 };
 
-#define BUF_LOCKS 256
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
-	arc_buf_hdr_t **ht_table;
-	struct ht_lock ht_locks[BUF_LOCKS];
+	struct ht_table *ht_table;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 #define BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
-#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
-#define BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define BUF_HASH_LOCK(idx)	(&buf_hash_table.ht_table[idx].lock)
 #define HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
@@ -697,7 +690,7 @@ buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 	arc_buf_hdr_t *buf;
 
 	mutex_enter(hash_lock);
-	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
+	for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
 	    buf = buf->b_hash_next) {
 		if (BUF_EQUAL(spa, dva, birth, buf)) {
 			*lockp = hash_lock;
@@ -726,14 +719,14 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 	ASSERT(!HDR_IN_HASH_TABLE(buf));
 	*lockp = hash_lock;
 	mutex_enter(hash_lock);
-	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
+	for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
 	    fbuf = fbuf->b_hash_next, i++) {
 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 			return (fbuf);
 	}
 
-	buf->b_hash_next = buf_hash_table.ht_table[idx];
-	buf_hash_table.ht_table[idx] = buf;
+	buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
+	buf_hash_table.ht_table[idx].hdr = buf;
 	buf->b_flags |= ARC_IN_HASH_TABLE;
 
 	/* collect some hash table performance data */
@@ -760,7 +753,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(buf));
 
-	bufp = &buf_hash_table.ht_table[idx];
+	bufp = &buf_hash_table.ht_table[idx].hdr;
 	while ((fbuf = *bufp) != buf) {
 		ASSERT(fbuf != NULL);
 		bufp = &fbuf->b_hash_next;
@@ -772,8 +765,8 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 
-	if (buf_hash_table.ht_table[idx] &&
-	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+	if (buf_hash_table.ht_table[idx].hdr &&
+	    buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
@@ -788,10 +781,10 @@ buf_fini(void)
 {
 	int i;
 
+	for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
+		mutex_destroy(&buf_hash_table.ht_table[i].lock);
 	kmem_free(buf_hash_table.ht_table,
-	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
-	for (i = 0; i < BUF_LOCKS; i++)
-		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+	    (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
 	kmem_cache_destroy(hdr_cache);
 	kmem_cache_destroy(buf_cache);
 }
@@ -890,7 +883,7 @@ buf_init(void)
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
 	buf_hash_table.ht_table =
-	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+	    kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
@@ -906,8 +899,8 @@ buf_init(void)
 	for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 		*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
-	for (i = 0; i < BUF_LOCKS; i++) {
-		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+	for (i = 0; i < hsize; i++) {
+		mutex_init(&buf_hash_table.ht_table[i].lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 	}
 }
